Skip to content

Commit cb6f63a

Browse files
authored
Merge pull request #70 from thughari/dev
handled the indeed links and smooth finish
2 parents fbbe3da + 4189595 commit cb6f63a

7 files changed

Lines changed: 128 additions & 61 deletions

File tree

backend/src/main/java/com/thughari/jobtrackerpro/service/GeminiExtractionService.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,6 @@ private String buildBatchPrompt(List<EmailBatchItem> items) {
395395
- help
396396
- privacy
397397
- settings
398-
- account management
399398
400399
If no job-related link exists:
401400

backend/src/main/java/com/thughari/jobtrackerpro/service/GmailIntegrationService.java

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ public void connectAndSetupPush(String authCode, String email) throws Exception
119119

120120
userRepository.saveAndFlush(user);
121121

122-
log.info("Gmail Automation enabled with 1 DB transaction for: {}", user.getEmail());
122+
log.info("User {} successfully connected Gmail. Watch set with label ID: {}", email, labelId);
123123
}
124124

125125
@Async("taskExecutor")
@@ -151,8 +151,6 @@ public void initiateManualSync(String email) {
151151
String currentHistoryId = service.users().getProfile("me").execute().getHistoryId().toString();
152152

153153
jobService.finalizeManualSync(email, currentHistoryId);
154-
155-
log.info("Manual sync finished for {}. Found {} jobs.", email, found);
156154
} catch (Exception e) {
157155
log.error("Manual sync failed for {}: {}", email, e.getMessage());
158156
} finally {
@@ -340,8 +338,6 @@ public void disconnectGmail(String email) {
340338
userRepository.saveAndFlush(user);
341339

342340
cleanupGoogleResourcesAsync(refreshToken, labelId);
343-
344-
log.info("User {} disconnected from Gmail. Local state cleared.", email);
345341
}
346342

347343
@Async("taskExecutor")
@@ -358,7 +354,6 @@ protected void cleanupGoogleResourcesAsync(String refreshToken, String labelId)
358354
.uri("https://oauth2.googleapis.com/revoke?token=" + refreshToken)
359355
.retrieve();
360356

361-
log.info("Google resources cleaned up and token revoked.");
362357
} catch (Exception e) {
363358
log.warn("Non-critical: Google resource cleanup failed: {}", e.getMessage());
364359
}

backend/src/main/java/com/thughari/jobtrackerpro/service/GmailWebhookService.java

Lines changed: 88 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
import java.util.ArrayList;
2525
import java.util.Base64;
2626
import java.util.List;
27+
import java.util.regex.Matcher;
28+
import java.util.regex.Pattern;
2729

2830
@Service
2931
@Slf4j
@@ -50,19 +52,14 @@ public GmailWebhookService(GeminiService geminiService, JobService jobService, U
5052
@Async("taskExecutor")
5153
public void processHistorySync(String userEmail) {
5254
final String email = userEmail.toLowerCase();
53-
55+
5456
LocalDateTime now = LocalDateTime.now();
5557
LocalDateTime expiryThreshold = now.minusMinutes(15);
5658

57-
int updatedRows = userRepository.claimSyncLock(email, now, expiryThreshold);
58-
if (updatedRows == 0) return;
59-
60-
cacheEvictService.evictAllForUser(email);
59+
if (userRepository.claimSyncLock(email, now, expiryThreshold) == 0) return;
6160

6261
try {
63-
User user = userRepository.findByEmail(email)
64-
.orElseThrow(() -> new RuntimeException("User not found after lock"));
65-
62+
User user = userRepository.findByEmail(email).orElseThrow();
6663
if (user.getGmailRefreshToken() == null) return;
6764

6865
String accessToken = getFreshAccessToken(user.getGmailRefreshToken());
@@ -87,15 +84,12 @@ public void processHistorySync(String userEmail) {
8784
List<EmailBatchItem> batchItems = collectMessages(service, historyResponse.getHistory());
8885

8986
if (!batchItems.isEmpty()) {
90-
91-
List<JobDTO> extractedJobs = geminiService.extractJobsFromBatch(batchItems);
92-
93-
log.info("Ingesting batch of {} emails via Gemini for {}", batchItems.size(), email);
94-
95-
jobService.saveBatchResults(email, batchItems, extractedJobs);
96-
}
87+
log.info("Ingesting batch of {} emails for {}", batchItems.size(), email);
88+
List<JobDTO> extractedJobs = geminiService.extractJobsFromBatch(batchItems);
89+
jobService.saveBatchResults(email, batchItems, extractedJobs);
90+
}
9791
} catch (Exception e) {
98-
log.error("High-Performance Sync failed for {}: ", email, e);
92+
log.error("Sync failed for {}: ", email, e);
9993
} finally {
10094
userRepository.releaseSyncLock(email);
10195
cacheEvictService.evictAllForUser(email);
@@ -110,45 +104,105 @@ private List<EmailBatchItem> collectMessages(Gmail service, List<History> histor
110104
if (history.getMessagesAdded() == null) continue;
111105
for (HistoryMessageAdded added : history.getMessagesAdded()) {
112106
try {
113-
Message m = service.users().messages().get("me", added.getMessage().getId())
114-
.setFormat("full").execute();
115-
116-
long millisecondTimestamp = m.getInternalDate();
117-
LocalDateTime emailDate = LocalDateTime.ofInstant(
118-
Instant.ofEpochMilli(millisecondTimestamp), ZoneOffset.UTC);
107+
Message m = service.users().messages().get("me", added.getMessage().getId()).setFormat("full").execute();
108+
LocalDateTime emailDate = LocalDateTime.ofInstant(Instant.ofEpochMilli(m.getInternalDate()), ZoneOffset.UTC);
119109

120-
String from = "", subj = "", replyTo="";
110+
String from = "", subj = "", replyTo = "";
121111
for (var h : m.getPayload().getHeaders()) {
122112
if ("From".equalsIgnoreCase(h.getName())) from = h.getValue();
123113
if ("Subject".equalsIgnoreCase(h.getName())) subj = h.getValue();
124114
if ("Reply-To".equalsIgnoreCase(h.getName())) replyTo = h.getValue();
125115
}
126116

127117
if (!isSystemNoise(subj)) {
128-
String body = extractTextFromBody(m.getPayload());
118+
String body = extractProcessedBody(m.getPayload());
129119
items.add(new EmailBatchItem(from, subj, replyTo, body, emailDate));
130120
}
131121
} catch (Exception e) {
132-
log.warn("Failed to fetch message {}: {}", added.getMessage().getId(), e.getMessage());
122+
log.warn("Failed message fetch: {}", e.getMessage());
133123
}
134124
}
135125
}
136126
return items;
137127
}
138128

139-
private String extractTextFromBody(MessagePart part) {
129+
private String extractProcessedBody(MessagePart payload) {
130+
StringBuilder rawBuffer = new StringBuilder();
131+
recursiveRawCollect(payload, rawBuffer);
132+
133+
String cleaned = surgicalClean(rawBuffer.toString());
134+
135+
return cleaned;
136+
}
137+
138+
private void recursiveRawCollect(MessagePart part, StringBuilder buffer) {
139+
if (part.getParts() != null) {
140+
for (MessagePart subPart : part.getParts()) recursiveRawCollect(subPart, buffer);
141+
}
140142
if (part.getBody() != null && part.getBody().getData() != null) {
141-
String content = new String(Base64.getUrlDecoder().decode(part.getBody().getData()));
142-
if (part.getMimeType().contains("text/plain")) return content;
143-
if (part.getMimeType().contains("text/html")) return content.replaceAll("<[^>]*>", " ");
143+
buffer.append(new String(Base64.getUrlDecoder().decode(part.getBody().getData()))).append("\n");
144144
}
145-
if (part.getParts() != null) {
146-
for (MessagePart subPart : part.getParts()) {
147-
String text = extractTextFromBody(subPart);
148-
if (text != null && !text.isBlank()) return text;
145+
}
146+
147+
private String surgicalClean(String rawHtml) {
148+
if (rawHtml == null || rawHtml.isBlank()) return "";
149+
150+
String content = rawHtml.replaceAll("(?is)<style.*?>.*?</style>", "")
151+
.replaceAll("(?is)<script.*?>.*?</script>", "");
152+
153+
StringBuilder sb = new StringBuilder();
154+
Matcher m = Pattern.compile("(?is)<a\\s+[^>]*?href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*?>(.*?)</a>").matcher(content);
155+
156+
int lastEnd = 0;
157+
while (m.find()) {
158+
sb.append(content, lastEnd, m.start());
159+
160+
String rawUrl = m.group(1).replace("&amp;", "&");
161+
String linkText = m.group(2).replaceAll("<[^>]*>", "").trim();
162+
163+
String processedUrl = processUrlByDomain(rawUrl);
164+
165+
boolean isJobLink = processedUrl.contains("viewjob") || processedUrl.contains("confirmemail") ||
166+
processedUrl.contains("linkedin.com/jobs") || processedUrl.contains("careers") ||
167+
processedUrl.contains("apply");
168+
169+
if (isJobLink && processedUrl.length() > 15) {
170+
sb.append(" [LINK_START]").append(linkText).append("[LINK_URL]").append(processedUrl).append("[LINK_END] ");
171+
} else {
172+
sb.append(" ").append(linkText).append(" ");
149173
}
174+
175+
lastEnd = m.end();
176+
}
177+
sb.append(content.substring(lastEnd));
178+
179+
return sb.toString()
180+
.replaceAll("(?i)<br\\s*/?>", "\n")
181+
.replaceAll("(?i)</td>", " ")
182+
.replaceAll("<[^>]*>", " ")
183+
.replaceAll("&nbsp;", " ")
184+
.replaceAll("\\s+", " ")
185+
.trim();
186+
}
187+
188+
private String processUrlByDomain(String url) {
189+
if (url == null) return "";
190+
String lowerUrl = url.toLowerCase();
191+
192+
if (lowerUrl.contains("linkedin.com/jobs") || lowerUrl.contains("linkedin.com/comm/jobs")) {
193+
int queryIndex = url.indexOf("?");
194+
return queryIndex > 0 ? url.substring(0, queryIndex) : url;
150195
}
151-
return "";
196+
197+
if (lowerUrl.contains("indeed.com")) {
198+
return url;
199+
}
200+
201+
if (url.contains("utm_") || url.contains("ref=")) {
202+
return url.replaceAll("[?&]utm_[^&]+", "").replaceAll("[?&]ref=[^&]+", "");
203+
}
204+
205+
return url;
152206
}
153207

154208
private void bootstrapUserHistory(Gmail service, User user) throws Exception {
@@ -164,13 +218,6 @@ private boolean isSystemNoise(String subject) {
164218
return s.contains("security alert") || s.contains("sign-in") || s.contains("verification code");
165219
}
166220

167-
// private void evictUserCaches(String email) {
168-
// Cache userCache = cacheManager.getCache("users");
169-
// Cache entityCache = cacheManager.getCache("userEntities");
170-
// if (userCache != null) userCache.evict(email);
171-
// if (entityCache != null) entityCache.evict(email);
172-
// }
173-
174221
public String getFreshAccessToken(String refreshToken) throws Exception {
175222
return new GoogleRefreshTokenRequest(GoogleNetHttpTransport.newTrustedTransport(), GsonFactory.getDefaultInstance(),
176223
refreshToken, clientId, clientSecret).execute().getAccessToken();

backend/src/main/java/com/thughari/jobtrackerpro/service/IngestionService.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ public void handleManualForward(String from, String subject, String body, String
3030
if (user == null) return;
3131

3232
if (Boolean.TRUE.equals(user.getGmailConnected())) {
33-
log.info("Discarding forwarded email for {}: Direct Sync is active.", userEmail);
33+
log.warn("Discarding forwarded email for {}: Direct Sync is active.", userEmail);
3434
return;
3535
}
3636

@@ -39,7 +39,6 @@ public void handleManualForward(String from, String subject, String body, String
3939

4040
if (job != null) {
4141
jobService.createOrUpdateJob(job, userEmail);
42-
log.info("Successfully ingested forwarded job: {} for {}", job.getCompany(), userEmail);
4342
}
4443
}
4544
}

backend/src/main/java/com/thughari/jobtrackerpro/service/JobService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ public void saveBatchResults(String email, List<EmailBatchItem> batchItems, List
175175

176176
List<List<String>> batchUrlLists = batchItems.parallelStream()
177177
.map(item -> UrlParser.extractAndCleanUrls(item.body()))
178-
.toList();
178+
.toList();
179179
for (JobDTO job : extractedJobs) {
180180
Integer idx = job.getInputIndex();
181181

backend/src/main/java/com/thughari/jobtrackerpro/util/UrlParser.java

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,30 +8,50 @@
88

99
/**
 * Static helpers for pulling URLs out of email text and for trimming
 * boilerplate from email bodies.
 */
public class UrlParser {

    /**
     * Matches {@code http(s)://} or {@code www.}-prefixed URLs. The trailing
     * lookbehind keeps sentence punctuation (for example a final '.') out of the match.
     */
    private static final Pattern URL_PATTERN = Pattern.compile("(https?://|www\\.)[a-zA-Z0-9./?=&%_\\-+]+(?<![.,!?:;])");

    /** Matches a whole {@code <style>} element including its content. */
    private static final Pattern STYLE_BLOCK = Pattern.compile("(?is)<style.*?>.*?</style>");

    /** Matches a whole {@code <script>} element including its content. */
    private static final Pattern SCRIPT_BLOCK = Pattern.compile("(?is)<script.*?>.*?</script>");

    /** Text markers after which the body is cut off; "\u00A9" is the copyright sign. */
    private static final String[] NOISE_MARKERS =
            {"View similar jobs", "Unsubscribe", "\u00A9", "Help Center", "References", "Privacy Policy"};

    /** Maximum number of chars returned by {@link #trimNoise(String)}. */
    private static final int MAX_BODY_LENGTH = 3000;

    /**
     * Finds every URL in {@code text}, normalises each one and returns the
     * distinct, non-blank results in order of first appearance.
     *
     * @param text arbitrary text, may be {@code null}
     * @return list of cleaned URLs; empty when {@code text} is {@code null} or has no URL
     */
    public static List<String> extractAndCleanUrls(String text) {
        if (text == null) return List.of();
        List<String> urls = new ArrayList<>();
        Matcher matcher = URL_PATTERN.matcher(text);
        while (matcher.find()) {
            urls.add(processUrlByDomain(matcher.group()));
        }
        return urls.stream()
                .filter(url -> !url.isBlank())
                .distinct()
                .collect(Collectors.toList());
    }

    /**
     * Normalises a URL according to the site it points at: Indeed links are
     * returned untouched, LinkedIn links lose their whole query string, and any
     * other link only loses its tracking parameters.
     */
    private static String processUrlByDomain(String url) {
        // Locale.ROOT keeps the domain checks stable under locales with special casing rules.
        String lowerUrl = url.toLowerCase(java.util.Locale.ROOT);

        if (lowerUrl.contains("indeed.com")) {
            return url;
        }

        if (lowerUrl.contains("linkedin.com")) {
            int qIndex = url.indexOf('?');
            return qIndex > 0 ? url.substring(0, qIndex) : url;
        }

        return stripTrackingParams(url);
    }

    /**
     * Removes {@code utm_*}, {@code ref} and {@code trk} query parameters while
     * keeping every other parameter, so links whose query identifies the job
     * (for example {@code ?id=42&utm_source=mail}) stay valid. URLs without such
     * parameters are returned unchanged.
     */
    private static String stripTrackingParams(String url) {
        int qIndex = url.indexOf('?');
        if (qIndex < 0) return url;

        StringBuilder kept = new StringBuilder();
        boolean removedAny = false;
        for (String param : url.substring(qIndex + 1).split("&")) {
            if (param.isEmpty()) continue;
            int eq = param.indexOf('=');
            // Decide on the exact parameter name: a substring test for "ref=" would
            // also hit unrelated names such as "pref=".
            String name = (eq < 0 ? param : param.substring(0, eq)).toLowerCase(java.util.Locale.ROOT);
            boolean tracking = name.startsWith("utm_")
                    || (eq >= 0 && (name.equals("ref") || name.equals("trk")));
            if (tracking) {
                removedAny = true;
                continue;
            }
            if (kept.length() > 0) kept.append('&');
            kept.append(param);
        }

        if (!removedAny) return url;
        String base = url.substring(0, qIndex);
        return kept.length() == 0 ? base : base + "?" + kept;
    }

    /**
     * Strips {@code <style>}/{@code <script>} elements, cuts the body at the first
     * occurrence of each noise marker (a marker at index 0 is ignored so the body
     * is never emptied by it), and limits the result to {@value #MAX_BODY_LENGTH} chars.
     *
     * @param body email body, may be {@code null}
     * @return trimmed body, or {@code ""} when {@code body} is {@code null}
     */
    public static String trimNoise(String body) {
        if (body == null) return "";

        String cleanBody = SCRIPT_BLOCK.matcher(STYLE_BLOCK.matcher(body).replaceAll("")).replaceAll("");

        for (String marker : NOISE_MARKERS) {
            int index = cleanBody.indexOf(marker);
            if (index > 0) cleanBody = cleanBody.substring(0, index);
        }

        if (cleanBody.length() <= MAX_BODY_LENGTH) return cleanBody;

        // Do not cut between the two halves of a surrogate pair; a lone high
        // surrogate is not a valid character.
        int end = MAX_BODY_LENGTH;
        if (Character.isHighSurrogate(cleanBody.charAt(end - 1))) end--;
        return cleanBody.substring(0, end);
    }
}

backend/src/main/resources/application-prod.properties

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@ spring.jpa.properties.hibernate.jdbc.batch_size=25
2626
spring.jpa.properties.hibernate.order_inserts=true
2727
spring.jpa.properties.hibernate.order_updates=true
2828

29+
# Logging Levels
30+
logging.level.root=WARN
31+
logging.level.org.springframework=WARN
32+
logging.level.org.hibernate=WARN
33+
logging.level.com.thughari.jobtrackerpro=WARN
34+
logging.level.com.thughari.jobtrackerpro.scheduler=INFO
35+
2936
# Gemini AI
3037
app.gemini.enabled=true
3138
gemini.api.key=${GEMINI_API_KEY}

0 commit comments

Comments
 (0)