#!/usr/bin/env python3
"""Module for suggesting URLs for BibTeX entries that are missing them."""

import re
import urllib.parse
from typing import Dict, List, Optional

import requests

from bibtex_parser import BibEntry


class URLSuggester:
    """Suggests URLs for BibTeX entries based on their metadata."""

    def __init__(self):
        self.suggestion_cache: Dict[str, str] = {}

    def suggest_url(self, entry: BibEntry) -> Optional[str]:
        """Suggest a URL for an entry based on its metadata."""
        # Check the cache first.
        if entry.key in self.suggestion_cache:
            return self.suggestion_cache[entry.key]

        # If the entry already has a URL, return it as-is.
        if 'url' in entry.fields:
            return entry.fields['url']

        url = None

        # Try different strategies based on entry type and available fields.
        # 1. A DOI is the most reliable signal.
        if 'doi' in entry.fields:
            doi = entry.fields['doi'].strip()
            if doi.startswith('10.'):
                url = f"https://doi.org/{doi}"
            elif 'doi.org' in doi:
                url = doi  # Already a full URL.
            else:
                # Try to extract a DOI pattern from the field value.
                match = re.search(r'(10\.\d{4,}/[-._;()/:\w]+)', doi)
                if match:
                    url = f"https://doi.org/{match.group(1)}"
        # 2. Check for an arXiv identifier.
        elif 'eprint' in entry.fields or 'archiveprefix' in entry.fields:
            eprint = entry.fields.get('eprint', '')
            if eprint:
                # Normalize the eprint ID (drop any "arXiv:" prefix).
                eprint = eprint.replace('arXiv:', '').strip()
                url = f"https://arxiv.org/abs/{eprint}"
        # 3. Check for known conference/journal patterns.
        elif 'booktitle' in entry.fields or 'journal' in entry.fields:
            venue = entry.fields.get('booktitle', entry.fields.get('journal', ''))
            # ACL Anthology papers.
            if any(conf in venue.upper() for conf in ['ACL', 'EMNLP', 'NAACL', 'COLING', 'EACL']):
                # Try to construct an ACL Anthology URL if we have enough
                # info. This is a heuristic and may not always work: keys
                # like "merrill-etal-2020-formal" often map to the Anthology.
                if entry.key and '-' in entry.key and 'etal' in entry.key:
                    url = self._suggest_acl_url(entry)
            # NeurIPS/ICML/ICLR papers.
            elif any(conf in venue.upper() for conf in ['NEURIPS', 'NIPS', 'ICML', 'ICLR']):
                url = self._suggest_ml_conference_url(entry)
            # IEEE papers.
            elif 'IEEE' in venue:
                url = self._suggest_ieee_url(entry)

        # Cache and return the result.
        if url:
            self.suggestion_cache[entry.key] = url
        return url
    def _suggest_acl_url(self, entry: BibEntry) -> Optional[str]:
        """Suggest an ACL Anthology URL based on entry metadata."""
        # Heuristic: Anthology URLs often follow predictable patterns, but
        # this won't always be accurate. Expect keys in the format
        # author-etal-year-shortname.
        if entry.key and '-' in entry.key:
            parts = entry.key.split('-')
            if len(parts) >= 3 and parts[-2].isdigit():
                year = parts[-2]
                # Very approximate -- a real implementation would need an
                # actual lookup or database.
                return f"https://aclanthology.org/{year}.{entry.key}"
        return None
    def _suggest_ml_conference_url(self, entry: BibEntry) -> Optional[str]:
        """Suggest a URL for ML conference papers."""
        # These venues have predictable URL schemes, but each one requires
        # a paper ID we don't have, so every branch returns None until an
        # actual lookup is implemented.
        venue = entry.fields.get('booktitle', entry.fields.get('journal', ''))
        year = entry.fields.get('year', '')
        if 'NEURIPS' in venue.upper() or 'NIPS' in venue.upper():
            # NeurIPS papers live on proceedings.neurips.cc, but the URL
            # requires the paper hash/ID.
            return None
        elif 'ICML' in venue.upper():
            # ICML uses proceedings.mlr.press; needs a volume and paper ID.
            return None
        elif 'ICLR' in venue.upper() and year:
            # ICLR uses OpenReview; needs a submission ID.
            return None
        return None
    def _suggest_ieee_url(self, entry: BibEntry) -> Optional[str]:
        """Suggest an IEEE Xplore URL."""
        # IEEE Xplore URLs require document numbers we don't have; this
        # would need an actual search/lookup (see the Crossref sketch below).
        return None
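
    # Illustrative sketch (not one of the original heuristics): when no
    # URL pattern matches, one way to do the "actual search/lookup"
    # mentioned above is the public Crossref REST API, which can resolve
    # bibliographic metadata to a DOI. The method name and the rows=1
    # choice are assumptions made for this example; the endpoint and
    # response shape are Crossref's documented ones
    # (https://api.crossref.org/works).
    def _lookup_doi_via_crossref(self, entry: BibEntry) -> Optional[str]:
        """Best-effort DOI lookup via Crossref; returns a doi.org URL or None."""
        title = entry.fields.get('title', '')
        if not title:
            return None
        try:
            resp = requests.get(
                "https://api.crossref.org/works",
                params={"query.bibliographic": title, "rows": 1},
                timeout=10,
            )
            resp.raise_for_status()
            items = resp.json().get("message", {}).get("items", [])
            if items and items[0].get("DOI"):
                return f"https://doi.org/{items[0]['DOI']}"
        except (requests.RequestException, ValueError):
            # Network failure or malformed JSON: fall through to None.
            pass
        return None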
    def find_url_via_search(self, entry: BibEntry) -> Optional[str]:
        """Try to find a URL by searching for the paper title and authors.

        Uses Claude Code's web search if available.
        """
        try:
            # Try to use Claude's web search capability.
            return self._search_with_claude(entry)
        except Exception:
            # Claude Code not available or the search failed.
            return None
    def _search_with_claude(self, entry: BibEntry) -> Optional[str]:
        """Build a web-search query from entry metadata.

        Note that this returns a search *query*, not a URL: the actual
        search is performed by the interactive CLI when Claude Code's
        capabilities are available.
        """
        title = entry.fields.get('title', '')
        authors = entry.fields.get('author', '')
        year = entry.fields.get('year', '')

        # Clean up the title: unwrap braces and strip LaTeX commands.
        clean_title = re.sub(r'\{([^}]+)\}', r'\1', title)
        clean_title = re.sub(r'\\[a-zA-Z]+', '', clean_title)

        # Extract the first author's last name.
        first_author = ''
        if authors:
            author_parts = authors.split(' and ')[0].strip()
            if ',' in author_parts:
                # "Last, First" format.
                first_author = author_parts.split(',')[0].strip()
            else:
                # "First Last" format: the last word is probably the last name.
                first_author = author_parts.split()[-1] if author_parts else ''

        # Build the search query: quoted title, plus author and year.
        search_parts = []
        if clean_title:
            search_parts.append(f'"{clean_title}"')
        if first_author:
            search_parts.append(first_author)
        if year:
            search_parts.append(year)
        return ' '.join(search_parts)
    def validate_url(self, url: str) -> bool:
        """Check whether a URL is well-formed (has a scheme and a host).

        This validates format only; it does not check that the URL is
        actually reachable.
        """
        try:
            result = urllib.parse.urlparse(url)
            return all([result.scheme, result.netloc])
        except ValueError:
            return False
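
    # Illustrative sketch (an addition, not part of the original module):
    # a lightweight reachability probe to complement the format-only check
    # above. The method name and timeout default are choices made for this
    # example; some servers reject HEAD requests, so callers may want a
    # GET fallback.
    def url_is_reachable(self, url: str, timeout: float = 10.0) -> bool:
        """Best-effort check that a URL responds with a non-error status."""
        if not self.validate_url(url):
            return False
        try:
            resp = requests.head(url, allow_redirects=True, timeout=timeout)
            return resp.status_code < 400
        except requests.RequestException:
            return False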
    def get_missing_urls_report(self, entries: List[BibEntry]) -> Dict[str, str]:
        """Generate a report of entries missing URLs, with suggestions."""
        report = {}
        for entry in entries:
            if 'url' not in entry.fields:
                suggestion = self.suggest_url(entry)
                if not suggestion:
                    # No direct suggestion; fall back to a search query.
                    suggestion = self.find_url_via_search(entry)
                report[entry.key] = suggestion or "No URL found"
        return report
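

if __name__ == '__main__':
    # Minimal usage sketch / smoke test. BibEntry's constructor signature
    # isn't shown in this file, so the demo uses a stand-in with the same
    # two attributes the suggester actually reads (.key and .fields); the
    # sample entries are made up for illustration.
    from dataclasses import dataclass, field

    @dataclass
    class _FakeEntry:
        key: str
        fields: dict = field(default_factory=dict)

    entries = [
        _FakeEntry('doe-2021-example', {'title': 'An Example Paper',
                                        'doi': '10.1234/example.5678'}),
        _FakeEntry('roe-2022-preprint', {'title': 'A Preprint',
                                         'eprint': 'arXiv:2202.00001'}),
    ]

    suggester = URLSuggester()
    for e in entries:
        print(e.key, '->', suggester.suggest_url(e))
    # Expected output (from the DOI and arXiv heuristics above):
    #   doe-2021-example -> https://doi.org/10.1234/example.5678
    #   roe-2022-preprint -> https://arxiv.org/abs/2202.00001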