#!/usr/bin/env python3
"""Module for suggesting URLs for BibTeX entries that are missing them."""

import re
import urllib.parse
from typing import Dict, List, Optional

import requests

from bibtex_parser import BibEntry


class URLSuggester:
    """Suggests URLs for BibTeX entries based on their metadata."""

    def __init__(self):
        self.suggestion_cache: Dict[str, str] = {}

    def suggest_url(self, entry: BibEntry) -> Optional[str]:
        """Suggest a URL for an entry based on its metadata."""
        # Check the cache first.
        if entry.key in self.suggestion_cache:
            return self.suggestion_cache[entry.key]

        # If the entry already has a URL, return it as-is.
        if 'url' in entry.fields:
            return entry.fields['url']

        url = None

        # Try different strategies based on entry type and available fields.
        # 1. A DOI is the most reliable signal.
        if 'doi' in entry.fields:
            doi = entry.fields['doi'].strip()
            if doi.startswith('10.'):
                url = f"https://doi.org/{doi}"
            elif 'doi.org' in doi:
                url = doi  # Already a full URL.
            else:
                # Try to extract a DOI pattern from the field value.
                match = re.search(r'(10\.\d{4,}/[-._;()/:\w]+)', doi)
                if match:
                    url = f"https://doi.org/{match.group(1)}"
        # 2. Check for an arXiv identifier.
        elif 'eprint' in entry.fields or 'archiveprefix' in entry.fields:
            eprint = entry.fields.get('eprint', '')
            if eprint:
                # Normalize the eprint ID (drop any "arXiv:" prefix).
                eprint = eprint.replace('arXiv:', '').strip()
                url = f"https://arxiv.org/abs/{eprint}"
        # 3. Check for known conference/journal patterns.
        elif 'booktitle' in entry.fields or 'journal' in entry.fields:
            venue = entry.fields.get('booktitle', entry.fields.get('journal', ''))
            # ACL Anthology papers.
            if any(conf in venue.upper() for conf in ['ACL', 'EMNLP', 'NAACL', 'COLING', 'EACL']):
                # Try to construct an ACL Anthology URL if we have enough
                # info. This is a heuristic and may not always work: keys
                # like "merrill-etal-2020-formal" often map to the Anthology.
                if entry.key and '-' in entry.key and 'etal' in entry.key:
                    url = self._suggest_acl_url(entry)
            # NeurIPS/ICML/ICLR papers.
            elif any(conf in venue.upper() for conf in ['NEURIPS', 'NIPS', 'ICML', 'ICLR']):
                url = self._suggest_ml_conference_url(entry)
            # IEEE papers.
            elif 'IEEE' in venue:
                url = self._suggest_ieee_url(entry)

        # Cache and return the result.
        if url:
            self.suggestion_cache[entry.key] = url
        return url
    def _suggest_acl_url(self, entry: BibEntry) -> Optional[str]:
        """Suggest an ACL Anthology URL based on entry metadata."""
        # Heuristic: Anthology URLs often follow predictable patterns, but
        # this won't always be accurate. Expect keys in the format
        # author-etal-year-shortname.
        if entry.key and '-' in entry.key:
            parts = entry.key.split('-')
            if len(parts) >= 3 and parts[-2].isdigit():
                year = parts[-2]
                # Very approximate -- a real implementation would need an
                # actual lookup or database.
                return f"https://aclanthology.org/{year}.{entry.key}"
        return None
    def _suggest_ml_conference_url(self, entry: BibEntry) -> Optional[str]:
        """Suggest a URL for ML conference papers."""
        # These venues have predictable URL schemes, but each one requires
        # a paper ID we don't have, so every branch returns None until an
        # actual lookup is implemented.
        venue = entry.fields.get('booktitle', entry.fields.get('journal', ''))
        year = entry.fields.get('year', '')
        if 'NEURIPS' in venue.upper() or 'NIPS' in venue.upper():
            # NeurIPS papers live on proceedings.neurips.cc, but the URL
            # requires the paper hash/ID.
            return None
        elif 'ICML' in venue.upper():
            # ICML uses proceedings.mlr.press; needs a volume and paper ID.
            return None
        elif 'ICLR' in venue.upper() and year:
            # ICLR uses OpenReview; needs a submission ID.
            return None
        return None
    def _suggest_ieee_url(self, entry: BibEntry) -> Optional[str]:
        """Suggest an IEEE Xplore URL."""
        # IEEE Xplore URLs require document numbers we don't have; this
        # would need an actual search/lookup (see the Crossref sketch below).
        return None
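
    # Illustrative sketch (not one of the original heuristics): when no
    # URL pattern matches, one way to do the "actual search/lookup"
    # mentioned above is the public Crossref REST API, which can resolve
    # bibliographic metadata to a DOI. The method name and the rows=1
    # choice are assumptions made for this example; the endpoint and
    # response shape are Crossref's documented ones
    # (https://api.crossref.org/works).
    def _lookup_doi_via_crossref(self, entry: BibEntry) -> Optional[str]:
        """Best-effort DOI lookup via Crossref; returns a doi.org URL or None."""
        title = entry.fields.get('title', '')
        if not title:
            return None
        try:
            resp = requests.get(
                "https://api.crossref.org/works",
                params={"query.bibliographic": title, "rows": 1},
                timeout=10,
            )
            resp.raise_for_status()
            items = resp.json().get("message", {}).get("items", [])
            if items and items[0].get("DOI"):
                return f"https://doi.org/{items[0]['DOI']}"
        except (requests.RequestException, ValueError):
            # Network failure or malformed JSON: fall through to None.
            pass
        return None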
    def find_url_via_search(self, entry: BibEntry) -> Optional[str]:
        """Try to find a URL by searching for the paper title and authors.

        Uses Claude Code's web search if available.
        """
        try:
            # Try to use Claude's web search capability.
            return self._search_with_claude(entry)
        except Exception:
            # Claude Code not available or the search failed.
            return None
    def _search_with_claude(self, entry: BibEntry) -> Optional[str]:
        """Build a web-search query from entry metadata.

        Note that this returns a search *query*, not a URL: the actual
        search is performed by the interactive CLI when Claude Code's
        capabilities are available.
        """
        title = entry.fields.get('title', '')
        authors = entry.fields.get('author', '')
        year = entry.fields.get('year', '')

        # Clean up the title: unwrap braces and strip LaTeX commands.
        clean_title = re.sub(r'\{([^}]+)\}', r'\1', title)
        clean_title = re.sub(r'\\[a-zA-Z]+', '', clean_title)

        # Extract the first author's last name.
        first_author = ''
        if authors:
            author_parts = authors.split(' and ')[0].strip()
            if ',' in author_parts:
                # "Last, First" format.
                first_author = author_parts.split(',')[0].strip()
            else:
                # "First Last" format: the last word is probably the last name.
                first_author = author_parts.split()[-1] if author_parts else ''

        # Build the search query: quoted title, plus author and year.
        search_parts = []
        if clean_title:
            search_parts.append(f'"{clean_title}"')
        if first_author:
            search_parts.append(first_author)
        if year:
            search_parts.append(year)
        return ' '.join(search_parts)
    def validate_url(self, url: str) -> bool:
        """Check whether a URL is well-formed (has a scheme and a host).

        This validates format only; it does not check that the URL is
        actually reachable.
        """
        try:
            result = urllib.parse.urlparse(url)
            return all([result.scheme, result.netloc])
        except ValueError:
            return False
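
    # Illustrative sketch (an addition, not part of the original module):
    # a lightweight reachability probe to complement the format-only check
    # above. The method name and timeout default are choices made for this
    # example; some servers reject HEAD requests, so callers may want a
    # GET fallback.
    def url_is_reachable(self, url: str, timeout: float = 10.0) -> bool:
        """Best-effort check that a URL responds with a non-error status."""
        if not self.validate_url(url):
            return False
        try:
            resp = requests.head(url, allow_redirects=True, timeout=timeout)
            return resp.status_code < 400
        except requests.RequestException:
            return False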
    def get_missing_urls_report(self, entries: List[BibEntry]) -> Dict[str, str]:
        """Generate a report of entries missing URLs, with suggestions."""
        report = {}
        for entry in entries:
            if 'url' not in entry.fields:
                suggestion = self.suggest_url(entry)
                if not suggestion:
                    # No direct suggestion; fall back to a search query.
                    suggestion = self.find_url_via_search(entry)
                report[entry.key] = suggestion or "No URL found"
        return report
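

if __name__ == '__main__':
    # Minimal usage sketch / smoke test. BibEntry's constructor signature
    # isn't shown in this file, so the demo uses a stand-in with the same
    # two attributes the suggester actually reads (.key and .fields); the
    # sample entries are made up for illustration.
    from dataclasses import dataclass, field

    @dataclass
    class _FakeEntry:
        key: str
        fields: dict = field(default_factory=dict)

    entries = [
        _FakeEntry('doe-2021-example', {'title': 'An Example Paper',
                                        'doi': '10.1234/example.5678'}),
        _FakeEntry('roe-2022-preprint', {'title': 'A Preprint',
                                         'eprint': 'arXiv:2202.00001'}),
    ]

    suggester = URLSuggester()
    for e in entries:
        print(e.key, '->', suggester.suggest_url(e))
    # Expected output (from the DOI and arXiv heuristics above):
    #   doe-2021-example -> https://doi.org/10.1234/example.5678
    #   roe-2022-preprint -> https://arxiv.org/abs/2202.00001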