bibtex_cleanup/claude_url_finder_api.py at main · vesteinn/bibtex_cleanup · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/env python3
"""Module for finding URLs using Claude API or direct web search."""

import os
import re
from typing import Optional, List, Tuple
from bibtex_parser import BibEntry

# Try to import the Anthropic SDK. It is treated as an optional dependency:
# when the import fails, ANTHROPIC_AVAILABLE is set to False and an install
# hint is printed instead of the ImportError propagating. In that case the
# name `Anthropic` is not defined, so code must check ANTHROPIC_AVAILABLE
# before using it.
try:
    from anthropic import Anthropic
    ANTHROPIC_AVAILABLE = True
except ImportError:
    ANTHROPIC_AVAILABLE = False
    print("Note: Anthropic SDK not installed. Install with: pip install anthropic")


class ClaudeURLFinder:
    """Find URLs for papers using Claude API.

    The finder is usable only when the Anthropic SDK is importable and the
    ``ANTHROPIC_API_KEY`` environment variable is set; otherwise ``client``
    stays ``None`` and every lookup returns ``None``.
    """

    # Fast, cheap model used unless the caller passes another one.
    model = "claude-3-haiku-20240307"
    # Links hosted on these domains (or their subdomains) get the high score.
    TRUSTED_HOSTS = ('doi.org', 'arxiv.org')
    HIGH_CONFIDENCE = 0.9
    DEFAULT_CONFIDENCE = 0.7

    def __init__(self, model: Optional[str] = None):
        """Create the API client if the SDK and an API key are available.

        Args:
            model: Optional model identifier to use for lookups. When omitted
                the class-level default is used.
        """
        self.client = None
        self.api_key = os.environ.get('ANTHROPIC_API_KEY')
        if model:
            self.model = model

        if ANTHROPIC_AVAILABLE and self.api_key:
            try:
                self.client = Anthropic(api_key=self.api_key)
                print("✓ Claude API initialized successfully")
            except Exception as e:
                # Best effort: a failed client setup leaves the finder disabled.
                print(f"Warning: Could not initialize Claude API: {e}")
                self.client = None
        elif not self.api_key:
            print("Note: ANTHROPIC_API_KEY not found in environment variables")
            print("      To use Claude API, set: export ANTHROPIC_API_KEY='your-key'")

    @staticmethod
    def _clean_title(title: str) -> str:
        """Return *title* with LaTeX/BibTeX markup removed."""
        # Control words go first: stripping braces first would turn
        # "\emph{Deep}" into "\emphDeep" and the word would be deleted
        # together with the command.
        text = re.sub(r'\\[a-zA-Z]+', '', title)
        # Unescape common escaped characters such as "\&" or "\%".
        text = re.sub(r'\\([&%$#_])', r'\1', text)
        # Remove every brace so nested groups like "{{BERT}}" are handled too.
        text = text.replace('{', '').replace('}', '')
        return ' '.join(text.split())

    @classmethod
    def _parse_response(cls, text: str) -> Optional[Tuple[str, float]]:
        """Extract ``(url, confidence)`` from the model's reply, or ``None``.

        Only the first URL in the reply is used, so surrounding explanation
        text or trailing punctuation never ends up inside the returned URL.
        """
        if not text or 'NOT_FOUND' in text:
            return None

        found = re.search(r'https?://[^\s<>"\']+', text, flags=re.IGNORECASE)
        if not found:
            return None

        trailing = '.,;:!?'
        url = found.group(0).rstrip(trailing)
        # Drop a closing parenthesis that belongs to the sentence or markdown
        # link around the URL, but keep balanced ones that are part of it.
        while url.endswith(')') and url.count(')') > url.count('('):
            url = url[:-1].rstrip(trailing)

        host_match = re.match(
            r'https?://(?:[^/?#@]*@)?([^/:?#]+)', url, flags=re.IGNORECASE
        )
        if not host_match:
            return None

        # Score by host, not by substring: "doi.org" appearing in a path or
        # query string of some other site must not raise the confidence.
        host = host_match.group(1).lower()
        trusted = any(
            host == domain or host.endswith('.' + domain)
            for domain in cls.TRUSTED_HOSTS
        )
        confidence = cls.HIGH_CONFIDENCE if trusted else cls.DEFAULT_CONFIDENCE
        return (url, confidence)

    def find_paper_url(self, entry: "BibEntry") -> Optional[Tuple[str, float]]:
        """Find URL for a paper using Claude API.

        Args:
            entry: Entry whose ``fields`` mapping supplies title, author,
                year and venue (``booktitle`` or ``journal``).

        Returns:
            Tuple of (url, confidence) or None when no client is configured,
            the API call fails, or the reply contains no usable URL.
        """
        if not self.client:
            return None

        # Build context from entry
        fields = entry.fields
        clean_title = self._clean_title(fields.get('title', ''))
        authors = fields.get('author', '')
        year = fields.get('year', '')
        venue = fields.get('booktitle', fields.get('journal', ''))

        prompt = f"""Find the official URL for this academic paper. Return ONLY the URL, nothing else.

Paper details:
- Title: {clean_title}
- Authors: {authors}
- Year: {year}
- Venue: {venue}

Requirements:
1. Find the official publisher URL or repository link (arXiv, ACL, etc.)
2. Prefer direct PDF links when available
3. Return only the URL, no explanation
4. If you cannot find a reliable URL, return "NOT_FOUND"
"""

        try:
            response = self.client.messages.create(
                model=self.model,
                max_tokens=200,
                temperature=0,
                messages=[{"role": "user", "content": prompt}]
            )
            # Join all text blocks; an empty content list yields "" instead of
            # an IndexError.
            reply = "".join(
                getattr(block, 'text', '') or '' for block in response.content
            )
        except Exception as e:
            # Deliberately broad: a lookup failure must not abort a batch run.
            print(f"Error calling Claude API: {e}")
            return None

        return self._parse_response(reply.strip())

    def find_urls_batch(self, entries: List["BibEntry"], max_entries: int = 10) -> dict:
        """Find URLs for multiple entries.

        Args:
            entries: List of BibEntry objects missing URLs
            max_entries: Maximum number to process (to limit API costs)

        Returns:
            Dictionary mapping entry keys to ``{'url': ..., 'confidence': ...}``
            for every entry a URL was found for.
        """
        results = {}

        if not self.client:
            # Without a client every lookup would fail; say so once instead of
            # reporting "No URL found" for each entry.
            print("Claude API not available; skipping URL lookup")
            return results

        processed = 0
        for entry in entries:
            if processed >= max_entries:
                print(f"Reached limit of {max_entries} entries")
                break

            # Entries that already carry a URL do not count towards the limit.
            if 'url' in entry.fields:
                continue

            print(f"Finding URL for: {entry.key}")
            result = self.find_paper_url(entry)

            if result:
                url, confidence = result
                results[entry.key] = {
                    'url': url,
                    'confidence': confidence
                }
                print(f"  ✓ Found: {url[:60]}... (confidence: {confidence:.1%})")
            else:
                print(f"  ✗ No URL found")

            processed += 1

        return results


def find_paper_url_direct(title: str, authors: str = "", year: str = "") -> Optional[str]:
    """Look up a paper URL from loose metadata using Claude.

    Convenience wrapper for command-line use or other scripts: it creates a
    ClaudeURLFinder, wraps the arguments in a temporary BibEntry and prints
    the match before returning it.

    Returns:
        The URL that was found, or None when the API client is unavailable
        or the lookup produced nothing.
    """
    lookup = ClaudeURLFinder()
    if not lookup.client:
        print("Claude API not available. Please set ANTHROPIC_API_KEY environment variable.")
        return None

    # The finder works on entries, so package the arguments as a throwaway one.
    metadata = {'title': title, 'author': authors, 'year': year}
    placeholder = BibEntry(entry_type='article', key='temp', fields=metadata)

    match = lookup.find_paper_url(placeholder)
    if not match:
        return None

    found_url, score = match
    print(f"Found URL with {score:.1%} confidence: {found_url}")
    return found_url


# For testing
if __name__ == "__main__":
    # Manual smoke test: look up one well-known paper and print the outcome.
    print("Claude URL Finder with Anthropic API")
    print("=====================================\n")

    # Check if API is available
    if not ANTHROPIC_AVAILABLE:
        print("Please install the Anthropic SDK:")
        print("  pip install anthropic")
        # Raise SystemExit directly: the exit() helper is injected by the
        # site module for interactive use and is not guaranteed to exist.
        raise SystemExit(1)

    # Test with a known paper
    test_entry = BibEntry(
        entry_type='article',
        key='hochreiter1997long',
        fields={
            'title': 'Long short-term memory',
            'author': 'Hochreiter, Sepp and Schmidhuber, Jürgen',
            'journal': 'Neural computation',
            'year': '1997',
            'volume': '9',
            'number': '8',
            'pages': '1735--1780'
        }
    )

    finder = ClaudeURLFinder()

    if finder.client:
        print("\nTesting with Hochreiter LSTM paper...")
        result = finder.find_paper_url(test_entry)

        if result:
            url, confidence = result
            print(f"\nFound URL: {url}")
            print(f"Confidence: {confidence:.1%}")
        else:
            print("\nNo URL found")
    else:
        # No client: explain how to configure it and show an equivalent
        # manual search query instead.
        print("\nClaude API not configured. Please set ANTHROPIC_API_KEY.")
        print("\nAlternatively, I can search for the URL directly:")

        # Demonstrate what the search would look like
        print("\nSearch query for this paper:")
        print('"Long short-term memory" Hochreiter 1997 Neural Computation pdf')