-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclaude_url_finder_api.py
More file actions
198 lines (157 loc) · 6.43 KB
/
claude_url_finder_api.py
File metadata and controls
198 lines (157 loc) · 6.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/env python3
"""Module for finding URLs using Claude API or direct web search."""
import os
import re
from typing import Optional, List, Tuple
from bibtex_parser import BibEntry
# Try to import Anthropic SDK
try:
from anthropic import Anthropic
ANTHROPIC_AVAILABLE = True
except ImportError:
ANTHROPIC_AVAILABLE = False
print("Note: Anthropic SDK not installed. Install with: pip install anthropic")
class ClaudeURLFinder:
    """Find URLs for academic papers by asking the Claude API.

    A client is created only when the Anthropic SDK imported successfully and
    ``ANTHROPIC_API_KEY`` is set in the environment. Otherwise ``self.client``
    stays ``None`` and :meth:`find_paper_url` always returns ``None``.
    """

    # First http(s) token in a model reply (runs until the next whitespace).
    _URL_RE = re.compile(r'https?://\S+')
    # Characters that often trail a URL in prose/markdown but are not part of it.
    _TRAILING_JUNK = '.,;:!?\'"`>]}*'

    def __init__(self):
        """Read the API key from the environment and try to build a client."""
        self.client = None
        self.api_key = os.environ.get('ANTHROPIC_API_KEY')

        if ANTHROPIC_AVAILABLE and self.api_key:
            try:
                self.client = Anthropic(api_key=self.api_key)
                print("✓ Claude API initialized successfully")
            except Exception as e:
                # Best effort: a failing client setup must not crash the caller;
                # the finder simply stays disabled.
                print(f"Warning: Could not initialize Claude API: {e}")
                self.client = None
        elif not self.api_key:
            print("Note: ANTHROPIC_API_KEY not found in environment variables")
            print(" To use Claude API, set: export ANTHROPIC_API_KEY='your-key'")

    @staticmethod
    def _clean_title(title: str) -> str:
        """Strip LaTeX commands and BibTeX braces from a title.

        Commands are removed *before* braces: doing it the other way round
        turns ``\\emph{Memory}`` into ``\\emphMemory`` and the command regex
        then swallows the word. All braces are dropped, so nested protection
        such as ``{{Title}}`` leaves no stray ``{``/``}`` behind.
        """
        without_commands = re.sub(r'\\[a-zA-Z]+', '', title)
        without_braces = without_commands.replace('{', '').replace('}', '')
        # Removing commands can leave doubled spaces; normalise them.
        return ' '.join(without_braces.split())

    @classmethod
    def _extract_url(cls, reply: str) -> Optional[str]:
        """Return the first URL found in a model reply, or ``None``.

        The model is asked for a bare URL but may wrap it in prose, angle
        brackets or markdown, so the first ``http(s)://`` token is taken and
        trailing punctuation is trimmed. A reply containing the ``NOT_FOUND``
        sentinel is always treated as "no URL".
        """
        if not reply or 'NOT_FOUND' in reply:
            return None

        match = cls._URL_RE.search(reply)
        if match is None:
            return None

        url = match.group(0)
        while True:
            trimmed = url.rstrip(cls._TRAILING_JUNK)
            # Drop a closing parenthesis only when it is unbalanced, so URLs
            # that legitimately contain "(...)" are kept intact.
            if trimmed.endswith(')') and trimmed.count(')') > trimmed.count('('):
                trimmed = trimmed[:-1]
            if trimmed == url:
                break
            url = trimmed

        # Reject degenerate leftovers such as a bare "https://".
        return url if cls._URL_RE.fullmatch(url) else None

    def find_paper_url(self, entry: "BibEntry") -> Optional[Tuple[str, float]]:
        """Find a URL for a paper using the Claude API.

        Args:
            entry: Bibliography entry; its ``fields`` mapping is read for
                ``title``, ``author``, ``year`` and ``booktitle``/``journal``.

        Returns:
            ``(url, confidence)`` where confidence is 0.9 for doi.org/arxiv.org
            links and 0.7 otherwise, or ``None`` when no client is configured,
            the API call fails, or the reply contains no usable URL.
        """
        if not self.client:
            return None

        # Build context from entry
        title = entry.fields.get('title', '')
        authors = entry.fields.get('author', '')
        year = entry.fields.get('year', '')
        venue = entry.fields.get('booktitle', entry.fields.get('journal', ''))

        clean_title = self._clean_title(title)

        prompt = f"""Find the official URL for this academic paper. Return ONLY the URL, nothing else.
Paper details:
- Title: {clean_title}
- Authors: {authors}
- Year: {year}
- Venue: {venue}
Requirements:
1. Find the official publisher URL or repository link (arXiv, ACL, etc.)
2. Prefer direct PDF links when available
3. Return only the URL, no explanation
4. If you cannot find a reliable URL, return "NOT_FOUND"
"""

        try:
            response = self.client.messages.create(
                model="claude-3-haiku-20240307",  # Use fast, cheap model
                max_tokens=200,
                temperature=0,
                messages=[{"role": "user", "content": prompt}]
            )
            reply = response.content[0].text.strip()
        except Exception as e:
            # Best effort: any API/parsing failure means "no URL", not a crash.
            print(f"Error calling Claude API: {e}")
            return None

        url = self._extract_url(reply)
        if url is None:
            return None

        # Well-known stable hosts get a higher confidence score.
        confidence = 0.9 if 'doi.org' in url or 'arxiv.org' in url else 0.7
        return (url, confidence)

    def find_urls_batch(self, entries: List["BibEntry"], max_entries: int = 10) -> dict:
        """Find URLs for multiple entries.

        Entries that already have a ``url`` field are skipped and do not count
        towards the limit.

        Args:
            entries: List of BibEntry objects missing URLs
            max_entries: Maximum number of lookups to perform (to limit API costs)

        Returns:
            Dictionary mapping entry keys to ``{'url': ..., 'confidence': ...}``
            for every entry a URL was found for.
        """
        results = {}
        processed = 0

        for entry in entries:
            if processed >= max_entries:
                print(f"Reached limit of {max_entries} entries")
                break

            if 'url' not in entry.fields:
                print(f"Finding URL for: {entry.key}")
                result = self.find_paper_url(entry)

                if result:
                    url, confidence = result
                    results[entry.key] = {
                        'url': url,
                        'confidence': confidence
                    }
                    print(f" ✓ Found: {url[:60]}... (confidence: {confidence:.1%})")
                else:
                    print(" ✗ No URL found")

                # Only entries that were actually looked up count.
                processed += 1

        return results
def find_paper_url_direct(title: str, authors: str = "", year: str = "") -> Optional[str]:
    """Look up a single paper's URL through Claude without a BibTeX file.

    Convenience wrapper for command-line use or other scripts: it builds a
    throwaway ``BibEntry`` from the given metadata and delegates to
    ``ClaudeURLFinder.find_paper_url``.

    Args:
        title: Paper title.
        authors: Author string (optional).
        year: Publication year (optional).

    Returns:
        The URL that was found, or ``None`` when the API is not configured or
        no URL could be determined.
    """
    url_finder = ClaudeURLFinder()

    if not url_finder.client:
        print("Claude API not available. Please set ANTHROPIC_API_KEY environment variable.")
        return None

    # Wrap the loose metadata in a temporary entry so the finder can consume it.
    temp_fields = dict(title=title, author=authors, year=year)
    temp_entry = BibEntry(entry_type='article', key='temp', fields=temp_fields)

    lookup = url_finder.find_paper_url(temp_entry)
    if not lookup:
        return None

    found_url, score = lookup
    print(f"Found URL with {score:.1%} confidence: {found_url}")
    return found_url
# For testing: run this module directly to try a lookup for a well-known paper.
if __name__ == "__main__":
    print("Claude URL Finder with Anthropic API")
    print("=====================================\n")

    # Check if API is available
    if not ANTHROPIC_AVAILABLE:
        print("Please install the Anthropic SDK:")
        print(" pip install anthropic")
        # The bare `exit` helper is injected by the `site` module for
        # interactive sessions and is not guaranteed to exist in scripts;
        # raising SystemExit is the portable way to stop with status 1.
        raise SystemExit(1)

    # Test with a known paper
    test_entry = BibEntry(
        entry_type='article',
        key='hochreiter1997long',
        fields={
            'title': 'Long short-term memory',
            'author': 'Hochreiter, Sepp and Schmidhuber, Jürgen',
            'journal': 'Neural computation',
            'year': '1997',
            'volume': '9',
            'number': '8',
            'pages': '1735--1780'
        }
    )

    finder = ClaudeURLFinder()

    if finder.client:
        print("\nTesting with Hochreiter LSTM paper...")
        result = finder.find_paper_url(test_entry)
        if result:
            url, confidence = result
            print(f"\nFound URL: {url}")
            print(f"Confidence: {confidence:.1%}")
        else:
            print("\nNo URL found")
    else:
        print("\nClaude API not configured. Please set ANTHROPIC_API_KEY.")
        print("\nAlternatively, I can search for the URL directly:")

        # Demonstrate what the search would look like
        print("\nSearch query for this paper:")
        print('"Long short-term memory" Hochreiter 1997 Neural Computation pdf')