Skip to content

Commit a4c7716

Browse files
committed
fix: pdf2doi package dependency removed completely
1 parent 6efbace commit a4c7716

File tree

3 files changed

+104
-55
lines changed

3 files changed

+104
-55
lines changed

paper-dependencies.txt

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# ComProScanner package dependency versions used for the paperwork, provided for reference in case any issue arises with the latest versions.
2+
3+
# Build Requirements
4+
setuptools>=64
5+
wheel>=0.45.1
6+
7+
# Core Dependencies
8+
requests>=2.32.5
9+
python-dotenv>=1.1.1
10+
tqdm>=4.67.1
11+
lxml>=5.4.0
12+
pandas>=2.3.3
13+
torch>=2.9.0
14+
langchain>=0.3.27
15+
transformers>=4.57.1
16+
tokenizers>=0.22.1
17+
mysql>=0.0.3
18+
mysql-connector>=2.2.9
19+
langchain_community>=0.3.31
20+
crewai>=0.203.1
21+
crewai-tools>=0.76.0
22+
chromadb>=1.1.1
23+
docling>=2.57.0
24+
sentence-transformers>=5.1.1
25+
neo4j>=6.0.2
26+
pymatgen>=2025.10.7
27+
seaborn>=0.13.2
28+
29+
## Development Dependencies
30+
pytest>=8.4.2
31+
pytest-mock>=3.15.1
32+
33+
## Documentation Dependencies
34+
mkdocs-material>=9.6.22
35+
mkdocs-minify-plugin>=0.8.0

pyproject.toml

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[build-system]
2-
requires = ["setuptools>=64", "wheel>=0.45.1"]
2+
requires = ["setuptools", "wheel"]
33
build-backend = "setuptools.build_meta"
44

55
[project]
@@ -11,27 +11,26 @@ authors = [{ name = "Aritra Roy", email = "contact@aritraroy.live" }]
1111
license = { text = "MIT" }
1212
requires-python = ">=3.12,<3.14"
1313
dependencies = [
14-
"requests>=2.32.5",
15-
"python-dotenv>=1.1.1",
16-
"tqdm>=4.67.1",
17-
"lxml>=5.4.0",
18-
"pandas>=2.3.3",
19-
"torch>=2.9.0",
20-
"langchain>=0.3.27",
21-
"transformers>=4.57.1",
22-
"tokenizers>=0.22.1",
23-
"mysql>=0.0.3",
24-
"mysql-connector>=2.2.9",
25-
"langchain_community>=0.3.31",
26-
"crewai>=0.203.1",
27-
"crewai-tools>=0.76.0",
28-
"chromadb>=1.1.1",
29-
"pdf2doi @ git+https://github.com/MicheleCotrufo/pdf2doi.git", # pdfminer.six>=20231228 is needed for CrewAI, it is updated in the repo but not released yet by pdf2doi as a package
30-
"docling>=2.57.0",
31-
"sentence-transformers>=5.1.1",
32-
"neo4j>=6.0.2",
33-
"pymatgen>=2025.10.7",
34-
"seaborn>=0.13.2",
14+
"requests",
15+
"python-dotenv",
16+
"tqdm",
17+
"lxml",
18+
"pandas",
19+
"torch",
20+
"langchain",
21+
"transformers",
22+
"tokenizers",
23+
"mysql",
24+
"mysql-connector",
25+
"langchain_community",
26+
"crewai",
27+
"crewai-tools",
28+
"chromadb",
29+
"docling",
30+
"sentence-transformers",
31+
"neo4j",
32+
"pymatgen",
33+
"seaborn",
3534
]
3635
classifiers = [
3736
"Development Status :: 3 - Alpha",
@@ -49,8 +48,8 @@ classifiers = [
4948
"Topic :: Scientific/Engineering :: Visualization",
5049
]
5150
[project.optional-dependencies]
52-
dev = ["pytest>=8.4.2", "pytest-mock>=3.15.1"]
53-
docs = ["mkdocs-material>=9.6.22", "mkdocs-minify-plugin>=0.8.0"]
51+
dev = ["pytest", "pytest-mock"]
52+
docs = ["mkdocs-material", "mkdocs-minify-plugin"]
5453
[project.urls]
5554
"Homepage" = "https://github.com/slimeslab/ComProScanner"
5655
"Bug Tracker" = "https://github.com/slimeslab/ComProScanner/issues"

src/comproscanner/article_processors/pdfs_processor.py

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@
88
"""
99

1010
# Importing required libraries
11-
import pdf2doi
1211
import logging
1312
import time
1413
import json
1514
import pandas as pd
1615
from tqdm import tqdm
17-
import feedparser
1816
import glob
17+
import re
18+
import os
1919

2020
# Custom imports
2121
from ..utils.configs import (
@@ -127,19 +127,31 @@ def __init__(
127127
self.csv_db_manager = CSVDatabaseManager()
128128
self.vector_db_manager = VectorDatabaseManager(rag_config=self.rag_config)
129129

130-
def _get_paper_metadata_from_pdf(self, results: dict):
130+
def _extract_doi_from_text(self, text: str):
131+
"""Extract DOI from text using regex pattern matching.
132+
133+
Args:
134+
text (str): The text to extract DOI from.
135+
136+
Returns:
137+
str: The extracted DOI or empty string if not found.
138+
"""
131139
try:
132-
if "validation_info" not in results:
133-
logger.error(f"Validation info not found in the results...")
134-
return "", "", ""
135-
validation_dict = json.loads(results["validation_info"])
136-
title = validation_dict.get("title", "")
137-
journal_name = validation_dict.get("container-title", "")
138-
publisher = validation_dict.get("publisher", "")
139-
return title, journal_name, publisher
140-
except json.JSONDecodeError:
141-
logger.error(f"Error decoding JSON validation info...")
142-
return "", "", ""
140+
# Standard DOI pattern: 10.xxxx/xxxxx
141+
doi_pattern = r'10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+'
142+
matches = re.findall(doi_pattern, text)
143+
144+
if matches:
145+
# Return the first match, clean up common trailing characters
146+
doi = matches[0].rstrip('.,;)]')
147+
logger.debug(f"DOI extracted: {doi}")
148+
return doi
149+
else:
150+
logger.debug("No DOI found in text")
151+
return ""
152+
except Exception as e:
153+
logger.error(f"Error extracting DOI from text: {e}")
154+
return ""
143155

144156
def process_pdfs(self):
145157
"""
@@ -154,31 +166,34 @@ def process_pdfs(self):
154166
pdf_files, desc="Processing PDFs", total=total_files, colour="#d6adff"
155167
):
156168
try:
157-
# Suppress pdf2doi logs and get data from the PDF file
158-
logging.getLogger("pdf2doi").setLevel(logging.ERROR)
159-
result = pdf2doi.pdf2doi(pdf_file)
160-
if isinstance(result, feedparser.FeedParserDict):
161-
result = dict(result)
162-
if result and result.get("identifier"):
163-
self.identifier = result["identifier"]
164-
if self.identifier.startswith("10."):
165-
self.doi = self.identifier
169+
# Convert PDF to Markdown text
170+
pdf_to_md = PDFToMarkdownText(source=pdf_file)
171+
md_text = pdf_to_md.convert_to_markdown()
172+
173+
# Extract DOI from the converted markdown text
174+
self.doi = self._extract_doi_from_text(md_text)
175+
176+
if self.doi:
177+
self.identifier = self.doi
178+
logger.debug(f"DOI found: {self.doi}")
166179
else:
180+
# Use filename as identifier if DOI not found
167181
logger.warning(
168182
f"DOI not found for {pdf_file}. Using filename as identifier."
169183
)
170-
self.identifier = pdf_file.split(".pdf")[0]
171-
title, journal_name, publisher = self._get_paper_metadata_from_pdf(
172-
result
173-
)
174-
if title == "" or journal_name == "" or publisher == "" and self.doi:
184+
filename = os.path.basename(pdf_file)
185+
self.identifier = filename.replace(".pdf", "")
186+
187+
# Get metadata from external API using DOI
188+
title, journal_name, publisher = "", "", ""
189+
if self.doi:
175190
title, journal_name, publisher = get_paper_metadata_from_oaworks(
176191
self.doi
177192
)
193+
if not title:
194+
logger.warning(f"Metadata not found for DOI: {self.doi}")
178195

179-
# Convert PDF to Markdown text
180-
pdf_to_md = PDFToMarkdownText(source=pdf_file)
181-
md_text = pdf_to_md.convert_to_markdown()
196+
# Process sections
182197
all_sections = pdf_to_md.clean_text(md_text)
183198
row = pdf_to_md.append_section_to_df(
184199
all_sections,

0 commit comments

Comments
 (0)