fix: remove pdf2doi package dependency completely

aritraroy24 · aritraroy24 · commit a4c7716521bf · 2025-10-22T03:10:04.000+01:00
diff --git a/paper-dependencies.txt b/paper-dependencies.txt
@@ -0,0 +1,35 @@
+# ComProScanner package dependency versions used for the paper; use these minimum versions if any issue arises with the latest releases.
+
+# Build Requirements
+setuptools>=64
+wheel>=0.45.1
+
+# Core Dependencies
+requests>=2.32.5
+python-dotenv>=1.1.1
+tqdm>=4.67.1
+lxml>=5.4.0
+pandas>=2.3.3
+torch>=2.9.0
+langchain>=0.3.27
+transformers>=4.57.1
+tokenizers>=0.22.1
+mysql>=0.0.3
+mysql-connector>=2.2.9
+langchain_community>=0.3.31
+crewai>=0.203.1
+crewai-tools>=0.76.0
+chromadb>=1.1.1
+docling>=2.57.0
+sentence-transformers>=5.1.1
+neo4j>=6.0.2
+pymatgen>=2025.10.7
+seaborn>=0.13.2
+
+## Development Dependencies
+pytest>=8.4.2
+pytest-mock>=3.15.1
+
+## Documentation Dependencies
+mkdocs-material>=9.6.22
+mkdocs-minify-plugin>=0.8.0
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=64", "wheel>=0.45.1"]
+requires = ["setuptools", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -11,27 +11,26 @@ authors = [{ name = "Aritra Roy", email = "contact@aritraroy.live" }]
 license = { text = "MIT" }
 requires-python = ">=3.12,<3.14"
 dependencies = [
-    "requests>=2.32.5",
-    "python-dotenv>=1.1.1",
-    "tqdm>=4.67.1",
-    "lxml>=5.4.0",
-    "pandas>=2.3.3",
-    "torch>=2.9.0",
-    "langchain>=0.3.27",
-    "transformers>=4.57.1",
-    "tokenizers>=0.22.1",
-    "mysql>=0.0.3",
-    "mysql-connector>=2.2.9",
-    "langchain_community>=0.3.31",
-    "crewai>=0.203.1",
-    "crewai-tools>=0.76.0",
-    "chromadb>=1.1.1",
-    "pdf2doi @ git+https://github.com/MicheleCotrufo/pdf2doi.git", # pdfminer.six>=20231228 is needed for CrewAI, it is updated in the repo but not released yet by pdf2doi as a package
-    "docling>=2.57.0",
-    "sentence-transformers>=5.1.1",
-    "neo4j>=6.0.2",
-    "pymatgen>=2025.10.7",
-    "seaborn>=0.13.2",
+    "requests",
+    "python-dotenv",
+    "tqdm",
+    "lxml",
+    "pandas",
+    "torch",
+    "langchain",
+    "transformers",
+    "tokenizers",
+    "mysql",
+    "mysql-connector",
+    "langchain_community",
+    "crewai",
+    "crewai-tools",
+    "chromadb",
+    "docling",
+    "sentence-transformers",
+    "neo4j",
+    "pymatgen",
+    "seaborn",
 ]
 classifiers = [
     "Development Status :: 3 - Alpha",
@@ -49,8 +48,8 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Visualization",
 ]
 [project.optional-dependencies]
-dev = ["pytest>=8.4.2", "pytest-mock>=3.15.1"]
-docs = ["mkdocs-material>=9.6.22", "mkdocs-minify-plugin>=0.8.0"]
+dev = ["pytest", "pytest-mock"]
+docs = ["mkdocs-material", "mkdocs-minify-plugin"]
 [project.urls]
 "Homepage" = "https://github.com/slimeslab/ComProScanner"
 "Bug Tracker" = "https://github.com/slimeslab/ComProScanner/issues"
diff --git a/src/comproscanner/article_processors/pdfs_processor.py b/src/comproscanner/article_processors/pdfs_processor.py
@@ -8,14 +8,14 @@
 """
 
 # Importing required libraries
-import pdf2doi
 import logging
 import time
 import json
 import pandas as pd
 from tqdm import tqdm
-import feedparser
 import glob
+import re
+import os
 
 # Custom imports
 from ..utils.configs import (
@@ -127,19 +127,31 @@ def __init__(
         self.csv_db_manager = CSVDatabaseManager()
         self.vector_db_manager = VectorDatabaseManager(rag_config=self.rag_config)
 
-    def _get_paper_metadata_from_pdf(self, results: dict):
+    def _extract_doi_from_text(self, text: str):
+        """Extract DOI from text using regex pattern matching.
+
+        Args:
+            text (str): The text to extract DOI from.
+
+        Returns:
+            str: The extracted DOI, or an empty string if none is found or an error occurs.
+        """
         try:
-            if "validation_info" not in results:
-                logger.error(f"Validation info not found in the results...")
-                return "", "", ""
-            validation_dict = json.loads(results["validation_info"])
-            title = validation_dict.get("title", "")
-            journal_name = validation_dict.get("container-title", "")
-            publisher = validation_dict.get("publisher", "")
-            return title, journal_name, publisher
-        except json.JSONDecodeError:
-            logger.error(f"Error decoding JSON validation info...")
-            return "", "", ""
+            # DOI pattern: "10." + a 4-9 digit prefix, then "/" and a suffix (e.g. 10.1234/abc.5678)
+            doi_pattern = r'10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+'
+            matches = re.findall(doi_pattern, text)
+
+            if matches:
+                # Return the first match, stripping trailing '.', ',', ';', ')' and ']' characters
+                doi = matches[0].rstrip('.,;)]')
+                logger.debug(f"DOI extracted: {doi}")
+                return doi
+            else:
+                logger.debug("No DOI found in text")
+                return ""
+        except Exception as e:
+            logger.error(f"Error extracting DOI from text: {e}")
+            return ""
 
     def process_pdfs(self):
         """
@@ -154,31 +166,34 @@ def process_pdfs(self):
             pdf_files, desc="Processing PDFs", total=total_files, colour="#d6adff"
         ):
             try:
-                # Suppress pdf2doi logs and get data from the PDF file
-                logging.getLogger("pdf2doi").setLevel(logging.ERROR)
-                result = pdf2doi.pdf2doi(pdf_file)
-                if isinstance(result, feedparser.FeedParserDict):
-                    result = dict(result)
-                if result and result.get("identifier"):
-                    self.identifier = result["identifier"]
-                    if self.identifier.startswith("10."):
-                        self.doi = self.identifier
+                # Convert PDF to Markdown text
+                pdf_to_md = PDFToMarkdownText(source=pdf_file)
+                md_text = pdf_to_md.convert_to_markdown()
+
+                # Extract DOI from the converted markdown text
+                self.doi = self._extract_doi_from_text(md_text)
+
+                if self.doi:
+                    self.identifier = self.doi
+                    logger.debug(f"DOI found: {self.doi}")
                 else:
+                    # Use filename as identifier if DOI not found
                     logger.warning(
                         f"DOI not found for {pdf_file}. Using filename as identifier."
                     )
-                    self.identifier = pdf_file.split(".pdf")[0]
-                title, journal_name, publisher = self._get_paper_metadata_from_pdf(
-                    result
-                )
-                if title == "" or journal_name == "" or publisher == "" and self.doi:
+                    filename = os.path.basename(pdf_file)
+                    self.identifier = filename.replace(".pdf", "")
+
+                # Get metadata from external API using DOI
+                title, journal_name, publisher = "", "", ""
+                if self.doi:
                     title, journal_name, publisher = get_paper_metadata_from_oaworks(
                         self.doi
                     )
+                    if not title:
+                        logger.warning(f"Metadata not found for DOI: {self.doi}")
 
-                # Convert PDF to Markdown text
-                pdf_to_md = PDFToMarkdownText(source=pdf_file)
-                md_text = pdf_to_md.convert_to_markdown()
+                # Process sections
                 all_sections = pdf_to_md.clean_text(md_text)
                 row = pdf_to_md.append_section_to_df(
                     all_sections,