88"""
99
1010# Importing required libraries
11- import pdf2doi
1211import logging
1312import time
1413import json
1514import pandas as pd
1615from tqdm import tqdm
17- import feedparser
1816import glob
17+ import re
18+ import os
1919
2020# Custom imports
2121from ..utils .configs import (
@@ -127,19 +127,31 @@ def __init__(
127127 self .csv_db_manager = CSVDatabaseManager ()
128128 self .vector_db_manager = VectorDatabaseManager (rag_config = self .rag_config )
129129
130- def _get_paper_metadata_from_pdf (self , results : dict ):
def _extract_doi_from_text(self, text: str) -> str:
    """Extract the first DOI found in ``text`` using regex pattern matching.

    Args:
        text (str): The text to extract a DOI from.

    Returns:
        str: The extracted DOI, or an empty string when ``text`` is empty,
            is not a string, or contains no DOI-like token.
    """
    # Guard explicitly instead of relying on a blanket ``except Exception``:
    # ``re`` raises TypeError on None / non-string input.
    if not isinstance(text, str) or not text:
        logger.debug("No DOI found in text")
        return ""

    # Standard DOI pattern: 10.xxxx/xxxxx
    doi_pattern = r'10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+'
    # Only the first occurrence is used, so stop scanning at the first hit.
    match = re.search(doi_pattern, text)
    if match is None:
        logger.debug("No DOI found in text")
        return ""

    doi = match.group(0)
    # Trim sentence punctuation that the character class also accepts.
    # A trailing ")" is removed only when it has no matching "(" inside the
    # DOI, so suffixes such as "...(2001)" are kept intact while the closing
    # bracket of "(doi: 10.x/y)" is dropped.
    while doi:
        last_char = doi[-1]
        if last_char in ".,;":
            doi = doi[:-1]
        elif last_char == ")" and doi.count(")") > doi.count("("):
            doi = doi[:-1]
        else:
            break

    logger.debug("DOI extracted: %s", doi)
    return doi
143155
144156 def process_pdfs (self ):
145157 """
@@ -154,31 +166,34 @@ def process_pdfs(self):
154166 pdf_files , desc = "Processing PDFs" , total = total_files , colour = "#d6adff"
155167 ):
156168 try :
157- # Suppress pdf2doi logs and get data from the PDF file
158- logging .getLogger ("pdf2doi" ).setLevel (logging .ERROR )
159- result = pdf2doi .pdf2doi (pdf_file )
160- if isinstance (result , feedparser .FeedParserDict ):
161- result = dict (result )
162- if result and result .get ("identifier" ):
163- self .identifier = result ["identifier" ]
164- if self .identifier .startswith ("10." ):
165- self .doi = self .identifier
169+ # Convert PDF to Markdown text
170+ pdf_to_md = PDFToMarkdownText (source = pdf_file )
171+ md_text = pdf_to_md .convert_to_markdown ()
172+
173+ # Extract DOI from the converted markdown text
174+ self .doi = self ._extract_doi_from_text (md_text )
175+
176+ if self .doi :
177+ self .identifier = self .doi
178+ logger .debug (f"DOI found: { self .doi } " )
166179 else :
180+ # Use filename as identifier if DOI not found
167181 logger .warning (
168182 f"DOI not found for { pdf_file } . Using filename as identifier."
169183 )
170- self .identifier = pdf_file .split (".pdf" )[0 ]
171- title , journal_name , publisher = self ._get_paper_metadata_from_pdf (
172- result
173- )
174- if title == "" or journal_name == "" or publisher == "" and self .doi :
184+ filename = os .path .basename (pdf_file )
185+ self .identifier = filename .replace (".pdf" , "" )
186+
187+ # Get metadata from external API using DOI
188+ title , journal_name , publisher = "" , "" , ""
189+ if self .doi :
175190 title , journal_name , publisher = get_paper_metadata_from_oaworks (
176191 self .doi
177192 )
193+ if not title :
194+ logger .warning (f"Metadata not found for DOI: { self .doi } " )
178195
179- # Convert PDF to Markdown text
180- pdf_to_md = PDFToMarkdownText (source = pdf_file )
181- md_text = pdf_to_md .convert_to_markdown ()
196+ # Process sections
182197 all_sections = pdf_to_md .clean_text (md_text )
183198 row = pdf_to_md .append_section_to_df (
184199 all_sections ,
0 commit comments