diff --git a/.DS_Store b/.DS_Store index 6c203b94..e720f8b0 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index b0391b88..b99551f7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,14 @@ # don't sync coursera docs + +*grobid_speedtest_pdfs* coursera-dl/ *parsed.json wandb *.ipynb *.pem +*.tei.xml +*.json +*articles.db # don't expose env files .env diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index 4186a1e2..292c526c 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -43,7 +43,6 @@ lint: paths: - .github/**/* - .trunk/**/* - - mkdocs.yml - .DS_Store - .vscode/**/* - README.md diff --git a/ai_ta_backend/utils/types.py b/ai_ta_backend/utils/types.py new file mode 100644 index 00000000..b5200a84 --- /dev/null +++ b/ai_ta_backend/utils/types.py @@ -0,0 +1,64 @@ +import datetime +from typing import Any, Dict, List, Optional + +import pydantic + + +class DocumentMetadata(pydantic.BaseModel): + authors: list[str] + journal_name: str + publication_date: datetime.date # Changed from datetime.date to str + keywords: list[str] + doi: str + title: str + subtitle: Optional[str] + visible_urls: list[str] + field_of_science: str + concise_summary: str + specific_questions_document_can_answer: list[str] + additional_fields: Optional[Dict[str, Any]] = {} + + # Can't get this to work properly + # class Config: + # extra = pydantic.Extra.allow # Allow arbitrary additional fields + + +class GrobidMetadata(pydantic.BaseModel): + """ + additional_fields is for the paper "sections" with arbitrary section names. + Currently, the SQLite DB will have a separate column for every unique "major_sec_title". + We'll see how messy it gets... maybe LLMs can normalize this some. + + Format of additional_fields: + { + "major_sec_num": 3, + "major_sec_title": "Extracting Metadata", + "text": "In the previous section, we...", # full text of the section + "tokens": 1067 + } + """ + uuid: str + filepath: str + total_tokens: int + avg_tokens_per_section: int + max_tokens_per_section: int + all_sections: Dict[str, str] + additional_fields: Optional[List[Dict[str, Any]]] = [{}] + + +# Prisma data model https://prisma-client-py.readthedocs.io/en/stable/ +# TBH I'd rather invest in learning real SQL. Theo switched from Drizzle to Prisma... no long-term security in either. +# model DocumentMetadata { +# id Int @id @default(autoincrement()) +# authors String[] +# journalName String +# publicationDate DateTime +# keywords String[] +# doi String +# title String +# subtitle String? 
// Optional field +# visibleUrls String[] +# fieldOfScience String +# conciseSummary String +# specificQuestionsDocumentCanAnswer String[] +# } diff --git a/llm-guided-pdf-parsing/llm-guided-retrival/evaluate_chunks.py b/llm-guided-pdf-parsing/llm-guided-retrival/evaluate_chunks.py new file mode 100644 index 00000000..efb3d875 --- /dev/null +++ b/llm-guided-pdf-parsing/llm-guided-retrival/evaluate_chunks.py @@ -0,0 +1,195 @@ +import json +import os + +from dotenv import load_dotenv +from openai import OpenAI +from read_sql import ( + get_context_given_contextID, + get_next_context_id, + get_previous_context_id, +) + + +def evaluate_chunks(query, chunks, outline): + load_dotenv() + + api_key = os.getenv("AZURE_OPENAI_KEY") + endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") + deployment = os.getenv("DEPLOYMENT") + api_version = os.getenv("OPENAI_API_VERSION") + openai_api = os.getenv("OPENAI_API") + + client = OpenAI(api_key=openai_api) + + tools = [ + { + "type": "function", + "function": { + "name": "keep_current_chunk", + "description": "Check if the current chunk of context is relevant to the query", + "parameters": { + "type": "object", + "properties": { + "keep": { + "type": "boolean", + "description": "Whether to keep the current chunk or not" + } + }, + "required": ["keep"] + } + }, + "required": True + }, + { + "type": "function", + "function": { + "name": "check_previous_chunk", + "description": "Check if the previous chunk of context is relevant to the query", + "parameters": { + "type": "object", + "properties": { + "check_previous": { + "type": "boolean", + "description": "True if the previous chunk is relevant, False otherwise" + } + }, + "required": ["check_previous"] + } + }, + }, + { + "type": "function", + "function": { + "name": "check_next_chunk", + "description": "Check if the next chunk of context is relevant to the query", + "parameters": { + "type": "object", + "properties": { + "check_next": { + "type": "boolean", + "description": "True if the next chunk is relevant, False otherwise" + } + }, + "required": ["check_next"] + } + } + }, + { + "type": "function", + "function": { + "name": "go_to_URL", + "description": "Click link or go to URL referenced in chunk.", + "parameters": { + "type": "object", + "properties": { + "go_to_URL": { + "type": "boolean", + "description": "Whether to go to the URL or not" + } + }, + "required": ["go_to_URL"] + } + } + }, + { + "type": "function", + "function": { + "name": "go_to_section", + "description": "Navigate to a specific section in the document.", + "parameters": { + "type": "object", + "properties": { + "go_to_section": { + "type": "boolean", + "description": "Whether to go to the section or not" + }, + "section": { + "type": "string", + "description": "The section to navigate to" + } + }, + "required": ["go_to_section", "section"] + } + } + }, + ] + + messages = [{ + "role": + "system", + "content": ( + "You are an expert in information retrieval. Your task is to evaluate the relevance of a given text chunk to a specific research question. " + "You have four functions at your disposal: 'keep_current_chunk', 'check_previous_chunk', 'check_next_chunk', and 'go_to_url'. " + "Always use 'keep_current_chunk' to determine if the current chunk is relevant. Then, consider using 'check_previous_chunk' or 'check_next_chunk' or 'go_to_url'. " + "When using 'check_previous_chunk', if you find previous chunk relevant to the research question, set 'check_previous' to 'True', otherwise false. 
" + "When using 'check_next_chunk', if you find next chunk relevant to the research question, set 'check_next' to 'True', otherwise false. " + "Use 'go_to_url' if the chunk suggests checking an external link." + "You can also use 'go_to_section' to navigate to a specific section in the document.") + }, { + "role": + "user", + "content": ( + f"Research Question: '{query}'\n\n" + f"Table of Contents: '{outline}'\n\n" + f"Current Text Chunk: '{chunks}'\n\n" + "Evaluate the relevance of the current chunk to the research question. Determine if the current chunk should be kept. " + "Also, decide whether to check the previous chunk by calling 'check_previous_chunk', or the next chunk by calling 'check_next_chunk', or whether to follow an external link using the respective functions. " + "Make sure you call other functions and determine if previous or next chunks are relevant to the research question." + ) + }] + + completion = client.chat.completions.create( + model='gpt-4o', + messages=messages, + tools=tools, + # tool_choice={"type": "function", "function": {"name": "keep_current_chunk"}}, + ) + + return completion + + +# query = "What is the name of Polly's son?" +# chunks = [ +# "Polly's son, Michael Gray, was taken away when he was a baby. He was raised by a family in Australia. He was brought back to Birmingham by Polly in season 2.", +# "The Blinders' dominance came about from beating rivals, including the 'Sloggers', 'a pugilistic term for someone who could strike a heavy blow in the ring', whom they fought for territory in Birmingham and its surrounding districts.", +# "Britain is a mixture of despair and hedonism in 1919 in the aftermath of the Great War. Returning soldiers, newly minted revolutions and criminal gangs are fighting for survival in a nation rocked by economic upheaval." +# ] +# outline = "1. Introduction\n2. Polly Gray\n3. The Blinders\n4. 
Britain in 1919" +# result = evaluate_chunks(query, chunks[1], outline) +# print(result) +# for tool in result.choices[0].message.tool_calls: +# print("true" in tool.function.arguments) +# print("---") + + +def evaluate_chunks_with_step(query, chunk_id, step, chunks_to_keep, is_visited): + if step > 4: + return + if chunk_id in is_visited: + print("already visited") + return + is_visited[chunk_id] = True + step += 1 + + print(chunk_id) + context_data, current_context, outline = get_context_given_contextID(chunk_id) + completion = evaluate_chunks(query, current_context, outline) + + if completion is None: + return + # print(completion) + + for tool in completion.choices[0].message.tool_calls: + if tool.function.name == "keep_current_chunk" and "true" in tool.function.arguments: + print("Keeping current chunk") + chunks_to_keep.append(current_context) + if tool.function.name == "check_previous_chunk" and "true" in tool.function.arguments: + previous_context_id = get_previous_context_id(context_data, chunk_id) + if previous_context_id is not None: + print("Checking previous chunk") + evaluate_chunks_with_step(query, previous_context_id, step, chunks_to_keep) + if tool.function.name == "check_next_chunk" and "true" in tool.function.arguments: + next_context_id = get_next_context_id(context_data, chunk_id) + if next_context_id is not None: + print("Checking next chunk") + evaluate_chunks_with_step(query, next_context_id, step, chunks_to_keep) diff --git a/llm-guided-pdf-parsing/llm-guided-retrival/guided_retrival.py b/llm-guided-pdf-parsing/llm-guided-retrival/guided_retrival.py new file mode 100644 index 00000000..78a8f10d --- /dev/null +++ b/llm-guided-pdf-parsing/llm-guided-retrival/guided_retrival.py @@ -0,0 +1,31 @@ +from evaluate_chunks import evaluate_chunks, evaluate_chunks_with_step + +# Example chunks, to be updated with the actual retrival +query = "How many studies in total met inclusion criteria for analysis of FEV 1?" +# 1, 10 ,15 +chunks = { + "DjkRDVj9YwkfmxAFfgXgJ": + "Despite reduced resting lung volumes and D LCO , patients with long COVID and dyspnoea have similar physiological response to exercise to healthy subjects. D LCO impairment can marginally explain heterogeneity of complex syndromes such as long COVID. https://bit.ly/40j4aX6", + "e7d6C_gDSEQt5fb6rj8Nt": + "Studies were identified using the systematic review methods described previously by KOTECHA et al. [4, 13] which followed the Preferred Reporting Items for Systematic Reviews and Meta-analysis (PRISMA) guidelines [14] . Briefly, 86 studies in total met inclusion criteria for analysis of FEV 1 . Although this systematic review was designed to capture studies to answer questions specifically related to FEV 1 , the search criteria were subsequently deemed acceptable to capture appropriately other spirometry measures including FVC, FEV 1 /FVC ratio and FEF 25-75 . Studies were included for this analysis if they fulfilled the following criteria: 1) FEV 1 /FVC reported in survivors of preterm birth (with or without BPD) and those born healthy at term; or if 2) FEV 1 /FVC were reported separately in survivors of preterm birth with and without BPD.", + "Fv2taY_bTmimdP-LE8ldk": + "Publication bias was observed when comparing Preterm (All) with Term groups subjectively with an asymmetrical distribution noted on funnel plots, and objectively with Egger's test reaching significance ( p<0.01). 
When preterm groups were separated into those with and without BPD, however, a symmetrical distribution was noted on all funnel plots and Egger's test did not reach significance (supplementary figure S1 ). This could imply that asymmetry seen in the combined preterm group may be due to the heterogeneity of having two different disease populations defined by the presence or absence of BPD." +} +outline = { + "DjkRDVj9YwkfmxAFfgXgJ": + "0: Abstract\n1: ", + "e7d6C_gDSEQt5fb6rj8Nt": + "0: Abstract\n1: Introduction\n10: Meta-regression\n11: Discussion\n12: \n2: Research questions\n3: Study identification and selection\n4: Publication bias and study quality\n5: Data collection\n6: Data analysis\n7: Study selection and study quality\n8: Publication bias\n9: Meta-analysis", + "Fv2taY_bTmimdP-LE8ldk": + "0: Abstract\n1: Introduction\n10: Meta-regression\n11: Discussion\n12: \n2: Research questions\n3: Study identification and selection\n4: Publication bias and study quality\n5: Data collection\n6: Data analysis\n7: Study selection and study quality\n8: Publication bias\n9: Meta-analysis" +} + +chunks_to_keep = [] +is_visited = {} + +for chunk_id in chunks: + chunks = evaluate_chunks_with_step(query, chunk_id, 0, chunks_to_keep, is_visited) + if chunks is not None: + chunks_to_keep.append(chunks) + +print(chunks_to_keep) diff --git a/llm-guided-pdf-parsing/llm-guided-retrival/read_sql.py b/llm-guided-pdf-parsing/llm-guided-retrival/read_sql.py new file mode 100644 index 00000000..7095ef2b --- /dev/null +++ b/llm-guided-pdf-parsing/llm-guided-retrival/read_sql.py @@ -0,0 +1,81 @@ +import sqlite3 + +db = '/home/guest/ai-ta-backend/UIUC_Chat/pdf-parsing/articles-test.db' + + +def get_context_given_contextID(context_id): + conn = sqlite3.connect(db) + cursor = conn.cursor() + + query_article_id = """ + SELECT articles.ID, articles.Outline + FROM contexts + JOIN sections_contexts ON contexts.ID = sections_contexts.Context_ID + JOIN article_sections ON sections_contexts.Section_ID = article_sections.Section_ID + JOIN articles ON article_sections.Article_ID = articles.ID + WHERE contexts.ID = ? + """ + cursor.execute(query_article_id, (context_id,)) + article_id_result = cursor.fetchone() + + if article_id_result is None: + conn.close() + raise Exception(f"Context ID {context_id} not found in the database") + + article_id = article_id_result[0] + outline = article_id_result[1] + + query_contexts = """ + SELECT contexts.Section_Num, contexts.num_tokens, contexts.Section_Title, contexts.text, contexts.ID + FROM contexts + JOIN sections_contexts ON contexts.ID = sections_contexts.Context_ID + JOIN article_sections ON sections_contexts.Section_ID = article_sections.Section_ID + WHERE article_sections.Article_ID = ? + """ + cursor.execute(query_contexts, (article_id,)) + all_contexts = cursor.fetchall() + + query_current_context = """ + SELECT contexts.Section_Title, contexts.text + FROM contexts + WHERE contexts.ID = ? 
+ """ + cursor.execute(query_current_context, (context_id,)) + current_context = cursor.fetchone() + + conn.close() + + return all_contexts, current_context, outline + + +# Given a list of context data, I want to get the context id of the preivous context +def get_previous_context_id(context_data, current_context_id): + for i in range(len(context_data)): + if context_data[i][4] == current_context_id: + if i == 0: + return None + else: + return context_data[i - 1][4] + + +def get_next_context_id(context_data, current_context_id): + for i in range(len(context_data)): + if context_data[i][4] == current_context_id: + if i == len(context_data) - 1: + return None + else: + return context_data[i + 1][4] + + +# context_id = "DjkRDVj9YwkfmxAFfgXgJ" +# context_data, current_context, outline = get_context_given_contextID(context_id) +# for row in context_data: +# print(f"Section Number: {row[0]}") +# print(f"Number of Tokens: {row[1]}") +# print(f"Section Title: {row[2]}") +# print(f"ID: {row[4]}") +# print("---") +# print(current_context) +# print(outline) +# print(get_previous_context_id(context_data, "DjkRDVj9YwkfmxAFfgXgJ")) +# print(get_next_context_id(context_data, "lcP1p88Pu0hIP-rgWzf2u")) diff --git a/llm-guided-pdf-parsing/llm-guided-retrival/save.py b/llm-guided-pdf-parsing/llm-guided-retrival/save.py new file mode 100644 index 00000000..6d32b0ee --- /dev/null +++ b/llm-guided-pdf-parsing/llm-guided-retrival/save.py @@ -0,0 +1,60 @@ +import json +import sqlite3 +import struct + +import sqlite_vec + +db_path = '/home/guest/ai-ta-backend/UIUC_Chat/pdf-parsing/articles-test.db' + + +def serialize_f32(vector): + return struct.pack('%sf' % len(vector), *vector) + + +def deserialize_f32(blob): + return list(struct.unpack('%sf' % (len(blob) // 4), blob)) + + +conn = sqlite3.connect(db_path) + +conn.enable_load_extension(True) +sqlite_vec.load(conn) +conn.enable_load_extension(False) + +cursor = conn.cursor() +cursor.execute('SELECT `Embedding_nomic_1.5` FROM contexts') +sample_embedding = cursor.fetchone()[0] + +if isinstance(sample_embedding, str): + sample_embedding_vector = json.loads(sample_embedding) +else: + sample_embedding_vector = deserialize_f32(sample_embedding) + +vector_dim = len(sample_embedding_vector) + +conn.execute(f'CREATE VIRTUAL TABLE IF NOT EXISTS vec_item USING vec0(embedding float[{vector_dim}])') + +# Fetch and insert all embeddings +cursor.execute('SELECT `Embedding_nomic_1.5` FROM contexts') +rows = cursor.fetchall() + +with conn: + for row in rows: + embedding = row[0] + conn.execute('INSERT INTO vec_item(embedding) VALUES (?)', (embedding,)) + +query_vector = [0.3] * vector_dim +query_blob = serialize_f32(query_vector) + +result = conn.execute( + ''' + SELECT rowid, distance(embedding, ?) 
as dist + FROM vec_item + ORDER BY dist + LIMIT 3 +''', (query_blob,)).fetchall() + +for row in result: + print(f"{row[0]:.4f}") + +conn.close() diff --git a/llm-guided-pdf-parsing/llm-guided-retrival/sqlite-vec.py b/llm-guided-pdf-parsing/llm-guided-retrival/sqlite-vec.py new file mode 100644 index 00000000..fc46a6d2 --- /dev/null +++ b/llm-guided-pdf-parsing/llm-guided-retrival/sqlite-vec.py @@ -0,0 +1,57 @@ +import json +import sqlite3 +import struct +from typing import List + +import sqlite_vec + +db_path = '/home/guest/ai-ta-backend/UIUC_Chat/pdf-parsing/articles-test.db' + + +def serialize_f32(vector: List[float]) -> bytes: + """Serializes a list of floats into a compact "raw bytes" format.""" + return struct.pack("%sf" % len(vector), *vector) + + +conn = sqlite3.connect(db_path, timeout=30) +conn.enable_load_extension(True) +sqlite_vec.load(conn) +conn.enable_load_extension(False) + +query_vector = [0.1] * 768 +query_blob = serialize_f32(query_vector) + +cursor = conn.cursor() + +cursor.execute('SELECT `Embedding_nomic_1.5` FROM contexts') +rows = cursor.fetchall() + +if rows: + vector_dim = len(json.loads(rows[0][0])) + +conn.execute(f"CREATE VIRTUAL TABLE IF NOT EXISTS vec_items USING vec0(embedding float[{vector_dim}])") +count = 0 +for row in rows: + embedding = json.loads(row[0]) + if embedding is None: + print("Embedding is empty") + count += 1 + continue + count += 1 + if type(embedding[0]) is list: + embedding = embedding[0] + embedding_blob = serialize_f32(embedding) + conn.execute('INSERT INTO vec_items(embedding) VALUES (?)', (embedding_blob,)) + +result = conn.execute( + ''' + SELECT rowid, distance + FROM vec_items + WHERE embedding MATCH ? + ORDER BY distance + LIMIT 3 +''', (query_blob,)).fetchall() + +conn.close() + +print(result) diff --git a/llm-guided-pdf-parsing/llm-guided-retrival/sqlite_optimizer.py b/llm-guided-pdf-parsing/llm-guided-retrival/sqlite_optimizer.py new file mode 100644 index 00000000..f9f0cd00 --- /dev/null +++ b/llm-guided-pdf-parsing/llm-guided-retrival/sqlite_optimizer.py @@ -0,0 +1,48 @@ +# Generating a Python script for SQLite performance optimization + +sqlite_optimization_script = """ +import sqlite3 + +def optimize_sqlite(db_path): + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + # Set WAL mode + cursor.execute("PRAGMA journal_mode=WAL;") + + # Increase cache size + cursor.execute("PRAGMA cache_size = 10000;") # Adjust the value based on your available memory + + # Set synchronous mode to NORMAL or OFF + cursor.execute("PRAGMA synchronous = NORMAL;") + # cursor.execute("PRAGMA synchronous = OFF;") # Uncomment this line if you prefer OFF mode + + # Enable memory-mapped I/O + cursor.execute("PRAGMA mmap_size = 268435456;") # Set an appropriate size based on your system + + # Print current settings to verify + cursor.execute("PRAGMA journal_mode;") + print(f"Journal Mode: {cursor.fetchone()[0]}") + + cursor.execute("PRAGMA cache_size;") + print(f"Cache Size: {cursor.fetchone()[0]}") + + cursor.execute("PRAGMA synchronous;") + print(f"Synchronous: {cursor.fetchone()[0]}") + + cursor.execute("PRAGMA mmap_size;") + print(f"Mmap Size: {cursor.fetchone()[0]}") + + conn.close() + +if __name__ == "__main__": + db_path = "/home/guest/ai-ta-backend/UIUC_Chat/pdf-parsing/articles-test.db" # Replace with your SQLite database path + optimize_sqlite(db_path) +""" + +# Save the script to a Python file +# script_path = "/mnt/data/optimize_sqlite.py" +# with open(script_path, "w") as file: +# file.write(sqlite_optimization_script) + +# 
script_path diff --git a/llm-guided-pdf-parsing/llm-guided-retrival/test.py b/llm-guided-pdf-parsing/llm-guided-retrival/test.py new file mode 100644 index 00000000..c52795e8 --- /dev/null +++ b/llm-guided-pdf-parsing/llm-guided-retrival/test.py @@ -0,0 +1,100 @@ +import json +import os + +from dotenv import load_dotenv +from openai import AzureOpenAI + +load_dotenv() + +api_key = os.getenv("AZURE_OPENAI_KEY") +endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") +deployment = os.getenv("DEPLOYMENT") +api_version = os.getenv("OPENAI_API_VERSION") + +client = AzureOpenAI(azure_endpoint=endpoint, api_key=api_key, api_version=api_version) + +custom_functions = [{ + "name": "multi_Func", + "description": "Call two functions in one call", + "parameters": { + "type": "object", + "properties": { + "keep_current_chunk": { + "name": "keep_current_chunk", + "description": "Check if the current chunk of context is relevant to the query.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The query or question to check relevance" + }, + "chunk": { + "type": "string", + "description": "The current chunk of context to check" + }, + "keep": { + "type": "boolean", + "description": "Whether to keep the current chunk or not" + } + }, + "required": ["query", "chunk", "keep"] + } + }, + "check_preivous_chunk": { + "name": "check_preivous_chunk", + "description": "Check if the previous chunk of context is relevant to the query", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The query or question to check relevance" + }, + "chunk": { + "type": "string", + "description": "The current chunk of context to check" + }, + "check_previous": { + "type": "boolean", + "description": "Whether to check the previous chunk or not" + } + }, + "required": ["query", "chunk", "check_previous"] + } + } + }, + "required": ["keep_current_chunk", "check_preivous_chunk"], + } +}] + +query = "What is shortcomings of existing methodologies?" +chunks = [ + "How might we be able to circumvent the estimation cost of sampling-based attributions? Let us start by examining the existing data attribution methods—specifically, the ones that use only one (or a few) trained models—and evaluate them on our LDS benchmark.", + "This approach can indeed be a useful proxy for evaluating data attribution methods, but the resulting metrics may be too sensitive to the particulars of the auxiliary task and thus make comparisons across different problems and settings difficult.", + "Motivated by the above shortcomings of existing methodologies, we propose a new metric for evaluating data attribution methods.", + "There are also approaches that use more heuristic measures of training example importance for data attribution." +] + +messages = [ + { + "role": + "system", + "content": + "You are an expert at doing literature review and determining if a piece of text is relevant to a research question by calling all four questions: keep_current_chunk, check_preivous_chunk, check_next_chunk, and go_to_url." + }, + { + "role": + "user", + "content": + f"Given the question query is {query}, and the current chunk is {chunks[1]}. Is the current chunk relevant to the query? 
If not, what about the previous" + }, +] +response = client.chat.completions.create( + model=deployment, + messages=messages, + functions=custom_functions, +) + +# Loading the response as a JSON object +print(response) diff --git a/llm-guided-pdf-parsing/pdf-parsing/.gitattributes b/llm-guided-pdf-parsing/pdf-parsing/.gitattributes new file mode 100644 index 00000000..2b7db775 --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/.gitattributes @@ -0,0 +1 @@ +all_pubmed_filenames.txt filter=lfs diff=lfs merge=lfs -text diff --git a/llm-guided-pdf-parsing/pdf-parsing/SQLite.py b/llm-guided-pdf-parsing/pdf-parsing/SQLite.py new file mode 100644 index 00000000..cc562635 --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/SQLite.py @@ -0,0 +1,165 @@ +import json +import os +import sqlite3 +from typing import Dict, List, Optional +from uuid import uuid4 + +import fastnanoid # type: ignore +from qdrant_client import QdrantClient, models + + +def initialize_database(db_path): + """ + Initializes the database and creates the necessary tables if they don't already exist. + + :param db_path: Path to the SQLite database file. + """ + if not os.path.exists(db_path): + conn = sqlite3.connect(db_path) + cur = conn.cursor() + + cur.execute("PRAGMA journal_mode=WAL;") + cur.execute("PRAGMA cache_size = 10000;") + cur.execute("PRAGMA synchronous = NORMAL;") + + cur.execute(''' + CREATE TABLE IF NOT EXISTS articles ( + ID TEXT PRIMARY KEY, + Num_tokens INTEGER, + Title TEXT, + Date_published TEXT, + Journal TEXT, + Authors TEXT, + Outline TEXT, + Minio_Path TEXT + ) + ''') + + cur.execute(''' + CREATE TABLE IF NOT EXISTS sections ( + ID TEXT PRIMARY KEY, + Article_ID TEXT, + num_tokens INTEGER, + Section_Title TEXT, + Section_Num TEXT, + FOREIGN KEY (Article_ID) REFERENCES articles(ID) + ) + ''') + + cur.execute(''' + CREATE TABLE IF NOT EXISTS contexts ( + ID TEXT PRIMARY KEY, + Section_ID TEXT, + text TEXT, + num_tokens INTEGER, + context_idx TEXT, + `Embedding_nomic_1.5` TEXT, + Stop_Reason TEXT, + FOREIGN KEY (Section_ID) REFERENCES sections(ID) + ) + ''') + + conn.commit() + conn.close() + + print("Database and tables created successfully.") + else: + print("Database already exists. No changes made.") + + +def insert_data(metadata: Dict, + total_tokens: int, + grouped_data: List[Dict], + db_path: str, + references: Optional[Dict] = None, + ref_num_tokens: Optional[Dict] = None, + minio_path: Optional[str] = None, + client=None): + """ + Inserts article metadata and sections into the database. + + :param metadata: Dictionary containing article metadata (title, date_published, journal, authors). + :param total_tokens: Total number of tokens in the article. + :param grouped_data: List of dictionaries containing section data (tokens, sec_num, sec_title). + :param db_path: Path to the SQLite database file. + """ + if references is None: + references = {} + if ref_num_tokens is None: + ref_num_tokens = {} + + conn = sqlite3.connect(db_path) + cur = conn.cursor() + + outline = '\n'.join([f"{section['sec_num']}: {section['sec_title']}" for section in grouped_data]) + + article_id = fastnanoid.generate() + authors = metadata.get('authors', []) + if not isinstance(authors, list): + authors = [authors] + filtered_authors = [author for author in authors if author is not None] + cur.execute( + ''' + INSERT INTO articles (ID, Title, Date_published, Journal, Authors, Num_tokens, Outline, Minio_Path) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+ ''', (article_id, metadata['title'], metadata['date_published'], metadata['journal'], ', '.join(filtered_authors), + total_tokens, outline, minio_path)) + + context_idx = 0 + + for section in grouped_data: + section_id = fastnanoid.generate() + cur.execute( + ''' + INSERT INTO sections (ID, Article_ID, num_tokens, Section_Num, Section_Title) + VALUES (?, ?, ?, ?, ?) + ''', (section_id, article_id, section["tokens"], section["sec_num"], section["sec_title"])) + + if section["tokens"] > 7000: + for text, token in zip(section["chunk_text"], section["chunk_tokens"]): + context_id = fastnanoid.generate() + embedding = json.dumps(section["embedding"][0]) + cur.execute( + ''' + INSERT INTO contexts (ID, Section_ID, text, num_tokens, `Embedding_nomic_1.5`, stop_reason, context_idx) + VALUES (?, ?, ?, ?, ?, ?, ?) + ''', (context_id, section_id, text, token, embedding, "Token limit", context_idx)) + + qdrant_insert(embedding, article_id, section_id, minio_path, context_id, client) + + context_idx += 1 + + else: + context_id = fastnanoid.generate() + embedding = json.dumps(section["embedding"][0]) + cur.execute( + ''' + INSERT INTO contexts (ID, Section_ID, text, num_tokens, `Embedding_nomic_1.5`, stop_reason, context_idx) + VALUES (?, ?, ?, ?, ?, ?, ?) + ''', (context_id, section_id, section["text"], section["tokens"], embedding, "Section", context_idx)) + qdrant_insert(embedding, article_id, section_id, minio_path, context_id, client) + context_idx += 1 + + conn.commit() + conn.close() + + +def qdrant_insert(embedding, article_id, section_id, minio_path, context_id, client): + points = [] + qd_embedding = json.loads(embedding if isinstance(embedding, str) else embedding) + points.append( + models.PointStruct( + id=str(uuid4()), + payload={ + "article_id": article_id, + "section_id": section_id, + "minio_path": minio_path, + "context_id": context_id, + }, + vector=qd_embedding, + )) + + client.upsert( + collection_name="embedding", + points=points, + ) diff --git a/llm-guided-pdf-parsing/pdf-parsing/all_pubmed_filenames.txt b/llm-guided-pdf-parsing/pdf-parsing/all_pubmed_filenames.txt new file mode 100644 index 00000000..ece7fa9c --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/all_pubmed_filenames.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e40acaa99c9736d327c2a89dbb722d40d6f30eed48d837b4e19e9075efc99996 +size 524895633 diff --git a/llm-guided-pdf-parsing/pdf-parsing/caddy/CaddyFile b/llm-guided-pdf-parsing/pdf-parsing/caddy/CaddyFile new file mode 100644 index 00000000..46fa5283 --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/caddy/CaddyFile @@ -0,0 +1,15 @@ +:6969 { + reverse_proxy { + to https://grobid-cn064.kastan.ai https://grobid-cn064.kastan.ai https://grobid-cn064.kastan.ai https://grobid-cn064.kastan.ai https://grobid-cn064.kastan.ai https://grobid-cn064.kastan.ai https://grobid-cn064.kastan.ai https://grobid.kastan.ai + lb_policy random + header_up Host {http.reverse_proxy.upstream.hostport} + header_up X-Real-IP {remote_host} + header_down Strict-Transport-Security max-age=0 + transport http { + tls_insecure_skip_verify + read_buffer 100MB + response_header_timeout 120s + dial_timeout 120s + } + } +} diff --git a/llm-guided-pdf-parsing/pdf-parsing/caddy/CaddyFile.old b/llm-guided-pdf-parsing/pdf-parsing/caddy/CaddyFile.old new file mode 100644 index 00000000..8bae070d --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/caddy/CaddyFile.old @@ -0,0 +1,17 @@ +:6969 { + reverse_proxy { + to https://grobid-cn064.kastan.ai 
https://grobid.kastan.ai + lb_policy round_robin + header_up Host {http.reverse_proxy.upstream.hostport} + header_up X-Real-IP {remote_host} + header_up X-Forwarded-For {remote_host} + header_up X-Forwarded-Proto {scheme} + header_down Strict-Transport-Security max-age=0 + transport http { + tls_insecure_skip_verify + read_buffer 100MB + response_header_timeout 120s + dial_timeout 120s + } + } +} diff --git a/llm-guided-pdf-parsing/pdf-parsing/depcirated/grobid_speedtest.py b/llm-guided-pdf-parsing/pdf-parsing/depcirated/grobid_speedtest.py new file mode 100644 index 00000000..6de35e23 --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/depcirated/grobid_speedtest.py @@ -0,0 +1,317 @@ +import os +import tempfile +import time +import traceback +from concurrent.futures import ProcessPoolExecutor, as_completed + +import urllib3 +from doc2json.grobid_client import GrobidClient +from dotenv import load_dotenv +from urllib3 import PoolManager +from urllib3.util.retry import Retry + +from minio import Minio # type: ignore + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +load_dotenv(override=True) + +BUCKET_NAME = 'pubmed' + +input_files = [ + "00001-2022.PMC9108969.pdf", + "00002-2022.PMC9108967.pdf", + "00004-2022.PMC8994962.pdf", + "00004-2023.PMC10259823.pdf", + "00006-2022.PMC8899496.pdf", + "00007-2023.PMC10086694.pdf", + "00008-2022.PMC9379355.pdf", + "00009-2022.PMC9149393.pdf", + "0001.f1.pdf", + "0001.f2.pdf", + "00010-2022.PMC9059131.pdf", + "00010-2023.PMC10277870.pdf", + "00011-2023.PMC10204863.pdf", + "00012-2022.PMC9501655.pdf", + "00013-2023.PMC10204849.pdf", + "00014-2022.PMC8845010.pdf", + "00016-2022.PMC9124870.pdf", + "00017-2022.PMC8995535.pdf", + "00017-2023.PMC10423983.pdf", + "00020-2023.PMC10204814.pdf", + "00021-2023.pdf", + "00024-2022.PMC9574553.pdf", + "00024-2023.PMC10316040.pdf", + "00025-2022.PMC9501840.pdf", + "00026-2023.PMC10204812.pdf", + "00027-2023.PMC10440677.pdf", + "00028-2022.PMC9234436.pdf", + "00030-2022.PMC9339768.pdf", + "00031-2022.PMC9062297.pdf", + "00032-2022.PMC8958218.pdf", + "00033-2022.PMC9511143.pdf", + "00033-2023.PMC10227635.pdf", + "00034-2022.PMC8883039.pdf", + "00034-2023.PMC10204859.pdf", + "00035-2022.PMC9108963.pdf", + "00035-2023.PMC10518870.pdf", + "00036-2023.PMC10291303.pdf", + "00038-2022.PMC9149385.pdf", + "00039-2023.PMC10204822.pdf", + "0004-282X-anp-80-08-0767.PMC9703882.pdf", + "0004-282X-anp-80-08-0770.PMC9703896.pdf", + "0004-282X-anp-80-08-0779.PMC9703884.pdf", + "0004-282X-anp-80-08-0786.PMC9703892.pdf", + "0004-282X-anp-80-08-0794.PMC9703890.pdf", + "0004-282X-anp-80-08-0802.PMC9703888.pdf", + "0004-282X-anp-80-08-0806.PMC9703885.pdf", + "0004-282X-anp-80-08-0812.PMC9703889.pdf", + "0004-282X-anp-80-08-0822.PMC9703895.pdf", + "0004-282X-anp-80-08-0831.PMC9703883.pdf", + "0004-282X-anp-80-08-0837.PMC9703894.pdf", + "0004-282X-anp-80-08-0845.PMC9703891.pdf", + "0004-282X-anp-80-08-0862.PMC9703887.pdf", + "0004-282X-anp-80-08-0867.PMC9703881.pdf", + "0004-282X-anp-80-08-0869.PMC9703893.pdf", + "0004-282X-anp-80-08-0871.PMC9703886.pdf", + "00044-2022.PMC9339767.pdf", + "00046-2022.PMC9080287.pdf", + "00046-2023.PMC10277871.pdf", + "00048-2022.PMC9271757.pdf", + "00048-2023.pdf", + "00051-2023.PMC10152258.pdf", + "00052-2023.PMC10107065.pdf", + "00053-2022.PMC9149391.pdf", + "00053-2023.PMC10204811.pdf", + "00054-2022.PMC9209851.pdf", + "00055-2023.PMC10291312.pdf", + "00056-2022.PMC9035766.pdf", + "00056-2023.PMC10645323.pdf", + "00057-2022.SUPPLEMENT.pdf", + "00057-2022.pdf", + 
"00057-2023.PMC10493709.pdf", + "00058-2022.SUPPLEMENT.pdf", + "00058-2022.pdf", + "00060-2022.PMC9835995.pdf", + "00061-2022.PMC9209849.pdf", + "00063-2022.PMC9234440.pdf", + "00063-2023.PMC9969230.pdf", + "00064-2023.PMC10316043.pdf", + "00065-2022.PMC8994963.pdf", + "00066-2022.PMC8899494.pdf", + "00067-2022.PMC9661281.pdf", + "00068-2022.PMC9209848.pdf", + "00069-2022.PMC8982749.pdf", + "00070-2023.PMC10291313.pdf", + "00072-2022.PMC8943283.pdf", + "00072-2023.PMC10227628.pdf", + "00074-2022.SUPPLEMENT.pdf", + "00074-2022.pdf", + "00074-2023.PMC10493712.pdf", + "00075-2022.PMC9309344.pdf", + "00078-2023.PMC10291305.pdf", + "00079-2023.PMC10291310.pdf", + "00080-2022.PMC9574560.pdf", + "00080-2023.PMC10291299.pdf", + "00082-2022.PMC8990384.pdf", + "00082-2023.PMC10493707.pdf", + "00085-2022.PMC9703146.pdf", + "00087-2023.PMC10152263.pdf", + "00090-2023.PMC10227632.pdf", + "00092-2022.PMC9168080.pdf", + "00093-2022.PMC9271754.pdf", + "00094-2023.PMC10204820.pdf", + "00095-2022.PMC9149384.pdf", + "00098-2023.PMC10440648.pdf", + "00100-2023.PMC10204816.pdf", + "00102-2023.PMC10388177.pdf", + "00104-2023.PMC10204853.pdf", + "00105-2022.PMC9289374.pdf", + "00109-2022.PMC9251366.pdf", + "00110-2022.SUPPLEMENT.pdf", + "00110-2022.pdf", + "00111-2022.PMC9131135.pdf", + "00113-2022.PMC9235056.pdf", + "00114-2022.PMC9530886.pdf", + "00115-2022.PMC9209850.pdf", + "00116-2022.PMC9124868.pdf", + "00117-2022.PMC9209847.pdf", + "00117-2023.PMC10276923.pdf", + "00119-2022.PMC9149388.pdf", + "00120-2022.PMC9465005.pdf", + "00121-2022.pdf", + "00122-2022.PMC9589319.pdf", + "00122-2023.PMC10107053.pdf", + "00123-2023.PMC10291304.pdf", + "00124-2023.PMC10204731.pdf", + "00126-2022.PMC9234427.pdf", + "00127-2022.PMC9234439.pdf", + "00127-2023.PMC10463038.pdf", + "00129-2022.PMC9131124.pdf", + "00131-2022.SUPPLEMENT.pdf", + "00131-2022.pdf", + "00131-2023.PMC10518878.pdf", + "00132-2022.PMC9619251.pdf", + "00135-2023.PMC10423986.pdf", + "00138-2022.PMC9511155.pdf", + "00138-2023.PMC10641575.pdf", + "00139-2022.PMC9379353.pdf", + "00140-2022.FIGURE1.pdf", + "00140-2022.FIGURE2.pdf", + "00140-2022.SUPPLEMENT.pdf", + "00140-2022.pdf", + "00141-2022.PMC9379352.pdf", + "00141-2023.PMC10493713.pdf", + "00142-2022.PMC9309342.pdf", + "00143-2023.PMC10463028.pdf", + "00144-2022.pdf", + "00144-2022.supplement.pdf", + "00145-2023.PMC10316044.pdf", + "00146-2022.PMC9793243.pdf", + "00148-2023.PMC10316033.pdf", + "00150-2022.PMC9234426.pdf", + "00152-2022.PMC9501643.pdf", + "00154-2021.PMC8841990.pdf", + "00154-2022.PMC9209852.pdf", + "00156-2023.PMC10291309.pdf", + "00157-2022.PMC9703150.pdf", + "00158-2023.PMC10388176.pdf", + "00159-2023.PMC10518872.pdf", + "00161-2023.pdf", + "00163-2022.PMC9574559.pdf", + "00163-2023.PMC10518857.pdf", + "00164-2022.PMC9835973.pdf", + "00164-2023.PMC10463025.pdf", + "00165-2022.PMC9589321.pdf", + "00167-2023.PMC10423982.pdf", + "00168-2022.PMC9530887.pdf", + "00168-2023.PMC10105511.pdf", + "00170-2022.SUPPLEMENT.pdf", + "00170-2022.pdf", + "00172-2022.PMC9619250.pdf", + "00174-2022.PMC9661269.pdf", + "00176-2023.PMC10505954.pdf", + "00177-2023.PMC10227636.pdf", + "00179-2022.PMC9149383.pdf", + "00179-2023.PMC10463030.pdf", + "00180-2023.PMC10613990.pdf", + "00181-2023.PMC10291725.pdf", + "00182-2023.PMC10277874.pdf", + "00183-2022.pdf", + "00185-2022.PMC9309343.pdf", + "00186-2022.PMC9589324.pdf", + "00186-2023.PMC10316034.pdf", + "00189-2022.PMC9511157.pdf", + "00190-2022.PMC9720546.pdf", + "00190-2023.PMC10440652.pdf", + "00191-2022.PMC9589331.pdf", + "00193-2022.PMC9703145.pdf", + 
"00193-2023.PMC10316041.pdf", + "00194-2023.PMC10423988.pdf", + "00195-2023.PMC10658642.pdf", + "00196-2022.PMC9548241.pdf", + "00199-2022.PMC9271262.pdf", + "00200-2022.PMC9511138.pdf", + "00202-2022.PMC9234424.pdf", + "00203-2022.PMC9234430.pdf", + "00203-2023.PMC10440649.pdf", + "00204-2022.PMC9661249.pdf", + "00205-2022.PMC9792102.pdf", + "00206-2022.pdf", + "00206-2023.PMC10505950.pdf", +] + +# Create a custom PoolManager with desired settings +http_client = PoolManager( + timeout=300, # 5 minutes timeout + maxsize=200, # Increased pool size + retries=Retry(total=10, backoff_factor=0.1, status_forcelist=[100, 102, 103, 104])) + +minio_client = Minio( + os.environ['MINIO_API_ENDPOINT'], + access_key=os.getenv('MINIO_ACCESS_KEY'), + secret_key=os.getenv('MINIO_SECRET_KEY'), + secure=True, + http_client=http_client, +) + +BASE_TEMP_DIR = 'temp' + +NUM_PARALLEL = 120 + +# Configure Grobid +grobid_config = { + # "grobid_server": os.getenv('GROBID_SERVER'), + # "grobid_server": 'https://grobid-gpub004.kastan.ai/', + "grobid_server": 'https://grobid-cn037.kastan.ai/', + "batch_size": 2000, + "sleep_time": 3, + "generateIDs": False, + "consolidate_header": False, + "consolidate_citations": False, + # "include_raw_citations": True, + "include_raw_citations": False, + "include_raw_affiliations": False, + "timeout": 600, + "n": NUM_PARALLEL, + "max_workers": NUM_PARALLEL, + # "concurrency": 8, +} + +# Process PDF with Grobid +grobidClient = GrobidClient(grobid_config) + + +def process_pdf(file_name): + # Download file from Minio + start_time_minio = time.monotonic() + start_time_grobid = time.monotonic() + response = minio_client.get_object(BUCKET_NAME, file_name) + file_content = response.read() + response.close() + response.release_conn() + print(f"⏰ Minio download: {(time.monotonic() - start_time_minio):.2f} seconds") + + # Create a temporary file + with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf: + temp_pdf.write(file_content) + temp_pdf_path = temp_pdf.name + + # Save to a persistent file + persistent_path = os.path.join('grobid_speedtest_pdfs', file_name) + os.makedirs(os.path.dirname(persistent_path), exist_ok=True) + with open(persistent_path, 'wb') as persistent_pdf: + persistent_pdf.write(file_content) + + # Create a temporary file for TEI output + with tempfile.NamedTemporaryFile(delete=False, suffix='.tei.xml') as temp_tei: + temp_tei_path = temp_tei.name + + grobidClient.process_pdf(temp_pdf_path, temp_tei_path, "processFulltextDocument") + + # Clean up temporary files + os.unlink(temp_pdf_path) + os.unlink(temp_tei_path) + print(f"📜 Grobid runtime: {(time.monotonic() - start_time_grobid):.2f} seconds") + + return f"Processed {file_name}" + + +def main(): + start_time = time.monotonic() + + with ProcessPoolExecutor(max_workers=NUM_PARALLEL) as executor: + futures = [executor.submit(process_pdf, file_name) for file_name in input_files] + + for future in as_completed(futures): + try: + result = future.result() + print(result) + except Exception as e: + print(f"An error occurred: {str(e)}") + traceback.print_exc() + + print(f"Total runtime: {(time.monotonic() - start_time) / 60:.2f} minutes") + + +if __name__ == "__main__": + main() diff --git a/llm-guided-pdf-parsing/pdf-parsing/depcirated/sqlite_optimizer.py b/llm-guided-pdf-parsing/pdf-parsing/depcirated/sqlite_optimizer.py new file mode 100644 index 00000000..c1215a6e --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/depcirated/sqlite_optimizer.py @@ -0,0 +1,48 @@ +# Generating a Python script for 
SQLite performance optimization + +sqlite_optimization_script = """ +import sqlite3 + +def optimize_sqlite(db_path): + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + # Set WAL mode + cursor.execute("PRAGMA journal_mode=WAL;") + + # Increase cache size + cursor.execute("PRAGMA cache_size = 10000;") # Adjust the value based on your available memory + + # Set synchronous mode to NORMAL or OFF + cursor.execute("PRAGMA synchronous = NORMAL;") + # cursor.execute("PRAGMA synchronous = OFF;") # Uncomment this line if you prefer OFF mode + + # Enable memory-mapped I/O + cursor.execute("PRAGMA mmap_size = 268435456;") # Set an appropriate size based on your system + + # Print current settings to verify + cursor.execute("PRAGMA journal_mode;") + print(f"Journal Mode: {cursor.fetchone()[0]}") + + cursor.execute("PRAGMA cache_size;") + print(f"Cache Size: {cursor.fetchone()[0]}") + + cursor.execute("PRAGMA synchronous;") + print(f"Synchronous: {cursor.fetchone()[0]}") + + cursor.execute("PRAGMA mmap_size;") + print(f"Mmap Size: {cursor.fetchone()[0]}") + + conn.close() + +if __name__ == "__main__": + db_path = "articles.db" # Replace with your SQLite database path + optimize_sqlite(db_path) +""" + +# Save the script to a Python file +# script_path = "/mnt/data/optimize_sqlite.py" +# with open(script_path, "w") as file: +# file.write(sqlite_optimization_script) + +# script_path diff --git a/llm-guided-pdf-parsing/pdf-parsing/doc2json/__init__.py b/llm-guided-pdf-parsing/pdf-parsing/doc2json/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/llm-guided-pdf-parsing/pdf-parsing/doc2json/client.py b/llm-guided-pdf-parsing/pdf-parsing/doc2json/client.py new file mode 100644 index 00000000..963f2d83 --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/doc2json/client.py @@ -0,0 +1,190 @@ +""" Generic API Client """ +import json +from copy import deepcopy + +# try: +# from urlparse import urljoin +# except ImportError: +from urllib.parse import urljoin + +import requests + + +class ApiClient(object): + """ Client to interact with a generic Rest API. + + Subclasses should implement functionality accordingly with the provided + service methods, i.e. ``get``, ``post``, ``put`` and ``delete``. + """ + + accept_type = 'application/xml' + api_base = None + + def __init__(self, base_url, username=None, api_key=None, status_endpoint=None, timeout=60): + """ Initialise client. + + Args: + base_url (str): The base URL to the service being used. + username (str): The username to authenticate with. + api_key (str): The API key to authenticate with. + timeout (int): Maximum time before timing out. + """ + self.base_url = base_url + self.username = username + self.api_key = api_key + self.status_endpoint = urljoin(self.base_url, status_endpoint) + self.timeout = timeout + + @staticmethod + def encode(request, data): + """ Add request content data to request body, set Content-type header. + + Should be overridden by subclasses if not using JSON encoding. + + Args: + request (HTTPRequest): The request object. + data (dict, None): Data to be encoded. + + Returns: + HTTPRequest: The request object. + """ + if data is None: + return request + + request.add_header('Content-Type', 'application/json') + request.data = json.dumps(data) + + return request + + @staticmethod + def decode(response): + """ Decode the returned data in the response. + + Should be overridden by subclasses if something else than JSON is + expected. + + Args: + response (HTTPResponse): The response object. 
+ + Returns: + dict or None. + """ + try: + return response.json() + except ValueError as e: + return e.message + + def get_credentials(self): + """ Returns parameters to be added to authenticate the request. + + This lives on its own to make it easier to re-implement it if needed. + + Returns: + dict: A dictionary containing the credentials. + """ + return {"username": self.username, "api_key": self.api_key} + + def call_api( + self, + method, + url, + headers=None, + params=None, + data=None, + files=None, + timeout=None, + ): + """ Call API. + + This returns object containing data, with error details if applicable. + + Args: + method (str): The HTTP method to use. + url (str): Resource location relative to the base URL. + headers (dict or None): Extra request headers to set. + params (dict or None): Query-string parameters. + data (dict or None): Request body contents for POST or PUT requests. + files (dict or None: Files to be passed to the request. + timeout (int): Maximum time before timing out. + + Returns: + ResultParser or ErrorParser. + """ + headers = deepcopy(headers) or {} + headers['Accept'] = self.accept_type + params = deepcopy(params) or {} + data = data or {} + files = files or {} + #if self.username is not None and self.api_key is not None: + # params.update(self.get_credentials()) + r = requests.request( + method, + url, + headers=headers, + params=params, + files=files, + data=data, + timeout=timeout, + ) + + return r, r.status_code + + def get(self, url, params=None, **kwargs): + """ Call the API with a GET request. + + Args: + url (str): Resource location relative to the base URL. + params (dict or None): Query-string parameters. + + Returns: + ResultParser or ErrorParser. + """ + return self.call_api("GET", url, params=params, **kwargs) + + def delete(self, url, params=None, **kwargs): + """ Call the API with a DELETE request. + + Args: + url (str): Resource location relative to the base URL. + params (dict or None): Query-string parameters. + + Returns: + ResultParser or ErrorParser. + """ + return self.call_api("DELETE", url, params=params, **kwargs) + + def put(self, url, params=None, data=None, files=None, **kwargs): + """ Call the API with a PUT request. + + Args: + url (str): Resource location relative to the base URL. + params (dict or None): Query-string parameters. + data (dict or None): Request body contents. + files (dict or None: Files to be passed to the request. + + Returns: + An instance of ResultParser or ErrorParser. + """ + return self.call_api("PUT", url, params=params, data=data, files=files, **kwargs) + + def post(self, url, params=None, data=None, files=None, **kwargs): + """ Call the API with a POST request. + + Args: + url (str): Resource location relative to the base URL. + params (dict or None): Query-string parameters. + data (dict or None): Request body contents. + files (dict or None: Files to be passed to the request. + + Returns: + An instance of ResultParser or ErrorParser. + """ + return self.call_api(method="POST", url=url, params=params, data=data, files=files, **kwargs) + + def service_status(self, **kwargs): + """ Call the API to get the status of the service. + + Returns: + An instance of ResultParser or ErrorParser. 
+ """ + return self.call_api('GET', self.status_endpoint, params={'format': 'json'}, **kwargs) diff --git a/llm-guided-pdf-parsing/pdf-parsing/doc2json/config.py b/llm-guided-pdf-parsing/pdf-parsing/doc2json/config.py new file mode 100644 index 00000000..2cea9a4e --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/doc2json/config.py @@ -0,0 +1,2 @@ +S2ORC_NAME_STRING = 'S2ORC' +S2ORC_VERSION_STRING = '1.0.0' \ No newline at end of file diff --git a/llm-guided-pdf-parsing/pdf-parsing/doc2json/grobid_client.py b/llm-guided-pdf-parsing/pdf-parsing/doc2json/grobid_client.py new file mode 100644 index 00000000..47baf1bd --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/doc2json/grobid_client.py @@ -0,0 +1,228 @@ +import argparse +import glob +import io +import json +import ntpath +import os +import time +from typing import List + +from doc2json.client import ApiClient +''' +This version uses the standard ProcessPoolExecutor for parallelizing the concurrent calls to the GROBID services. +Given the limits of ThreadPoolExecutor (input stored in memory, blocking Executor.map until the whole input +is acquired), it works with batches of PDF of a size indicated in the config.json file (default is 1000 entries). +We are moving from first batch to the second one only when the first is entirely processed - which means it is +slightly sub-optimal, but should scale better. However acquiring a list of million of files in directories would +require something scalable too, which is not implemented for the moment. +''' +# from dotenv import load_dotenv +# load_dotenv(path="../.env", override=True) + +# DEFAULT_GROBID_CONFIG = { +# "grobid_server": "https://grobid.kastan.ai", +# # "grobid_server": os.environ["GROBID_SERVER"], +# # "grobid_server": "http://localhost:6969/", +# # "grobid_port": "443", +# # "grobid_server": "localhost", +# # "grobid_port": "8070", +# "batch_size": 2000, +# "sleep_time": 5, +# "generateIDs": False, +# "consolidate_header": False, +# "consolidate_citations": False, +# "include_raw_citations": True, +# "include_raw_affiliations": False, +# "max_workers": 18, +# } + + +class GrobidClient(ApiClient): + + def __init__(self, config): + self.config = config + self.generate_ids = self.config["generateIDs"] + self.consolidate_header = self.config["consolidate_header"] + self.consolidate_citations = self.config["consolidate_citations"] + self.include_raw_citations = self.config["include_raw_citations"] + self.include_raw_affiliations = self.config["include_raw_affiliations"] + self.max_workers = self.config["max_workers"] + self.grobid_server = self.config["grobid_server"] + # self.grobid_port = self.config["grobid_port"] + self.sleep_time = self.config["sleep_time"] + + def process(self, input: str, output: str, service: str): + batch_size_pdf = self.config['batch_size'] + pdf_files = [] + + for pdf_file in glob.glob(input + "/*.pdf"): + pdf_files.append(pdf_file) + + if len(pdf_files) == batch_size_pdf: + self.process_batch(pdf_files, output, service) + pdf_files = [] + + # last batch + if len(pdf_files) > 0: + self.process_batch(pdf_files, output, service) + + def process_batch(self, pdf_files: List[str], output: str, service: str) -> None: + print(len(pdf_files), "PDF files to process") + for pdf_file in pdf_files: + self.process_pdf(pdf_file, output, service) + + def process_pdf_stream(self, pdf_file: str, pdf_strm: bytes, output: str, service: str) -> str: + # process the stream + files = {'input': (pdf_file, pdf_strm, 'application/pdf', {'Expires': '0'})} + + the_url = 
self.grobid_server + # the_url += ":" + self.grobid_port + the_url += "api/" + service + + # set the GROBID parameters + the_data = {} + if self.generate_ids: + the_data['generateIDs'] = '1' + else: + the_data['generateIDs'] = '0' + + if self.consolidate_header: + the_data['consolidateHeader'] = '1' + else: + the_data['consolidateHeader'] = '0' + + if self.consolidate_citations: + the_data['consolidateCitations'] = '1' + else: + the_data['consolidateCitations'] = '0' + + if self.include_raw_affiliations: + the_data['includeRawAffiliations'] = '1' + else: + the_data['includeRawAffiliations'] = '0' + + if self.include_raw_citations: + the_data['includeRawCitations'] = '1' + else: + the_data['includeRawCitations'] = '0' + + # print(f"Right before post to Grobid. URL: {the_url}, files: {files}, data: {the_data}") + res, status = self.post(url=the_url, files=files, data=the_data, headers={'Accept': 'text/plain'}) + + if status == 200: + return res.text + elif status == 503: + # No more threads available - let's silently retry + time.sleep(self.sleep_time) + return self.process_pdf_stream(pdf_file, pdf_strm, output, service) + elif status == 524: + # Timeout - let's silently retry + time.sleep(self.sleep_time) + return self.process_pdf_stream(pdf_file, pdf_strm, output, service) + else: + print(f'Grobid Failed. Status: {str(status)}. Output: {output}') + raise (ValueError(f"Grobid failed with status {str(status)}")) + + def process_pdf(self, pdf_file: str, output: str, service: str) -> None: + # check if TEI file is already produced + # we use ntpath here to be sure it will work on Windows too + pdf_file_name = ntpath.basename(pdf_file) + filename = os.path.join(output, os.path.splitext(pdf_file_name)[0] + '.tei.xml') + if os.path.isfile(filename): + return + + # print(pdf_file) + pdf_strm = open(pdf_file, 'rb').read() + tei_text = self.process_pdf_stream(pdf_file, pdf_strm, output, service) + # writing TEI file + if tei_text: + with io.open(output, 'w+', encoding='utf8') as tei_file: + tei_file.write(tei_text) + + def process_citation(self, bib_string: str, log_file: str) -> str | None: + # process citation raw string and return corresponding dict + the_data = {'citations': bib_string, 'consolidateCitations': '0'} + + the_url = self.grobid_server + # the_url += ":" + self.grobid_port + the_url += "/api/processCitation" + + for _ in range(5): + try: + res, status = self.post(url=the_url, data=the_data, headers={'Accept': 'text/plain'}) + if status == 503: + time.sleep(self.sleep_time) + continue + elif status != 200: + with open(log_file, "a+") as failed: + failed.write("-- BIBSTR --\n") + failed.write(bib_string + "\n\n") + break + else: + return res.text + except Exception: + continue + + def process_header_names(self, header_string: str, log_file: str) -> str | None: + # process author names from header string + the_data = {'names': header_string} + + the_url = self.grobid_server + # the_url += ":" + self.grobid_port + the_url += "/api/processHeaderNames" + + res, status = self.post(url=the_url, data=the_data, headers={'Accept': 'text/plain'}) + + if status == 503: + time.sleep(self.sleep_time) + return self.process_header_names(header_string, log_file) + elif status != 200: + with open(log_file, "a+") as failed: + failed.write("-- AUTHOR --\n") + failed.write(header_string + "\n\n") + else: + return res.text + + def process_affiliations(self, aff_string: str, log_file: str) -> str | None: + # process affiliation from input string + the_data = {'affiliations': aff_string} + + the_url = 
self.grobid_server + # the_url += ":" + self.grobid_port + the_url += "/api/processAffiliations" + + res, status = self.post(url=the_url, data=the_data, headers={'Accept': 'text/plain'}) + + if status == 503: + time.sleep(self.sleep_time) + return self.process_affiliations(aff_string, log_file) + elif status != 200: + with open(log_file, "a+") as failed: + failed.write("-- AFFILIATION --\n") + failed.write(aff_string + "\n\n") + else: + return res.text + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Client for GROBID services") + parser.add_argument("service", help="one of [processFulltextDocument, processHeaderDocument, processReferences]") + parser.add_argument("--input", default=None, help="path to the directory containing PDF to process") + parser.add_argument("--output", default=None, help="path to the directory where to put the results") + parser.add_argument("--config", default=None, help="path to the config file, default is ./config.json") + + args = parser.parse_args() + + input_path = args.input + config = json.load(open(args.config)) if args.config else DEFAULT_GROBID_CONFIG + output_path = args.output + service = args.service + + client = GrobidClient(config=config) + + start_time = time.time() + + client.process(input_path, output_path, service, n=18) + + runtime = round(time.time() - start_time, 3) + print("runtime: %s seconds " % (runtime)) diff --git a/llm-guided-pdf-parsing/pdf-parsing/doc2json/s2orc.py b/llm-guided-pdf-parsing/pdf-parsing/doc2json/s2orc.py new file mode 100644 index 00000000..09adf566 --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/doc2json/s2orc.py @@ -0,0 +1,507 @@ +""" +S2ORC classes +""" + +from datetime import datetime +from typing import Dict, List, Optional + +from doc2json.config import * + +CORRECT_KEYS = {"issn": "issue", "type": "type_str"} + +SKIP_KEYS = {'link', 'bib_id'} + +REFERENCE_OUTPUT_KEYS = { + 'figure': {'text', 'type_str', 'uris', 'num', 'fig_num'}, + 'table': {'text', 'type_str', 'content', 'num', 'html'}, + 'footnote': {'text', 'type_str', 'num'}, + 'section': {'text', 'type_str', 'num', 'parent'}, + 'equation': {'text', 'type_str', 'latex', 'mathml', 'num'} +} + +METADATA_KEYS = {"title", "authors", "year", "venue", "identifiers"} + + +class ReferenceEntry: + """ + Class for representing S2ORC figure and table references + + An example json representation (values are examples, not accurate): + + { + "FIGREF0": { + "text": "FIG. 2. 
Depth profiles of...", + "latex": null, + "type": "figure" + }, + "TABREF2": { + "text": "Diversity indices of...", + "latex": null, + "type": "table", + "content": "", + "html": "" + } + } + """ + + def __init__(self, + ref_id: str, + text: str, + type_str: str, + latex: Optional[str] = None, + mathml: Optional[str] = None, + content: Optional[str] = None, + html: Optional[str] = None, + uris: Optional[List[str]] = None, + num: Optional[str] = None, + parent: Optional[str] = None, + fig_num: Optional[str] = None): + self.ref_id = ref_id + self.text = text + self.type_str = type_str + self.latex = latex + self.mathml = mathml + self.content = content + self.html = html + self.uris = uris + self.num = num + self.parent = parent + self.fig_num = fig_num + + def as_json(self): + keep_keys = REFERENCE_OUTPUT_KEYS.get(self.type_str, None) + if keep_keys: + return {k: self.__getattribute__(k) for k in keep_keys} + else: + return { + "text": self.text, + "type": self.type_str, + "latex": self.latex, + "mathml": self.mathml, + "content": self.content, + "html": self.html, + "uris": self.uris, + "num": self.num, + "parent": self.parent, + "fig_num": self.fig_num + } + + +class BibliographyEntry: + """ + Class for representing S2ORC parsed bibliography entries + + An example json representation (values are examples, not accurate): + + { + "title": "Mobility Reports...", + "authors": [ + { + "first": "A", + "middle": ["A"], + "last": "Haija", + "suffix": "" + } + ], + "year": 2015, + "venue": "IEEE Wireless Commun. Mag", + "volume": "42", + "issn": "9", + "pages": "80--92", + "other_ids": { + "doi": [ + "10.1109/TWC.2014.2360196" + ], + + } + } + + """ + + def __init__(self, + bib_id: str, + title: str, + authors: List[Dict[str, str]], + ref_id: Optional[str] = None, + year: Optional[int] = None, + venue: Optional[str] = None, + volume: Optional[str] = None, + issue: Optional[str] = None, + pages: Optional[str] = None, + other_ids: Dict[str, List] = None, + num: Optional[int] = None, + urls: Optional[List] = None, + raw_text: Optional[str] = None, + links: Optional[List] = None): + self.bib_id = bib_id + self.ref_id = ref_id + self.title = title + self.authors = authors + self.year = year + self.venue = venue + self.volume = volume + self.issue = issue + self.pages = pages + self.other_ids = other_ids + self.num = num + self.urls = urls + self.raw_text = raw_text + self.links = links + + def as_json(self): + return { + "ref_id": self.ref_id, + "title": self.title, + "authors": self.authors, + "year": self.year, + "venue": self.venue, + "volume": self.volume, + "issue": self.issue, + "pages": self.pages, + "other_ids": self.other_ids, + "num": self.num, + "urls": self.urls, + "raw_text": self.raw_text, + "links": self.links + } + + +class Affiliation: + """ + Class for representing affiliation info + + Example: + { + "laboratory": "Key Laboratory of Urban Environment and Health", + "institution": "Chinese Academy of Sciences", + "location": { + "postCode": "361021", + "settlement": "Xiamen", + "country": "People's Republic of China" + } + """ + + def __init__(self, laboratory: str, institution: str, location: Dict): + self.laboratory = laboratory + self.institution = institution + self.location = location + + def as_json(self): + return {"laboratory": self.laboratory, "institution": self.institution, "location": self.location} + + +class Author: + """ + Class for representing paper authors + + Example: + + { + "first": "Anyi", + "middle": [], + "last": "Hu", + "suffix": "", + "affiliation": { + 
"laboratory": "Key Laboratory of Urban Environment and Health", + "institution": "Chinese Academy of Sciences", + "location": { + "postCode": "361021", + "settlement": "Xiamen", + "country": "People's Republic of China" + } + }, + "email": "" + } + """ + + def __init__(self, + first: str, + middle: List[str], + last: str, + suffix: str, + affiliation: Optional[Dict] = None, + email: Optional[str] = None): + self.first = first + self.middle = middle + self.last = last + self.suffix = suffix + self.affiliation = Affiliation(**affiliation) if affiliation else {} + self.email = email + + def as_json(self): + return { + "first": self.first, + "middle": self.middle, + "last": self.last, + "suffix": self.suffix, + "affiliation": self.affiliation.as_json() if self.affiliation else {}, + "email": self.email + } + + +class Metadata: + """ + Class for representing paper metadata + + Example: + { + "title": "Niche Partitioning...", + "authors": [ + { + "first": "Anyi", + "middle": [], + "last": "Hu", + "suffix": "", + "affiliation": { + "laboratory": "Key Laboratory of Urban Environment and Health", + "institution": "Chinese Academy of Sciences", + "location": { + "postCode": "361021", + "settlement": "Xiamen", + "country": "People's Republic of China" + } + }, + "email": "" + } + ], + "year": "2011-11" + } + """ + + def __init__(self, + title: str, + authors: List[Dict], + year: Optional[str] = None, + venue: Optional[str] = None, + identifiers: Optional[Dict] = {}): + self.title = title + self.authors = [Author(**author) for author in authors] + self.year = year + self.venue = venue + self.identifiers = identifiers + + def as_json(self): + return { + "title": self.title, + "authors": [author.as_json() for author in self.authors], + "year": self.year, + "venue": self.venue, + "identifiers": self.identifiers + } + + +class Paragraph: + """ + Class for representing a parsed paragraph from Grobid xml + All xml tags are removed from the paragraph text, all figures, equations, and tables are replaced + with a special token that maps to a reference identifier + Citation mention spans and section header are extracted + + An example json representation (values are examples, not accurate): + + { + "text": "Formal language techniques BID1 may be used to study FORMULA0 (see REF0)...", + "mention_spans": [ + { + "start": 27, + "end": 31, + "text": "[1]") + ], + "ref_spans": [ + { + "start": , + "end": , + "text": "Fig. 
1" + } + ], + "eq_spans": [ + { + "start": 53, + "end": 61, + "text": "α = 1", + "latex": "\\alpha = 1", + "ref_id": null + } + ], + "section": "Abstract" + } + """ + + def __init__(self, + text: str, + cite_spans: List[Dict], + ref_spans: List[Dict], + eq_spans: Optional[List[Dict]] = [], + section: Optional = None, + sec_num: Optional = None): + self.text = text + self.cite_spans = cite_spans + self.ref_spans = ref_spans + self.eq_spans = eq_spans + if type(section) == str: + if section: + sec_parts = section.split('::') + section_list = [[None, sec_name] for sec_name in sec_parts] + else: + section_list = None + if section_list and sec_num: + section_list[-1][0] = sec_num + else: + section_list = section + self.section = section_list + + def as_json(self): + return { + "text": self.text, + "cite_spans": self.cite_spans, + "ref_spans": self.ref_spans, + "eq_spans": self.eq_spans, + "section": '::'.join([sec[1] for sec in self.section]) if self.section else "", + "sec_num": self.section[-1][0] if self.section else None + } + + +class Paper: + """ + Class for representing a parsed S2ORC paper + """ + + def __init__(self, paper_id: str, pdf_hash: str, metadata: Dict, abstract: List[Dict], body_text: List[Dict], + back_matter: List[Dict], bib_entries: Dict, ref_entries: Dict): + self.paper_id = paper_id + self.pdf_hash = pdf_hash + self.metadata = Metadata(**metadata) + self.abstract = [Paragraph(**para) for para in abstract] + self.body_text = [Paragraph(**para) for para in body_text] + self.back_matter = [Paragraph(**para) for para in back_matter] + self.bib_entries = [ + BibliographyEntry(bib_id=key, + **{ + CORRECT_KEYS[k] if k in CORRECT_KEYS else k: v + for k, v in bib.items() + if k not in SKIP_KEYS + }) + for key, bib in bib_entries.items() + ] + self.ref_entries = [ + ReferenceEntry(ref_id=key, + **{ + CORRECT_KEYS[k] if k in CORRECT_KEYS else k: v for k, v in ref.items() if k != 'ref_id' + }) for key, ref in ref_entries.items() + ] + + def as_json(self): + return { + "paper_id": self.paper_id, + "pdf_hash": self.pdf_hash, + "metadata": self.metadata.as_json(), + "abstract": [para.as_json() for para in self.abstract], + "body_text": [para.as_json() for para in self.body_text], + "back_matter": [para.as_json() for para in self.back_matter], + "bib_entries": { + bib.bib_id: bib.as_json() for bib in self.bib_entries + }, + "ref_entries": { + ref.ref_id: ref.as_json() for ref in self.ref_entries + } + } + + @property + def raw_abstract_text(self) -> str: + """ + Get all the body text joined by a newline + :return: + """ + return '\n'.join([para.text for para in self.abstract]) + + @property + def raw_body_text(self) -> str: + """ + Get all the body text joined by a newline + :return: + """ + return '\n'.join([para.text for para in self.body_text]) + + def release_json(self, doc_type: str = "pdf"): + """ + Return in release JSON format + :return: + """ + # TODO: not fully implemented; metadata format is not right; extra keys in some places + release_dict = {"paper_id": self.paper_id} + release_dict.update({ + "header": { + "generated_with": f'{S2ORC_NAME_STRING} {S2ORC_VERSION_STRING}', + "date_generated": datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ') + } + }) + release_dict.update(self.metadata.as_json()) + release_dict.update({"abstract": self.raw_abstract_text}) + release_dict.update({ + f"{doc_type}_parse": { + "paper_id": self.paper_id, + "_pdf_hash": self.pdf_hash, + "abstract": [para.as_json() for para in self.abstract], + "body_text": [para.as_json() for para in 
self.body_text], + "back_matter": [para.as_json() for para in self.back_matter], + "bib_entries": { + bib.bib_id: bib.as_json() for bib in self.bib_entries + }, + "ref_entries": { + ref.ref_id: ref.as_json() for ref in self.ref_entries + } + } + }) + return release_dict + + +def load_s2orc(paper_dict: Dict) -> Paper: + """ + Load release S2ORC into Paper class + :param paper_dict: + :return: + """ + paper_id = paper_dict['paper_id'] + pdf_hash = paper_dict.get('_pdf_hash', paper_dict.get('s2_pdf_hash', None)) + + # 2019 gorc parses + if "grobid_parse" in paper_dict and paper_dict.get("grobid_parse"): + metadata = {k: v for k, v in paper_dict["metadata"].items() if k in METADATA_KEYS} + abstract = paper_dict.get("grobid_parse").get("abstract", []) + body_text = paper_dict.get("grobid_parse").get("body_text", []) + back_matter = paper_dict.get("grobid_parse").get("back_matter", []) + bib_entries = paper_dict.get("grobid_parse").get("bib_entries", {}) + for k, v in bib_entries.items(): + if 'link' in v: + v['links'] = [v['link']] + ref_entries = paper_dict.get("grobid_parse").get("ref_entries", {}) + # current and 2020 s2orc release_json + elif ("pdf_parse" in paper_dict and paper_dict.get("pdf_parse")) or ("body_text" in paper_dict and + paper_dict.get("body_text")): + if "pdf_parse" in paper_dict: + paper_dict = paper_dict["pdf_parse"] + if paper_dict.get("metadata"): + metadata = {k: v for k, v in paper_dict.get("metadata").items() if k in METADATA_KEYS} + # 2020 s2orc releases (metadata is separate) + else: + metadata = {"title": None, "authors": [], "year": None} + abstract = paper_dict.get("abstract", []) + body_text = paper_dict.get("body_text", []) + back_matter = paper_dict.get("back_matter", []) + bib_entries = paper_dict.get("bib_entries", {}) + for k, v in bib_entries.items(): + if 'link' in v: + v['links'] = [v['link']] + ref_entries = paper_dict.get("ref_entries", {}) + else: + print(paper_id) + raise NotImplementedError("Unknown S2ORC file type!") + + return Paper(paper_id=paper_id, + pdf_hash=pdf_hash, + metadata=metadata, + abstract=abstract, + body_text=body_text, + back_matter=back_matter, + bib_entries=bib_entries, + ref_entries=ref_entries) diff --git a/llm-guided-pdf-parsing/pdf-parsing/doc2json/tei_to_json.py b/llm-guided-pdf-parsing/pdf-parsing/doc2json/tei_to_json.py new file mode 100644 index 00000000..dd005a99 --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/doc2json/tei_to_json.py @@ -0,0 +1,693 @@ +#!/usr/bin/env python + +import os +import re +import sys +from typing import Dict, List, Tuple + +import bs4 +from bs4 import BeautifulSoup, NavigableString +from doc2json.s2orc import Paper +from doc2json.utils.citation_util import ( + BRACKET_REGEX, + BRACKET_STYLE_THRESHOLD, + SINGLE_BRACKET_REGEX, + _clean_empty_and_duplicate_authors_from_grobid_parse, + is_expansion_string, +) +from doc2json.utils.grobid_util import ( + extract_paper_metadata_from_grobid_xml, + parse_bib_entry, +) +from doc2json.utils.refspan_util import sub_spans_and_update_indices + +REPLACE_TABLE_TOKS = { + "": "", + "": "", + "": "", + "": "", + "": "", + "": "", + " List[Dict]: + """ + Finds all bibliography entries in a grobid xml. 
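+ Each biblStruct element is parsed with parse_bib_entry; entries without a title are dropped.
+ The listBibl element is then decomposed (removed from the soup).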
+ """ + bibliography = soup.listBibl + if bibliography is None: + return [] + + entries = bibliography.find_all("biblStruct") + + structured_entries = [] + for entry in entries: + bib_entry = parse_bib_entry(entry) + # add bib entry only if it has a title + if bib_entry['title']: + structured_entries.append(bib_entry) + + bibliography.decompose() + + return structured_entries + + +def extract_formulas_from_tei_xml(sp: BeautifulSoup) -> None: + """ + Replace all formulas with the text + :param sp: + :return: + """ + for eq in sp.find_all('formula'): + eq.replace_with(sp.new_string(eq.text.strip())) + + +def table_to_html(table: bs4.element.Tag) -> str: + """ + Sub table tags with html table tags + :param table_str: + :return: + """ + for tag in table: + if tag.name != 'row': + print(f'Unknown table subtag: {tag.name}') + tag.decompose() + table_str = str(table) + for token, subtoken in REPLACE_TABLE_TOKS.items(): + table_str = table_str.replace(token, subtoken) + return table_str + + +def extract_figures_and_tables_from_tei_xml(sp: BeautifulSoup) -> Dict[str, Dict]: + """ + Generate figure and table dicts + :param sp: + :return: + """ + ref_map = dict() + + for fig in sp.find_all('figure'): + try: + if fig.name and fig.get('xml:id'): + if fig.get('type') == 'table': + ref_map[normalize_grobid_id(fig.get('xml:id'))] = { + "text": fig.figDesc.text.strip() if fig.figDesc else fig.head.text.strip() if fig.head else "", + "latex": None, + "type": "table", + "content": table_to_html(fig.table), + "fig_num": fig.get('xml:id') + } + else: + if True in [char.isdigit() for char in fig.findNext('head').findNext('label')]: + fig_num = fig.findNext('head').findNext('label').contents[0] + else: + fig_num = None + ref_map[normalize_grobid_id(fig.get('xml:id'))] = { + "text": fig.figDesc.text.strip() if fig.figDesc else "", + "latex": None, + "type": "figure", + "content": "", + "fig_num": fig_num + } + except AttributeError: + continue + fig.decompose() + + return ref_map + + +def check_if_citations_are_bracket_style(sp: BeautifulSoup) -> bool: + """ + Check if the document has bracket style citations + :param sp: + :return: + """ + cite_strings = [] + if sp.body: + for div in sp.body.find_all('div'): + if div.head: + continue + for rtag in div.find_all('ref'): + ref_type = rtag.get('type') + if ref_type == 'bibr': + cite_strings.append(rtag.text.strip()) + + # check how many match bracket style + bracket_style = [bool(BRACKET_REGEX.match(cite_str)) for cite_str in cite_strings] + + # return true if + if sum(bracket_style) > BRACKET_STYLE_THRESHOLD: + return True + + return False + + +def sub_all_note_tags(sp: BeautifulSoup) -> BeautifulSoup: + """ + Sub all note tags with p tags + :param para_el: + :param sp: + :return: + """ + for ntag in sp.find_all('note'): + p_tag = sp.new_tag('p') + p_tag.string = ntag.text.strip() + ntag.replace_with(p_tag) + return sp + + +def process_formulas_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup) -> None: + """ + Process all formulas in paragraph and replace with text and label + :param para_el: + :param sp: + :return: + """ + for ftag in para_el.find_all('formula'): + # get label if exists and insert a space between formula and label + if ftag.label: + label = ' ' + ftag.label.text + ftag.label.decompose() + else: + label = '' + ftag.replace_with(sp.new_string(f'{ftag.text.strip()}{label}')) + + +def process_references_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, refs: Dict) -> Dict: + """ + Process all references in paragraph and generate a dict 
that contains (type, ref_id, surface_form) + :param para_el: + :param sp: + :param refs: + :return: + """ + tokgen = UniqTokenGenerator('REFTOKEN') + ref_dict = dict() + for rtag in para_el.find_all('ref'): + try: + ref_type = rtag.get('type') + # skip if citation + if ref_type == 'bibr': + continue + if ref_type == 'table' or ref_type == 'figure': + ref_id = rtag.get('target') + if ref_id and normalize_grobid_id(ref_id) in refs: + # normalize reference string + rtag_string = normalize_grobid_id(ref_id) + else: + rtag_string = None + # add to ref set + ref_key = tokgen.next() + ref_dict[ref_key] = (rtag_string, rtag.text.strip(), ref_type) + rtag.replace_with(sp.new_string(f" {ref_key} ")) + else: + # replace with surface form + rtag.replace_with(sp.new_string(rtag.text.strip())) + except AttributeError: + continue + return ref_dict + + +def process_citations_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, bibs: Dict, bracket: bool) -> Dict: + """ + Process all citations in paragraph and generate a dict for surface forms + :param para_el: + :param sp: + :param bibs: + :param bracket: + :return: + """ + + # CHECK if range between two surface forms is appropriate for bracket style expansion + def _get_surface_range(start_surface, end_surface): + span1_match = SINGLE_BRACKET_REGEX.match(start_surface) + span2_match = SINGLE_BRACKET_REGEX.match(end_surface) + if span1_match and span2_match: + # get numbers corresponding to citations + span1_num = int(span1_match.group(1)) + span2_num = int(span2_match.group(1)) + # expand if range is between 1 and 20 + if 1 < span2_num - span1_num < 20: + return span1_num, span2_num + return None + + # CREATE BIBREF range between two reference ids, e.g. BIBREF1-BIBREF4 -> BIBREF1 BIBREF2 BIBREF3 BIBREF4 + def _create_ref_id_range(start_ref_id, end_ref_id): + start_ref_num = int(start_ref_id[6:]) + end_ref_num = int(end_ref_id[6:]) + return [f'BIBREF{curr_ref_num}' for curr_ref_num in range(start_ref_num, end_ref_num + 1)] + + # CREATE surface form range between two bracket strings, e.g. [1]-[4] -> [1] [2] [3] [4] + def _create_surface_range(start_number, end_number): + return [f'[{n}]' for n in range(start_number, end_number + 1)] + + # create citation dict with keywords + cite_map = dict() + tokgen = UniqTokenGenerator('CITETOKEN') + + for rtag in para_el.find_all('ref'): + try: + # get surface span, e.g. [3] + surface_span = rtag.text.strip() + + # check if target is available (#b2 -> BID2) + if rtag.get('target'): + # normalize reference string + rtag_ref_id = normalize_grobid_id(rtag.get('target')) + + # skip if rtag ref_id not in bibliography + if rtag_ref_id not in bibs: + cite_key = tokgen.next() + rtag.replace_with(sp.new_string(f" {cite_key} ")) + cite_map[cite_key] = (None, surface_span) + continue + + # if bracket style, only keep if surface form is bracket + if bracket: + # valid bracket span + if surface_span and (surface_span[0] == '[' or surface_span[-1] == ']' or surface_span[-1] == ','): + pass + # invalid, replace tag with surface form and continue to next ref tag + else: + rtag.replace_with(sp.new_string(f" {surface_span} ")) + continue + # not bracket, add cite span and move on + else: + cite_key = tokgen.next() + rtag.replace_with(sp.new_string(f" {cite_key} ")) + cite_map[cite_key] = (rtag_ref_id, surface_span) + continue + + ### EXTRA PROCESSING FOR BRACKET STYLE CITATIONS; EXPAND RANGES ### + # look backward for range marker, e.g. 
[1]-*[3]* + backward_between_span = "" + for sib in rtag.previous_siblings: + if sib.name == 'ref': + break + elif type(sib) == NavigableString: + backward_between_span += sib + else: + break + + # check if there's a backwards expansion, e.g. need to expand [1]-[3] -> [1] [2] [3] + if is_expansion_string(backward_between_span): + # get surface number range + surface_num_range = _get_surface_range(rtag.find_previous_sibling('ref').text.strip(), surface_span) + # if the surface number range is reasonable (range < 20, in order), EXPAND + if surface_num_range: + # delete previous ref tag and anything in between (i.e. delete "-" and extra spaces) + for sib in rtag.previous_siblings: + if sib.name == 'ref': + break + elif type(sib) == NavigableString: + sib.replace_with(sp.new_string("")) + else: + break + + # get ref id of previous ref, e.g. [1] (#b0 -> BID0) + previous_rtag = rtag.find_previous_sibling('ref') + previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get('target')) + previous_rtag.decompose() + + # replace this ref tag with the full range expansion, e.g. [3] (#b2 -> BID1 BID2) + id_range = _create_ref_id_range(previous_rtag_ref_id, rtag_ref_id) + surface_range = _create_surface_range(surface_num_range[0], surface_num_range[1]) + replace_string = '' + for range_ref_id, range_surface_form in zip(id_range, surface_range): + # only replace if ref id is in bibliography, else add none + if range_ref_id in bibs: + cite_key = tokgen.next() + cite_map[cite_key] = (range_ref_id, range_surface_form) + else: + cite_key = tokgen.next() + cite_map[cite_key] = (None, range_surface_form) + replace_string += cite_key + ' ' + rtag.replace_with(sp.new_string(f" {replace_string} ")) + # ELSE do not expand backwards and replace previous and current rtag with appropriate ref id + else: + # add mapping between ref id and surface form for previous ref tag + previous_rtag = rtag.find_previous_sibling('ref') + previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get('target')) + previous_rtag_surface = previous_rtag.text.strip() + cite_key = tokgen.next() + previous_rtag.replace_with(sp.new_string(f" {cite_key} ")) + cite_map[cite_key] = (previous_rtag_ref_id, previous_rtag_surface) + + # add mapping between ref id and surface form for current reftag + cite_key = tokgen.next() + rtag.replace_with(sp.new_string(f" {cite_key} ")) + cite_map[cite_key] = (rtag_ref_id, surface_span) + else: + # look forward and see if expansion string, e.g. 
*[1]*-[3] + forward_between_span = "" + for sib in rtag.next_siblings: + if sib.name == 'ref': + break + elif type(sib) == NavigableString: + forward_between_span += sib + else: + break + # look forward for range marker (if is a range, continue -- range will be expanded + # when we get to the second value) + if is_expansion_string(forward_between_span): + continue + # else treat like normal reference + else: + cite_key = tokgen.next() + rtag.replace_with(sp.new_string(f" {cite_key} ")) + cite_map[cite_key] = (rtag_ref_id, surface_span) + + else: + cite_key = tokgen.next() + rtag.replace_with(sp.new_string(f" {cite_key} ")) + cite_map[cite_key] = (None, surface_span) + except AttributeError: + continue + + return cite_map + + +def process_paragraph(sp: BeautifulSoup, para_el: bs4.element.Tag, section_names: List[Tuple], bib_dict: Dict, + ref_dict: Dict, bracket: bool) -> Dict: + """ + Process one paragraph + :param sp: + :param para_el: + :param section_names: + :param bib_dict: + :param ref_dict: + :param bracket: if bracket style, expand and clean up citations + :return: + """ + # return empty paragraph if no text + if not para_el.text: + return {'text': "", 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': section_names} + + # replace formulas with formula text + process_formulas_in_paragraph(para_el, sp) + + # get references to tables and figures + ref_map = process_references_in_paragraph(para_el, sp, ref_dict) + + # generate citation map for paragraph element (keep only cite spans with bib entry or unlinked) + cite_map = process_citations_in_paragraph(para_el, sp, bib_dict, bracket) + + # substitute space characters + para_text = re.sub(r'\s+', ' ', para_el.text) + para_text = re.sub(r'\s', ' ', para_text) + + # get all cite and ref spans + all_spans_to_replace = [] + for span in re.finditer(r'(CITETOKEN\d+)', para_text): + uniq_token = span.group() + ref_id, surface_text = cite_map[uniq_token] + all_spans_to_replace.append((span.start(), span.start() + len(uniq_token), uniq_token, surface_text)) + for span in re.finditer(r'(REFTOKEN\d+)', para_text): + uniq_token = span.group() + ref_id, surface_text, ref_type = ref_map[uniq_token] + all_spans_to_replace.append((span.start(), span.start() + len(uniq_token), uniq_token, surface_text)) + + # replace cite and ref spans and create json blobs + para_text, all_spans_to_replace = sub_spans_and_update_indices(all_spans_to_replace, para_text) + + cite_span_blobs = [{ + "start": start, + "end": end, + "text": surface, + "ref_id": cite_map[token][0] + } for start, end, token, surface in all_spans_to_replace if token.startswith('CITETOKEN')] + + ref_span_blobs = [{ + "start": start, + "end": end, + "text": surface, + "ref_id": ref_map[token][0] + } for start, end, token, surface in all_spans_to_replace if token.startswith('REFTOKEN')] + + for cite_blob in cite_span_blobs: + assert para_text[cite_blob["start"]:cite_blob["end"]] == cite_blob["text"] + + for ref_blob in ref_span_blobs: + assert para_text[ref_blob["start"]:ref_blob["end"]] == ref_blob["text"] + + return { + 'text': para_text, + 'cite_spans': cite_span_blobs, + 'ref_spans': ref_span_blobs, + 'eq_spans': [], + 'section': section_names + } + + +def extract_abstract_from_tei_xml(sp: BeautifulSoup, bib_dict: Dict, ref_dict: Dict, + cleanup_bracket: bool) -> List[Dict]: + """ + Parse abstract from soup + :param sp: + :param bib_dict: + :param ref_dict: + :param cleanup_bracket: + :return: + """ + abstract_text = [] + if sp.abstract: + # process all divs + if sp.abstract.div: + 
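+ # Abstract paragraphs may be wrapped in <div> elements (handled in this branch) or emitted
+ # as bare <p> tags (handled in the elif below); each paragraph is processed with the
+ # section label "Abstract".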
for div in sp.abstract.find_all('div'): + if div.text: + if div.p: + for para in div.find_all('p'): + if para.text: + abstract_text.append( + process_paragraph(sp, para, [(None, "Abstract")], bib_dict, ref_dict, cleanup_bracket)) + else: + if div.text: + abstract_text.append(process_paragraph(sp, div, [(None, "Abstract")], bib_dict, ref_dict, + cleanup_bracket)) + # process all paragraphs + elif sp.abstract.p: + for para in sp.abstract.find_all('p'): + if para.text: + abstract_text.append(process_paragraph(sp, para, [(None, "Abstract")], bib_dict, ref_dict, cleanup_bracket)) + # else just try to get the text + else: + if sp.abstract.text: + abstract_text.append( + process_paragraph(sp, sp.abstract, [(None, "Abstract")], bib_dict, ref_dict, cleanup_bracket)) + sp.abstract.decompose() + return abstract_text + + +def extract_body_text_from_div(sp: BeautifulSoup, div: bs4.element.Tag, sections: List[Tuple], bib_dict: Dict, + ref_dict: Dict, cleanup_bracket: bool) -> List[Dict]: + """ + Parse body text from soup + :param sp: + :param div: + :param sections: + :param bib_dict: + :param ref_dict: + :param cleanup_bracket: + :return: + """ + chunks = [] + # check if nested divs; recursively process + if div.div: + for subdiv in div.find_all('div'): + # has header, add to section list and process + if subdiv.head: + chunks += extract_body_text_from_div(sp, subdiv, + sections + [(subdiv.head.get('n', None), subdiv.head.text.strip())], + bib_dict, ref_dict, cleanup_bracket) + subdiv.head.decompose() + # no header, process with same section list + else: + chunks += extract_body_text_from_div(sp, subdiv, sections, bib_dict, ref_dict, cleanup_bracket) + # process tags individuals + for tag in div: + try: + if tag.name == 'p': + if tag.text: + chunks.append(process_paragraph(sp, tag, sections, bib_dict, ref_dict, cleanup_bracket)) + elif tag.name == 'formula': + # e.g. Y = W T X. 
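+ # The formula body is replaced by an "EQUATION" placeholder chunk; the raw formula text
+ # and its label are preserved in eq_spans. If the <formula> tag has no <label>,
+ # tag.label is None, the AttributeError is caught below, and the tag falls through
+ # to be processed as a plain paragraph.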
+ label = tag.label.text + tag.label.decompose() + eq_text = tag.text + chunks.append({ + 'text': 'EQUATION', + 'cite_spans': [], + 'ref_spans': [], + 'eq_spans': [{ + "start": 0, + "end": 8, + "text": "EQUATION", + "ref_id": "EQREF", + "raw_str": eq_text, + "eq_num": label + }], + 'section': sections + }) + except AttributeError: + if tag.text: + chunks.append(process_paragraph(sp, tag, sections, bib_dict, ref_dict, cleanup_bracket)) + + return chunks + + +def extract_body_text_from_tei_xml(sp: BeautifulSoup, bib_dict: Dict, ref_dict: Dict, + cleanup_bracket: bool) -> List[Dict]: + """ + Parse body text from soup + :param sp: + :param bib_dict: + :param ref_dict: + :param cleanup_bracket: + :return: + """ + body_text = [] + if sp.body: + body_text = extract_body_text_from_div(sp, sp.body, [], bib_dict, ref_dict, cleanup_bracket) + sp.body.decompose() + return body_text + + +def extract_back_matter_from_tei_xml(sp: BeautifulSoup, bib_dict: Dict, ref_dict: Dict, + cleanup_bracket: bool) -> List[Dict]: + """ + Parse back matter from soup + :param sp: + :param bib_dict: + :param ref_dict: + :param cleanup_bracket: + :return: + """ + back_text = [] + + if sp.back: + for div in sp.back.find_all('div'): + if div.get('type'): + section_type = div.get('type') + else: + section_type = '' + + for child_div in div.find_all('div'): + if child_div.head: + section_title = child_div.head.text.strip() + section_num = child_div.head.get('n', None) + child_div.head.decompose() + else: + section_title = section_type + section_num = None + if child_div.text: + if child_div.text: + back_text.append( + process_paragraph(sp, child_div, [(section_num, section_title)], bib_dict, ref_dict, cleanup_bracket)) + sp.back.decompose() + return back_text + + +def convert_tei_xml_soup_to_s2orc_json(soup: BeautifulSoup, paper_id: str, pdf_hash: str) -> Paper: + """ + Convert Grobid TEI XML to S2ORC json format + :param soup: BeautifulSoup of XML file content + :param paper_id: name of file + :param pdf_hash: hash of PDF + :return: + """ + # extract metadata + metadata = extract_paper_metadata_from_grobid_xml(soup.fileDesc) + # clean metadata authors (remove dupes etc) + metadata['authors'] = _clean_empty_and_duplicate_authors_from_grobid_parse(metadata['authors']) + + # parse bibliography entries (removes empty bib entries) + biblio_entries = parse_bibliography(soup) + bibkey_map = {normalize_grobid_id(bib['ref_id']): bib for bib in biblio_entries} + + # # process formulas and replace with text + # extract_formulas_from_tei_xml(soup) + + # extract figure and table captions + refkey_map = extract_figures_and_tables_from_tei_xml(soup) + + # get bracket style + is_bracket_style = check_if_citations_are_bracket_style(soup) + + # substitute all note tags with p tags + soup = sub_all_note_tags(soup) + + # process abstract if possible + abstract_entries = extract_abstract_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style) + + # process body text + body_entries = extract_body_text_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style) + + # parse back matter (acks, author statements, competing interests, abbrevs etc) + back_matter = extract_back_matter_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style) + + # form final paper entry + return Paper(paper_id=paper_id, + pdf_hash=pdf_hash, + metadata=metadata, + abstract=abstract_entries, + body_text=body_entries, + back_matter=back_matter, + bib_entries=bibkey_map, + ref_entries=refkey_map) + + +def convert_tei_xml_file_to_s2orc_json(tei_file: str, pdf_hash: 
str = "") -> Paper: + """ + Convert a TEI XML file to S2ORC JSON + :param tei_file: + :param pdf_hash: + :return: + """ + if not os.path.exists(tei_file): + raise FileNotFoundError("Input TEI XML file doesn't exist") + paper_id = tei_file.split('/')[-1].split('.')[0] + soup = BeautifulSoup(open(tei_file, "rb").read(), "xml") + paper = convert_tei_xml_soup_to_s2orc_json(soup, paper_id, pdf_hash) + return paper diff --git a/llm-guided-pdf-parsing/pdf-parsing/doc2json/utils/citation_util.py b/llm-guided-pdf-parsing/pdf-parsing/doc2json/utils/citation_util.py new file mode 100644 index 00000000..ccead35f --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/doc2json/utils/citation_util.py @@ -0,0 +1,75 @@ +# utility functions for handling failure situations with grobid-detected citation spans + +import re +from typing import Dict, List, Tuple + +BRACKET_REGEX = re.compile(r'\[[1-9]\d{0,2}([,;\-\s]+[1-9]\d{0,2})*;?\]') +BRACKET_STYLE_THRESHOLD = 5 + +SINGLE_BRACKET_REGEX = re.compile(r'\[([1-9]\d{0,2})\]') +EXPANSION_CHARS = {'-', '–'} + + +def span_already_added(sub_start: int, sub_end: int, span_indices: List[Tuple[int, int]]) -> bool: + """ + Check if span is a subspan of existing span + :param sub_start: + :param sub_end: + :param span_indices: + :return: + """ + for span_start, span_end in span_indices: + if sub_start >= span_start and sub_end <= span_end: + return True + return False + + +def is_expansion_string(between_string: str) -> bool: + """ + Check if the string between two refs is an expansion string + :param between_string: + :return: + """ + if len(between_string) <= 2 \ + and any([c in EXPANSION_CHARS for c in between_string]) \ + and all([c in EXPANSION_CHARS.union({' '}) for c in between_string]): + return True + return False + + +# TODO: still cases like `09bcee03baceb509d4fcf736fa1322cb8adf507f` w/ dups like ['L Jung', 'R Hessler', 'Louis Jung', 'Roland Hessler'] +# example paper that has empties & duplicates: `09bce26cc7e825e15a4469e3e78b7a54898bb97f` +def _clean_empty_and_duplicate_authors_from_grobid_parse(authors: List[Dict]) -> List[Dict]: + """ + Within affiliation, `location` is a dict with fields , , , , etc. + Too much hassle, so just take the first one that's not empty. 
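+ Empty author entries (no first, middle, or last name) are stripped first, then duplicates
+ are merged on the (first, last, middle, suffix) key: the first occurrence keeps its
+ position, and later duplicates only contribute a non-empty email or affiliation.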
+ """ + # stripping empties + clean_authors_list = [] + for author in authors: + clean_first = author['first'].strip() + clean_last = author['last'].strip() + clean_middle = [m.strip() for m in author['middle']] + clean_suffix = author['suffix'].strip() + if clean_first or clean_last or clean_middle: + author['first'] = clean_first + author['last'] = clean_last + author['middle'] = clean_middle + author['suffix'] = clean_suffix + clean_authors_list.append(author) + # combining duplicates (preserve first occurrence of author name as position) + key_to_author_blobs = {} + ordered_keys_by_author_pos = [] + for author in clean_authors_list: + key = (author['first'], author['last'], ' '.join(author['middle']), author['suffix']) + if key not in key_to_author_blobs: + key_to_author_blobs[key] = author + ordered_keys_by_author_pos.append(key) + else: + if author['email']: + key_to_author_blobs[key]['email'] = author['email'] + if author['affiliation'] and (author['affiliation']['institution'] or author['affiliation']['laboratory'] or + author['affiliation']['location']): + key_to_author_blobs[key]['affiliation'] = author['affiliation'] + dedup_authors_list = [key_to_author_blobs[key] for key in ordered_keys_by_author_pos] + return dedup_authors_list diff --git a/llm-guided-pdf-parsing/pdf-parsing/doc2json/utils/grobid_util.py b/llm-guided-pdf-parsing/pdf-parsing/doc2json/utils/grobid_util.py new file mode 100644 index 00000000..6eecca09 --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/doc2json/utils/grobid_util.py @@ -0,0 +1,374 @@ +import re +from collections import defaultdict +from typing import Dict, List, Optional + +import bs4 +from bs4 import BeautifulSoup + +SUBSTITUTE_TAGS = {'persName', 'orgName', 'publicationStmt', 'titleStmt', 'biblScope'} + + +def clean_tags(el: bs4.element.Tag): + """ + Replace all tags with lowercase version + :param el: + :return: + """ + for sub_tag in SUBSTITUTE_TAGS: + for sub_el in el.find_all(sub_tag): + sub_el.name = sub_tag.lower() + + +def soup_from_path(file_path: str): + """ + Read XML file + :param file_path: + :return: + """ + return BeautifulSoup(open(file_path, "rb").read(), "xml") + + +def get_title_from_grobid_xml(raw_xml: BeautifulSoup) -> str: + """ + Returns title + :return: + """ + for title_entry in raw_xml.find_all("title"): + if title_entry.has_attr("level") \ + and title_entry["level"] == "a": + return title_entry.text + try: + return raw_xml.title.text + except AttributeError: + return "" + + +def get_author_names_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict[str, str]]: + """ + Returns a list of dictionaries, one for each author, + containing the first and last names. + + e.g. 
+ { + "first": first, + "middle": middle, + "last": last, + "suffix": suffix + } + """ + names = [] + + for author in raw_xml.find_all("author"): + if not author.persname: + continue + + # forenames include first and middle names + forenames = author.persname.find_all("forename") + + # surnames include last names + surnames = author.persname.find_all("surname") + + # name suffixes + suffixes = author.persname.find_all("suffix") + + first = "" + middle = [] + last = "" + suffix = "" + + for forename in forenames: + if forename["type"] == "first": + if not first: + first = forename.text + else: + middle.append(forename.text) + elif forename["type"] == "middle": + middle.append(forename.text) + + if len(surnames) > 1: + for surname in surnames[:-1]: + middle.append(surname.text) + last = surnames[-1].text + elif len(surnames) == 1: + last = surnames[0].text + + if len(suffix) >= 1: + suffix = " ".join([suffix.text for suffix in suffixes]) + + names_dict = {"first": first, "middle": middle, "last": last, "suffix": suffix} + + names.append(names_dict) + return names + + +def get_affiliation_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict: + """ + Get affiliation from grobid xml + :param raw_xml: + :return: + """ + location_dict = dict() + laboratory_name = "" + institution_name = "" + + if raw_xml and raw_xml.affiliation: + for child in raw_xml.affiliation: + if child.name == "orgname": + if child.has_attr("type"): + if child["type"] == "laboratory": + laboratory_name = child.text + elif child["type"] == "institution": + institution_name = child.text + elif child.name == "address": + for grandchild in child: + if grandchild.name and grandchild.text: + location_dict[grandchild.name] = grandchild.text + + if laboratory_name or institution_name: + return {"laboratory": laboratory_name, "institution": institution_name, "location": location_dict} + + return {} + + +def get_author_data_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict]: + """ + Returns a list of dictionaries, one for each author, + containing the first and last names. + + e.g. 
+ { + "first": first, + "middle": middle, + "last": last, + "suffix": suffix, + "affiliation": { + "laboratory": "", + "institution": "", + "location": "", + }, + "email": "" + } + """ + authors = [] + + for author in raw_xml.find_all("author"): + + first = "" + middle = [] + last = "" + suffix = "" + + if author.persname: + # forenames include first and middle names + forenames = author.persname.find_all("forename") + + # surnames include last names + surnames = author.persname.find_all("surname") + + # name suffixes + suffixes = author.persname.find_all("suffix") + + for forename in forenames: + if forename.has_attr("type"): + if forename["type"] == "first": + if not first: + first = forename.text + else: + middle.append(forename.text) + elif forename["type"] == "middle": + middle.append(forename.text) + + if len(surnames) > 1: + for surname in surnames[:-1]: + middle.append(surname.text) + last = surnames[-1].text + elif len(surnames) == 1: + last = surnames[0].text + + if len(suffix) >= 1: + suffix = " ".join([suffix.text for suffix in suffixes]) + + affiliation = get_affiliation_from_grobid_xml(author) + + email = "" + if author.email: + email = author.email.text + + author_dict = { + "first": first, + "middle": middle, + "last": last, + "suffix": suffix, + "affiliation": affiliation, + "email": email + } + + authors.append(author_dict) + + return authors + + +def get_year_from_grobid_xml(raw_xml: BeautifulSoup) -> Optional[int]: + """ + Returns date published if exists + :return: + """ + if raw_xml.date and raw_xml.date.has_attr("when"): + # match year in date text (which is in some unspecified date format) + year_match = re.match(r"((19|20)\d{2})", raw_xml.date["when"]) + if year_match: + year = year_match.group(0) + if year and year.isnumeric() and len(year) == 4: + return int(year) + return None + + +def get_venue_from_grobid_xml(raw_xml: BeautifulSoup, title_text: str) -> str: + """ + Returns venue/journal/publisher of bib entry + Grobid ref documentation: https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/ + level="j": journal title + level="m": "non journal bibliographical item holding the cited article" + level="s": series title + :return: + """ + title_names = [] + keep_types = ["j", "m", "s"] + # get all titles of the anove types + for title_entry in raw_xml.find_all("title"): + if title_entry.has_attr("level") \ + and title_entry["level"] in keep_types \ + and title_entry.text != title_text: + title_names.append((title_entry["level"], title_entry.text)) + # return the title name that most likely belongs to the journal or publication venue + if title_names: + title_names.sort(key=lambda x: keep_types.index(x[0])) + return title_names[0][1] + return "" + + +def get_volume_from_grobid_xml(raw_xml: BeautifulSoup) -> str: + """ + Returns the volume number of grobid bib entry + Grobid + :return: + """ + for bibl_entry in raw_xml.find_all("biblscope"): + if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "volume": + return bibl_entry.text + return "" + + +def get_issue_from_grobid_xml(raw_xml: BeautifulSoup) -> str: + """ + Returns the issue number of grobid bib entry + Grobid + :return: + """ + for bibl_entry in raw_xml.find_all("biblscope"): + if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "issue": + return bibl_entry.text + return "" + + +def get_pages_from_grobid_xml(raw_xml: BeautifulSoup) -> str: + """ + Returns the page numbers of grobid bib entry + Grobid + :return: + """ + for bibl_entry in raw_xml.find_all("biblscope"): + if 
bibl_entry.has_attr("unit") and bibl_entry["unit"] == "page" and bibl_entry.has_attr("from"): + from_page = bibl_entry["from"] + if bibl_entry.has_attr("to"): + to_page = bibl_entry["to"] + return f'{from_page}--{to_page}' + else: + return from_page + return "" + + +def get_other_ids_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict[str, List]: + """ + Returns a dictionary of other identifiers from grobid bib entry (arxiv, pubmed, doi) + :param raw_xml: + :return: + """ + other_ids = defaultdict(list) + + for idno_entry in raw_xml.find_all("idno"): + if idno_entry.has_attr("type") and idno_entry.text: + other_ids[idno_entry["type"]].append(idno_entry.text) + + return other_ids + + +def get_raw_bib_text_from_grobid_xml(raw_xml: BeautifulSoup) -> str: + """ + Returns the raw bibiliography string + :param raw_xml: + :return: + """ + for note in raw_xml.find_all("note"): + if note.has_attr("type") and note["type"] == "raw_reference": + return note.text + return "" + + +def get_publication_datetime_from_grobid_xml(raw_xml: BeautifulSoup) -> str: + """ + Finds and returns the publication datetime if it exists + :param raw_xml: + :return: + """ + submission_note = raw_xml.find('note', {'type': 'submission'}) + + if submission_note: + text = submission_note.get_text() + if 'Accepted:' in text: + accepted_date = text.split('Accepted:')[1].strip() + return accepted_date + + return "" + + +def parse_bib_entry(bib_entry: BeautifulSoup) -> Dict: + """ + Parse one bib entry + :param bib_entry: + :return: + """ + clean_tags(bib_entry) + title = get_title_from_grobid_xml(bib_entry) + return { + 'ref_id': bib_entry.attrs.get("xml:id", None), + 'title': title, + 'authors': get_author_names_from_grobid_xml(bib_entry), + 'year': get_year_from_grobid_xml(bib_entry), + 'venue': get_venue_from_grobid_xml(bib_entry, title), + 'volume': get_volume_from_grobid_xml(bib_entry), + 'issue': get_issue_from_grobid_xml(bib_entry), + 'pages': get_pages_from_grobid_xml(bib_entry), + 'other_ids': get_other_ids_from_grobid_xml(bib_entry), + 'raw_text': get_raw_bib_text_from_grobid_xml(bib_entry), + 'urls': [] + } + + +def is_reference_tag(tag: bs4.element.Tag) -> bool: + return tag.name == "ref" and tag.attrs.get("type", "") == "bibr" + + +def extract_paper_metadata_from_grobid_xml(tag: bs4.element.Tag) -> Dict: + """ + Extract paper metadata (title, authors, affiliation, year) from grobid xml + :param tag: + :return: + """ + clean_tags(tag) + paper_metadata = { + "title": tag.titlestmt.title.text, + "authors": get_author_data_from_grobid_xml(tag), + "year": get_publication_datetime_from_grobid_xml(tag) + } + return paper_metadata diff --git a/llm-guided-pdf-parsing/pdf-parsing/doc2json/utils/refspan_util.py b/llm-guided-pdf-parsing/pdf-parsing/doc2json/utils/refspan_util.py new file mode 100644 index 00000000..98a0c75c --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/doc2json/utils/refspan_util.py @@ -0,0 +1,109 @@ +from typing import List, Tuple + + +def replace_refspans(spans_to_replace: List[Tuple[int, int, str, str]], + full_string: str, + pre_padding: str = "", + post_padding: str = "", + btwn_padding: str = ", ") -> str: + """ + For each span within the full string, replace that span with new text + :param spans_to_replace: list of tuples of form (start_ind, end_ind, span_text, new_substring) + :param full_string: + :param pre_padding: + :param post_padding: + :param btwn_padding: + :return: + """ + # assert all spans are equal to full_text span + assert all([full_string[start:end] == span for start, end, span, _ 
in spans_to_replace]) + + # assert none of the spans start with the same start ind + start_inds = [rep[0] for rep in spans_to_replace] + assert len(set(start_inds)) == len(start_inds) + + # sort by start index + spans_to_replace.sort(key=lambda x: x[0]) + + # form strings for each span group + for i, entry in enumerate(spans_to_replace): + start, end, span, new_string = entry + + # skip empties + if end <= 0: + continue + + # compute shift amount + shift_amount = len(new_string) - len(span) + len(pre_padding) + len(post_padding) + + # shift remaining appropriately + for ind in range(i + 1, len(spans_to_replace)): + next_start, next_end, next_span, next_string = spans_to_replace[ind] + # skip empties + if next_end <= 0: + continue + # if overlap between ref span and current ref span, remove from replacement + if next_start < end: + next_start = 0 + next_end = 0 + next_string = "" + # if ref span abuts previous reference span + elif next_start == end: + next_start += shift_amount + next_end += shift_amount + next_string = btwn_padding + pre_padding + next_string + post_padding + # if ref span starts after, shift starts and ends + elif next_start > end: + next_start += shift_amount + next_end += shift_amount + next_string = pre_padding + next_string + post_padding + # save adjusted span + spans_to_replace[ind] = (next_start, next_end, next_span, next_string) + + spans_to_replace = [entry for entry in spans_to_replace if entry[1] > 0] + spans_to_replace.sort(key=lambda x: x[0]) + + # apply shifts in series + for start, end, span, new_string in spans_to_replace: + assert full_string[start:end] == span + full_string = full_string[:start] + new_string + full_string[end:] + + return full_string + + +def sub_spans_and_update_indices(spans_to_replace: List[Tuple[int, int, str, str]], + full_string: str) -> Tuple[str, List]: + """ + Replace all spans and recompute indices + :param spans_to_replace: + :param full_string: + :return: + """ + # TODO: check no spans overlapping + # TODO: check all spans well-formed + + # assert all spans are equal to full_text span + assert all([full_string[start:end] == token for start, end, token, _ in spans_to_replace]) + + # assert none of the spans start with the same start ind + start_inds = [rep[0] for rep in spans_to_replace] + assert len(set(start_inds)) == len(start_inds) + + # sort by start index + spans_to_replace.sort(key=lambda x: x[0]) + + # compute offsets for each span + new_spans = [[start, end, token, surface, 0] for start, end, token, surface in spans_to_replace] + for i, entry in enumerate(spans_to_replace): + start, end, token, surface = entry + new_end = start + len(surface) + offset = new_end - end + new_spans[i][1] += offset + for new_span_entry in new_spans[i + 1:]: + new_span_entry[4] += offset + + # generate new text and create final spans + new_text = replace_refspans(spans_to_replace, full_string, btwn_padding="") + new_spans = [(start + offset, end + offset, token, surface) for start, end, token, surface, offset in new_spans] + + return new_text, new_spans diff --git a/llm-guided-pdf-parsing/pdf-parsing/embedding.py b/llm-guided-pdf-parsing/pdf-parsing/embedding.py new file mode 100644 index 00000000..aa514426 --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/embedding.py @@ -0,0 +1,35 @@ +import json + +import requests +from retry import retry +import os +from dotenv import load_dotenv + +load_dotenv(override=True) + + +@retry(tries=10, delay=.25) +def get_embeddings(prompt, model="nomic-embed-text:v1.5", 
base_url=os.environ['EMBEDDING_BASE_URL']): + + payload = {"model": model, "prompt": prompt, "options": {"num_ctx": 8192}} + + headers = { + 'Content-Type': 'application/json', + } + + response = requests.post(base_url, data=json.dumps(payload), headers=headers) + + if response.status_code == 200: + embedding_json = response.json() + if "embedding" in embedding_json: + embeddings = embedding_json["embedding"] + if isinstance(embeddings, list): + return embeddings + else: + print("Error: Embeddings is not a list") + else: + print("Error: 'embeddings' key not found in response") + else: + print(f"Embedding error: {response.status_code}, {response.text}") + + return None diff --git a/llm-guided-pdf-parsing/pdf-parsing/main_pubmed_ingest.py b/llm-guided-pdf-parsing/pdf-parsing/main_pubmed_ingest.py new file mode 100644 index 00000000..72270380 --- /dev/null +++ b/llm-guided-pdf-parsing/pdf-parsing/main_pubmed_ingest.py @@ -0,0 +1,304 @@ +import os +import sqlite3 +import tempfile +import time +import traceback +from concurrent.futures import ProcessPoolExecutor, as_completed +from multiprocessing import Manager, Process, Queue +from pathlib import Path + +import sentry_sdk +import urllib3 +from doc2json.grobid_client import GrobidClient +from dotenv import load_dotenv +from pdf_process import parse_and_group_by_section, process_pdf_file +from posthog import Posthog +from qdrant import create_qdrant +from qdrant_client import QdrantClient, models +from SQLite import initialize_database, insert_data +from urllib3 import PoolManager +from urllib3.util.retry import Retry + +from minio import Minio # type: ignore + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +load_dotenv(override=True) + +posthog = Posthog(sync_mode=False, + project_api_key=os.environ['LLM_GUIDED_RETRIEVAL_POSTHOG_API_KEY'], + host='https://us.i.posthog.com') + +# Create a custom PoolManager with desired settings +http_client = PoolManager( + timeout=300, # 5 minutes timeout + maxsize=200, # Increased pool size + retries=Retry(total=10, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])) + +client = Minio( + os.environ['MINIO_API_ENDPOINT'], + access_key=os.environ['MINIO_ACCESS_KEY'], + secret_key=os.environ['MINIO_SECRET_KEY'], + secure=False, + http_client=http_client, +) +sentry_sdk.init( + dsn=os.environ['SENTRY_DSN'], + traces_sample_rate=1.0, + profiles_sample_rate=1.0, +) + +LOG_FILE = os.environ['SUCCESS_LOG_FILE'] +ERR_LOG_FILE = os.environ['ERR_LOG_FILE'] +DB_PATH = os.environ['DB_PATH'] + +grobid_server = os.environ['GROBID_SERVER'] +BASE_TEMP_DIR = 'temp' +BASE_OUTPUT_DIR = 'output' +BUCKET_NAME = 'pubmed' + +NUM_PARALLEL = 30 + +# Configure Grobid +grobid_config = { + "grobid_server": grobid_server, + "batch_size": 2000, + "sleep_time": 3, + "generateIDs": False, + "consolidate_header": False, + "consolidate_citations": False, + # "include_raw_citations": True, + "include_raw_citations": False, + "include_raw_affiliations": False, + "timeout": 600, + "n": NUM_PARALLEL, + "max_workers": NUM_PARALLEL, +} +grobidClient = GrobidClient(grobid_config) + +qdrant_client = QdrantClient(url=os.environ['QDRANT_URL'], port=os.environ['QDRANT_PORT'], https=True, api_key=os.environ['QDRANT_API_KEY']) + + +def main_parallel_upload(): + initialize_database(DB_PATH) + create_qdrant(qdrant_client) + os.makedirs(BASE_TEMP_DIR, exist_ok=True) + os.makedirs(BASE_OUTPUT_DIR, exist_ok=True) + start_time_main_parallel = time.monotonic() + + num_processed_this_run = 0 + + manager = Manager() + queue = 
manager.Queue() + db_proc = Process(target=db_worker, args=(queue, DB_PATH, qdrant_client)) + db_proc.start() + + # process to monitor queue size + # queue_monitor_proc = Process(target=queue_size_monitor, args=(queue,)) + # queue_monitor_proc.start() + + # qdb_proc = Process(target=qdb_worker, args=(qdrant_client, DB_PATH)) + # qdb_proc.start() + + with ProcessPoolExecutor(max_workers=NUM_PARALLEL) as executor: + batch_size = 2_000 + minio_gen = minio_object_generator(client, BUCKET_NAME) + + while True: + futures = {} + for _ in range(batch_size): + try: + obj = next(minio_gen) + futures[executor.submit(upload_single_pdf, obj.object_name, queue)] = obj + except StopIteration: + break + + if not futures: + break + + for future in as_completed(futures): + obj = futures[future] + try: + # MAIN / ONLY SUCCESS CASE + future.result() + num_processed_this_run += 1 + # print(f"✅ num processed this run: {num_processed_this_run}") + # print_futures_stats(futures) + # print(f"(while completing jobs) Current queue size: {queue.qsize()}") + if num_processed_this_run % 100 == 0: + print( + f"1️⃣ Num processed this run: {num_processed_this_run}. ⏰ Runtime: {(time.monotonic() - start_time_main_parallel):.2f} seconds" + ) + + posthog.capture('llm-guided-ingest', + event='success_ingest_running_total', + properties={ + 'pdf-per-sec-running-total': + float(num_processed_this_run / (time.monotonic() - start_time_main_parallel)), + 'minio_path': + f'{BUCKET_NAME}/{obj.object_name}' + }) + print(f"🏎️ PDF/sec (running total): {float(num_processed_this_run / (time.monotonic() - start_time_main_parallel)):.2f}") + + except Exception as e: + # MAIN / ONLY FAILURE CASE + with open(ERR_LOG_FILE, 'a') as f: + f.write(f"{obj.object_name} --- {str(e)}\n") + posthog.capture('llm-guided-ingest', + event='failed_ingest', + properties={ + 'db_path': DB_PATH, + 'minio_path': f'{BUCKET_NAME}/{obj.object_name}' + }) + print(f"Error processing {obj.object_name}: {str(e)}") + traceback.print_exc() + + futures.clear() + + queue.put(None) + db_proc.join() + # queue_monitor_proc.terminate() + # qdb_proc.join() + + +def upload_single_pdf(minio_object_name, queue): + """ + This is the fundamental unit of parallelism: upload a single PDF to SQLite, all or nothing. + """ + start_time_minio = time.monotonic() + response = client.get_object(BUCKET_NAME, minio_object_name) + file_content = response.read() + response.close() + response.release_conn() + # print(f"⏰ Minio download: {(time.monotonic() - start_time_minio):.2f} seconds") + posthog.capture('llm-guided-ingest', + event='minio_download', + properties={ + 'runtime_sec': float(f"{(time.monotonic() - start_time_minio):.2f}"), + 'minio_path': f'{BUCKET_NAME}/{minio_object_name}', + 'grobid_using_GPU': False, + }) + + with tempfile.NamedTemporaryFile() as tmp_file: + tmp_file.write(file_content) + tmp_file.flush() + + output_data = process_pdf_file(Path(tmp_file.name), BASE_TEMP_DIR, BASE_OUTPUT_DIR, + f"{BUCKET_NAME}/{minio_object_name}", grobidClient) + metadata, grouped_data, total_tokens, references, ref_num_tokens = parse_and_group_by_section(output_data) + + queue.put({ + 'metadata': metadata, + 'total_tokens': total_tokens, + 'grouped_data': grouped_data, + 'references': references, + 'db_path': DB_PATH, + 'file_name': minio_object_name, + 'ref_num_tokens': ref_num_tokens, + 'minio_path': f'{BUCKET_NAME}/{minio_object_name}' + }) + + runtime = round(time.monotonic() - start_time_minio, 2) + print(f"⭐️ Total ingest runtime: {runtime} seconds. 
Tokens per sec: {(int(total_tokens)/runtime):.2f}") + posthog.capture('llm-guided-ingest', + event='success_ingest_v2', + properties={ + 'metadata': metadata, + 'runtime_sec': float(f"{(time.monotonic() - start_time_minio):.2f}"), + 'total_tokens': int(total_tokens), + 'db_path': DB_PATH, + 'minio_path': f'{BUCKET_NAME}/{minio_object_name}' + }) + + +# Start +def load_processed_files(log_file): + if os.path.exists(log_file): + with open(log_file, 'r') as f: + return set(line.strip() for line in f) + return set() + +def load_error_files(err_log_file): + if os.path.exists(err_log_file): + with open(err_log_file, 'r') as f: + return set(line.split(' --- ')[0].strip() for line in f if ' --- ' in line) + return set() + + + +def save_processed_file(log_file, file_path): + with open(log_file, 'a') as f: + f.write(f"{file_path}\n") + + +def db_worker(queue, db_path, client): + conn = sqlite3.connect(db_path, timeout=30) + while True: + data = queue.get() + if data is None: + break + try: + insert_data(data['metadata'], data['total_tokens'], data['grouped_data'], data['db_path'], data['references'], + data['ref_num_tokens'], data['minio_path'], client) + + save_processed_file(LOG_FILE, data['file_name']) + conn.commit() + except Exception as e: + with open(ERR_LOG_FILE, 'a') as f: + f.write(f"db_worker: {data['file_name']}: {str(e)}\n") + conn.close() + + +# def minio_object_generator(client, bucket_name): +# processed_files = load_processed_files(LOG_FILE) +# error_files = load_error_files(ERR_LOG_FILE) +# all_skipped_files = processed_files.union(error_files) +# for obj in client.list_objects(bucket_name, recursive=True): +# if not obj.object_name.endswith('.pdf'): +# continue +# if obj.object_name not in all_skipped_files: +# yield obj + +def minio_object_generator(client, bucket_name): + processed_files = load_processed_files(LOG_FILE) + error_files = load_error_files(ERR_LOG_FILE) + all_skipped_files = processed_files.union(error_files) + + with open('all_pubmed_filenames.txt', 'r') as f: + for line in f: + object_name = line.strip() + if object_name not in all_skipped_files: + if not object_name.endswith('.pdf'): + continue + # Create a simple object-like structure to maintain compatibility + class MinioObj: + def __init__(self, name): + self.object_name = name + + yield MinioObj(object_name) + +# def queue_size_monitor(queue): +# while True: +# print(f"Current queue size: {queue.qsize()}") +# time.sleep(10) + + +def print_futures_stats(futures): + running_count = 0 + none_count = 0 + error_count = 0 + for f in futures: + if f.running(): + running_count += 1 + elif f.done(): + if f.result() is None: + none_count += 1 + elif f.exception() is not None: + error_count += 1 + print(f"Batch progress. 
diff --git a/llm-guided-pdf-parsing/pdf-parsing/pdf_process.py b/llm-guided-pdf-parsing/pdf-parsing/pdf_process.py
new file mode 100644
index 00000000..43e6b8de
--- /dev/null
+++ b/llm-guided-pdf-parsing/pdf-parsing/pdf_process.py
@@ -0,0 +1,241 @@
+import math
+import os
+import tempfile
+import time
+import traceback
+from typing import Any, Dict
+
+import sentry_sdk
+import tiktoken
+from doc2json.tei_to_json import convert_tei_xml_file_to_s2orc_json
+from dotenv import load_dotenv
+from embedding import get_embeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter  # type: ignore
+from posthog import Posthog
+
+BASE_TEMP_DIR = 'temp'
+BASE_OUTPUT_DIR = 'output'
+
+load_dotenv(override=True)
+
+posthog = Posthog(sync_mode=True,
+                  project_api_key=os.environ['LLM_GUIDED_RETRIEVAL_POSTHOG_API_KEY'],
+                  host='https://us.i.posthog.com')
+
+sentry_sdk.init(
+    dsn=os.environ['SENTRY_DSN'],
+    traces_sample_rate=1.0,
+    profiles_sample_rate=1.0,
+)
+
+
+def process_pdf_file(
+    input_file: os.PathLike,
+    temp_dir: str = BASE_TEMP_DIR,
+    output_dir: str = BASE_OUTPUT_DIR,
+    minio_path: str = '',
+    grobidClient: Any = None,
+) -> Dict | None:
+  """
+  Process a PDF with Grobid and convert the resulting TEI XML to an s2orc-style JSON dict.
+  :param input_file: path to the PDF on local disk
+  :param temp_dir: directory for the intermediate .tei.xml file
+  :param output_dir: directory for the temporary .json file
+  :param minio_path: bucket/object name, used only for telemetry
+  :param grobidClient: a configured Grobid client exposing process_pdf()
+  :return: the s2orc JSON dict produced by doc2json
+  """
+
+  tei_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False, suffix='.tei.xml')
+  output_file = tempfile.NamedTemporaryFile(dir=output_dir, delete=False, suffix='.json')
+
+  try:
+    start_time = time.monotonic()
+    grobidClient.process_pdf(str(input_file), tei_file.name, "processFulltextDocument")
+    # print(f"📜 Grobid Runtime: {(time.monotonic() - start_time):.2f} seconds")
+    posthog.capture('llm-guided-ingest',
+                    event='grobid_runtime_v2',
+                    properties={
+                        'runtime_sec': float(f"{(time.monotonic() - start_time):.2f}"),
+                        'minio_file': f'{str(minio_path)}',
+                        'grobid_using_GPU': True,
+                    })
+
+    assert os.path.exists(tei_file.name)
+
+    paper = convert_tei_xml_file_to_s2orc_json(tei_file.name)
+    output_data = paper.release_json()
+    return output_data
+  except Exception as e:
+    print(f"Error in process_pdf_file(): {str(e)}")
+    traceback.print_exc()
+    raise ValueError(f"Error in process_pdf_file(): {str(e)}")
+  finally:
+    if tei_file.name and os.path.exists(tei_file.name):
+      os.remove(tei_file.name)
+    if os.path.exists(output_file.name):
+      os.remove(output_file.name)
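+
+# Usage sketch (illustrative only): given a doc2json-compatible Grobid client pointed at a
+# running Grobid server, and 'paper.pdf' as a placeholder path:
+#
+#   from pathlib import Path
+#   output_data = process_pdf_file(Path('paper.pdf'), minio_path='bucket/paper.pdf',
+#                                  grobidClient=my_grobid_client)  # my_grobid_client is assumed
+#   # output_data is the s2orc-style dict that parse_and_group_by_section() below consumes.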
+
+
+def parse_and_group_by_section(data) -> Any:
+  """
+  Parses the output of AllenAI's Grobid wrapper. https://github.com/allenai/s2orc-doc2json
+
+  Returns a 5-tuple:
+  * metadata: dict with title, authors, journal, year, abstract, identifiers, etc.
+  * result: list of per-section dicts (full text grouped by section, plus chunk texts, chunk token counts and embeddings).
+  * total_tokens: total token count across all sections.
+  * references: dict of formatted bibliography entries, keyed by bib id.
+  * ref_num_tokens: dict of token counts per reference, keyed by bib id.
+  """
+  grouped_texts = {}
+  grouped_titles = {}
+  grouped_pages = {}
+  all_sections = {}
+  references = {}
+  ref_num_tokens = {}
+  failed_secs = 0
+
+  # TODO: I think authors and title are maybe not being handled perfectly when data is missing...
+  metadata = {
+      "title":
+          data["title"] if data and "title" in data else None,
+      "authors":
+          None if not data or not data.get("authors") else [
+              " ".join(filter(None, [author.get("first"), " ".join(author.get("middle", [])),
+                                     author.get("last")])) for author in data.get("authors", [])
+          ],
+      "date_published":
+          data["year"] if data and "year" in data else None,
+      "journal":
+          data["venue"] if data and "venue" in data else None,
+      "paper_id":
+          data["paper_id"] if data and "paper_id" in data else None,
+      "header":
+          data["header"] if data and "header" in data else None,
+      "year":
+          data["year"] if data and "year" in data else None,
+      "identifiers":
+          data["identifiers"] if data and "identifiers" in data else None,
+      "abstract":
+          data["abstract"] if data and "abstract" in data else None,
+  }
+
+  try:
+    encoding = tiktoken.get_encoding("cl100k_base")
+
+    for key in data["pdf_parse"]["bib_entries"]:
+      references[key] = format_reference(data["pdf_parse"]["bib_entries"][key])
+      ref_num_tokens[key] = len(encoding.encode(references[key]))
+
+    if metadata['abstract']:
+      grouped_texts['0'] = metadata['abstract']
+      grouped_titles['0'] = 'Abstract'
+
+    null_sec = 1
+    cur_sec = null_sec
+    # for entry in data["pdf_parse"]["body_text"]:
+    #   print(f"Entry: {entry}")
+
+    for entry in data["pdf_parse"]["body_text"]:
+      text = entry["text"]
+      section = entry["section"]
+      sec_num = entry["sec_num"]
+      # page_num = entry["page_num"]
+      try:
+        major_sec_num = math.floor(float(sec_num))
+      except Exception:
+        major_sec_num = -1
+        failed_secs += 1
+        # print(f"Failed: {failed_secs}", json.dumps(entry, indent=2))
+
+      if not sec_num and section not in grouped_titles.values():
+        sec_num = str(null_sec)
+        cur_sec = str(null_sec)
+        null_sec += 1
+
+      if sec_num:
+        if sec_num in grouped_texts:
+          grouped_texts[sec_num] += " " + text
+          # grouped_pages[sec_num].append(page_num)
+        else:
+          grouped_texts[sec_num] = text
+          grouped_titles[sec_num] = section
+          # grouped_pages[sec_num] = [page_num]
+      else:
+        grouped_texts[cur_sec] += " " + text
+
+      if sec_num:
+        all_sections[sec_num] = section
+
+    if not all_sections:
+      num = 1
+      for i in data["pdf_parse"]["body_text"]:
+        grouped_texts[str(num)] = i["text"]
+        num += 1
+
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        model_name="gpt-4",
+        chunk_size=7000,
+        chunk_overlap=400,
+    )
+
+    valid_keys = [key for key in grouped_texts if key is not None]
+
+    result = []
+
+    for key in sorted(valid_keys):
+      section_text = grouped_texts[key]
+      section_title = grouped_titles.get(key, None)
+      # section_page = grouped_pages.get(key, "Unknown")
+      tokens = len(encoding.encode(section_text))
+
+      result.append({
+          "text": section_text,
+          "sec_num": key,
+          "sec_title": section_title,
+          "tokens": tokens,
+          # "page_num": section_page,
+          "chunk_text": [],
+          "chunk_tokens": [],
+          "embedding": []
+      })
+
+      if tokens > 7000:
+        chunks = text_splitter.split_text(section_text)
+        for i, chunk in enumerate(chunks):
+          chunk_tokens = len(encoding.encode(chunk))
+          result[-1]["chunk_text"].append(chunk)
+          result[-1]["chunk_tokens"].append(chunk_tokens)
+      else:
+        result[-1]["chunk_text"].append(section_text)
+        result[-1]["chunk_tokens"].append(tokens)
+
+      # print("start embedding")
+      for i, chunk in enumerate(result[-1]["chunk_text"]):
+        context_with_title = f'Section {result[-1]["sec_num"]}: {result[-1]["sec_title"]}\n{chunk}'
+        embedding = get_embeddings(context_with_title)
+        result[-1]["embedding"].append(embedding)
+
+    total_tokens = sum([entry["tokens"] for entry in result])
+    return metadata, result, total_tokens, references, ref_num_tokens
+
+  except Exception as e:
+    sentry_sdk.capture_exception(e)
+    raise ValueError(f"Failed parse_and_group_by_section() with error: {e}")
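+
+# Illustrative shape of one entry in the `result` list returned above (values are examples):
+#   {
+#     "text": "...full text of the section...",
+#     "sec_num": "2",
+#     "sec_title": "Methods",
+#     "tokens": 1234,
+#     "chunk_text": ["...chunk of at most 7000 tokens..."],
+#     "chunk_tokens": [1234],
+#     "embedding": [[0.012, -0.034, ...]]  # one vector per chunk, from get_embeddings()
+#   }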
+
+
+def format_reference(reference):
+  # Extract individual fields from the reference
+  # ref_id = reference['ref_id']
+  # title = reference.get('title', '')
+  # authors = reference.get('authors', [])
+  # year = reference.get('year', '')
+  # venue = reference.get('venue', '')
+  # volume = reference.get('volume', '')
+  # issue = reference.get('issue', '')
+  # pages = reference.get('pages', '')
+  # other_ids = reference.get('other_ids', {})
+  # num = reference.get('num', '')
+  urls = reference.get('urls', [])
+  raw_text = reference.get('raw_text', '')
+  links = reference.get('links', '')
+
+  combined_text = f"{raw_text}. {urls}. {links}"
+
+  return combined_text
diff --git a/llm-guided-pdf-parsing/pdf-parsing/qdrant.py b/llm-guided-pdf-parsing/pdf-parsing/qdrant.py
new file mode 100644
index 00000000..2143e632
--- /dev/null
+++ b/llm-guided-pdf-parsing/pdf-parsing/qdrant.py
@@ -0,0 +1,30 @@
+import json
+import os
+import sqlite3
+from uuid import uuid4
+
+from qdrant_client import QdrantClient, models
+
+db_path = '/home/guest/ai-ta-backend/UIUC_Chat/pdf-parsing/v2-articles.db'
+client = QdrantClient(url=os.environ['QDRANT_URL'],
+                      port=int(os.environ['QDRANT_PORT']),
+                      https=True,
+                      api_key=os.environ['QDRANT_API_KEY'])
+
+
+def create_qdrant(client):
+  # try:
+  #   client.delete_collection(collection_name="embedding")
+  #   print("Collection 'embedding' deleted successfully.")
+  # except Exception as e:
+  #   print(f"Error deleting collection: {e}")
+
+  try:
+    collection_info = client.get_collection(collection_name="embedding")
+    print("Collection 'embedding' already exists.")
+  except Exception as e:
+    print("Collection does not exist, creating a new one.")
+    client.create_collection(
+        collection_name="embedding",
+        vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
+    )
diff --git a/science.db b/science.db
new file mode 100644
index 00000000..05b83bb0
Binary files /dev/null and b/science.db differ
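Note that qdrant.py above only creates the "embedding" collection; the actual write path lives in insert_data(), which is not part of this diff. As a rough sketch of how the per-chunk embeddings returned by parse_and_group_by_section() could be upserted into that collection (collection name and 768-dim COSINE config taken from create_qdrant(); the payload fields are an assumption, not from the diff):

    from uuid import uuid4
    from qdrant_client import QdrantClient, models

    def upsert_section_embeddings(client: QdrantClient, minio_path: str, result: list) -> None:
      # One point per chunk; vectors must match the 768-dim COSINE config used in create_qdrant().
      points = []
      for section in result:
        for chunk, vector in zip(section["chunk_text"], section["embedding"]):
          points.append(
              models.PointStruct(
                  id=str(uuid4()),
                  vector=vector,
                  payload={
                      "minio_path": minio_path,  # assumed payload schema
                      "sec_num": section["sec_num"],
                      "sec_title": section["sec_title"],
                      "text": chunk,
                  },
              ))
      if points:
        client.upsert(collection_name="embedding", points=points)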