LLM PDF Parsing + LLM Guided Retrieval #255

Draft: wants to merge 45 commits into base: main

Commits (45)
17ccc4b
Working PDF parsing using NCSA.ai backend
KastanDay Apr 18, 2024
f97a431
Working PDF parsing using NCSA.ai backend (cleanup)
KastanDay Apr 18, 2024
3b75665
Add generic SQLite insert function
KastanDay Apr 19, 2024
d53607c
Dynamically Add column to table if not exists.
KastanDay Apr 19, 2024
c172def
Dynamically Add column to table if not exists. fully working. Create …
KastanDay Apr 19, 2024
b72ca03
Use AllenAI's Grobid wrapper, parse results into pristine database en…
KastanDay Apr 21, 2024
37a45d5
Working Grobid PDF parsing and upload to SQLite, all unique section n…
KastanDay Apr 22, 2024
51ff3ab
Fast, parallel, Grobid parsing, all in Python. Need to decide how to …
KastanDay Apr 22, 2024
1d73073
LLM Parsing
jackhaohuang Jun 5, 2024
52b3aa4
implemented embedding
jackhaohuang Jun 7, 2024
c36481a
implemented parallel
jackhaohuang Jun 11, 2024
3e073dd
cleaning sqlite functions
jackhaohuang Jun 12, 2024
b47887c
Removed submodule UIUC_Chat/allenai/s2orc-doc2json, kept its files
jackhaohuang Jun 13, 2024
23e4dae
parallel
jackhaohuang Jun 13, 2024
8369f43
clean up
jackhaohuang Jun 15, 2024
5dc84d0
fixed ULID and Embedding
jackhaohuang Jun 18, 2024
cbbf3af
started guided retrival
jackhaohuang Jun 22, 2024
308c361
implemented nanoid and added references
jackhaohuang Jul 2, 2024
91fb347
Attempted fix to memory leak, causing new problem of failed Minio dow…
KastanDay Jul 2, 2024
4c5c381
fixing file not found and other minor errors
jackhaohuang Jul 3, 2024
dfa08b3
Format ONLY, no content changes
KastanDay Jul 3, 2024
6d21bda
Clean up types so all files are happy blue, not angry red
KastanDay Jul 3, 2024
3cc5d3d
format ONLY
KastanDay Jul 3, 2024
98d0e4a
Minor print changes
KastanDay Jul 3, 2024
b620f2b
Fix tempfile.path doesn't exist, better organize the main load_minio.…
KastanDay Jul 8, 2024
cedb248
fix section title
jackhaohuang Jul 8, 2024
fb06ba0
change sections to outline, adding abstract, and minio name
jackhaohuang Jul 11, 2024
667e912
finished bulk part of guided-retrival. implemented the basic idea and…
jackhaohuang Jul 18, 2024
de6b00e
added is_visited to make sure no running duplicates
jackhaohuang Jul 18, 2024
ce2b29c
Update grobid for higher concurrency; add context_idx to sql tables
jackhaohuang Aug 1, 2024
718552e
Add posthog logging, silently allow missing title and author errors, …
KastanDay Aug 3, 2024
fe52f2e
Parallelism and error handling improvements
jackhaohuang Aug 28, 2024
3b12dda
Added Caddy reverse proxy; improved error logs; speed is up to 14.5 P…
jackhaohuang Oct 11, 2024
209f02e
fixing articles > contexts problem
jackhaohuang Oct 17, 2024
1607f98
Major files cleanup. Minor code cleanup
KastanDay Oct 17, 2024
d94ad42
file cleaned
jackhaohuang Oct 17, 2024
ec816b5
Major files cleanup, deleting stuff. nearly done.
KastanDay Oct 17, 2024
3add56f
more minor cleanup
KastanDay Oct 17, 2024
24fdcf2
More major file cleanup
KastanDay Oct 17, 2024
5e0f85a
Delete old bad files
KastanDay Oct 17, 2024
d4492aa
rename folders
KastanDay Oct 17, 2024
9315121
environment changed
jackhaohuang Oct 17, 2024
142d57e
Move from Minio.list_files to using a static list in .txt, so much fa…
jackhaohuang Nov 11, 2024
dc177a1
clean up env
jackhaohuang Nov 11, 2024
0448700
Add all_pubmed_filesnames.txt for faster processing
jackhaohuang Nov 11, 2024
Binary file modified .DS_Store
Binary file not shown.
5 changes: 5 additions & 0 deletions .gitignore
@@ -1,9 +1,14 @@
# don't sync coursera docs

*grobid_speedtest_pdfs*
coursera-dl/
*parsed.json
wandb
*.ipynb
*.pem
*.tei.xml
*.json
*articles.db

# don't expose env files
.env
1 change: 0 additions & 1 deletion .trunk/trunk.yaml
@@ -43,7 +43,6 @@ lint:
paths:
- .github/**/*
- .trunk/**/*
- mkdocs.yml
- .DS_Store
- .vscode/**/*
- README.md
64 changes: 64 additions & 0 deletions ai_ta_backend/utils/types.py
@@ -0,0 +1,64 @@
import datetime
from typing import Any, Dict, List, Optional

import pydantic


class DocumentMetadata(pydantic.BaseModel):
authors: list[str]
journal_name: str
publication_date: datetime.date  # Changed from str to datetime.date
keywords: list[str]
doi: str
title: str
subtitle: Optional[str]
visible_urls: list[str]
field_of_science: str
concise_summary: str
specific_questions_document_can_answer: list[str]
additional_fields: Optional[Dict[str, Any]] = {}

# Can't get this to work properly
# class Config:
# extra = pydantic.Extra.allow # Allow arbitrary additional fields


class GrobidMetadata(pydantic.BaseModel):
"""
additional_fields is for the paper "sections" with arbitrary section names.
Currently, the SQLite DB will have a separate column for every unique "major_sec_title".
We'll see how messy it gets... maybe LLMs can normalize this some.

Format of additional_fields:
{
"major_sec_num": 3,
"major_sec_title": "Extracting Metadata",
"text": "In the previous section, we...", # full text of the section
"tokens": 1067
}
"""
uuid: str
filepath: str
total_tokens: int
avg_tokens_per_section: int
max_tokens_per_section: int
all_sections: Dict[str, str]
additional_fields: Optional[List[Dict[str, Any]]] = [{}]


# Prisma data model https://prisma-client-py.readthedocs.io/en/stable/
# TBH I'd rather invest in learning real SQL. Theo switched from Drizzle to Prisma... no long-term security in either.
# model DocumentMetadata {
# id Int @id @default(autoincrement())
# authors String[]
# journalName String
# publicationDate DateTime
# keywords String[]
# doi String
# title String
# subtitle String? // Optional field
# visibleUrls String[]
# fieldOfScience String
# conciseSummary String
# specificQuestionsDocumentCanAnswer String[]
# }
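
The commented-out Config above tries to let DocumentMetadata carry arbitrary extra fields. A minimal sketch of one way to do that, assuming pydantic v2 (model_config / ConfigDict; on pydantic v1 the inner `class Config: extra = "allow"` form applies instead). The class name DocumentMetadataFlexible is illustrative and not part of this PR:

# Sketch only: allowing arbitrary extra fields on a pydantic v2 model (assumption, not this PR's code).
import datetime
from typing import Optional

import pydantic


class DocumentMetadataFlexible(pydantic.BaseModel):
    # In pydantic v2, model_config replaces the old inner `class Config`.
    model_config = pydantic.ConfigDict(extra="allow")

    title: str
    doi: str
    publication_date: datetime.date
    subtitle: Optional[str] = None


doc = DocumentMetadataFlexible(
    title="Example paper",
    doi="10.1234/example",
    publication_date=datetime.date(2024, 4, 18),
    field_of_science="biology",  # not declared above; kept because extra="allow"
)
print(doc.model_extra)  # {'field_of_science': 'biology'}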
195 changes: 195 additions & 0 deletions llm-guided-pdf-parsing/llm-guided-retrival/evaluate_chunks.py
@@ -0,0 +1,195 @@
import json
import os

from dotenv import load_dotenv
from openai import OpenAI
from read_sql import (
get_context_given_contextID,
get_next_context_id,
get_previous_context_id,
)


def evaluate_chunks(query, chunks, outline):
load_dotenv()

api_key = os.getenv("AZURE_OPENAI_KEY")
endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
deployment = os.getenv("DEPLOYMENT")
api_version = os.getenv("OPENAI_API_VERSION")
openai_api = os.getenv("OPENAI_API")

client = OpenAI(api_key=openai_api)

tools = [
{
"type": "function",
"function": {
"name": "keep_current_chunk",
"description": "Check if the current chunk of context is relevant to the query",
"parameters": {
"type": "object",
"properties": {
"keep": {
"type": "boolean",
"description": "Whether to keep the current chunk or not"
}
},
"required": ["keep"]
}
},
"required": True
},
{
"type": "function",
"function": {
"name": "check_previous_chunk",
"description": "Check if the previous chunk of context is relevant to the query",
"parameters": {
"type": "object",
"properties": {
"check_previous": {
"type": "boolean",
"description": "True if the previous chunk is relevant, False otherwise"
}
},
"required": ["check_previous"]
}
},
},
{
"type": "function",
"function": {
"name": "check_next_chunk",
"description": "Check if the next chunk of context is relevant to the query",
"parameters": {
"type": "object",
"properties": {
"check_next": {
"type": "boolean",
"description": "True if the next chunk is relevant, False otherwise"
}
},
"required": ["check_next"]
}
}
},
{
"type": "function",
"function": {
"name": "go_to_URL",
"description": "Click link or go to URL referenced in chunk.",
"parameters": {
"type": "object",
"properties": {
"go_to_URL": {
"type": "boolean",
"description": "Whether to go to the URL or not"
}
},
"required": ["go_to_URL"]
}
}
},
{
"type": "function",
"function": {
"name": "go_to_section",
"description": "Navigate to a specific section in the document.",
"parameters": {
"type": "object",
"properties": {
"go_to_section": {
"type": "boolean",
"description": "Whether to go to the section or not"
},
"section": {
"type": "string",
"description": "The section to navigate to"
}
},
"required": ["go_to_section", "section"]
}
}
},
]

messages = [{
"role":
"system",
"content": (
"You are an expert in information retrieval. Your task is to evaluate the relevance of a given text chunk to a specific research question. "
"You have four functions at your disposal: 'keep_current_chunk', 'check_previous_chunk', 'check_next_chunk', and 'go_to_url'. "
"Always use 'keep_current_chunk' to determine if the current chunk is relevant. Then, consider using 'check_previous_chunk' or 'check_next_chunk' or 'go_to_url'. "
"When using 'check_previous_chunk', if you find previous chunk relevant to the research question, set 'check_previous' to 'True', otherwise false. "
"When using 'check_next_chunk', if you find next chunk relevant to the research question, set 'check_next' to 'True', otherwise false. "
"Use 'go_to_url' if the chunk suggests checking an external link."
"You can also use 'go_to_section' to navigate to a specific section in the document.")
}, {
"role":
"user",
"content": (
f"Research Question: '{query}'\n\n"
f"Table of Contents: '{outline}'\n\n"
f"Current Text Chunk: '{chunks}'\n\n"
"Evaluate the relevance of the current chunk to the research question. Determine if the current chunk should be kept. "
"Also, decide whether to check the previous chunk by calling 'check_previous_chunk', or the next chunk by calling 'check_next_chunk', or whether to follow an external link using the respective functions. "
"Make sure you call other functions and determine if previous or next chunks are relevant to the research question."
)
}]

completion = client.chat.completions.create(
model='gpt-4o',
messages=messages,
tools=tools,
# tool_choice={"type": "function", "function": {"name": "keep_current_chunk"}},
)

return completion


# query = "What is the name of Polly's son?"
# chunks = [
# "Polly's son, Michael Gray, was taken away when he was a baby. He was raised by a family in Australia. He was brought back to Birmingham by Polly in season 2.",
# "The Blinders' dominance came about from beating rivals, including the 'Sloggers', 'a pugilistic term for someone who could strike a heavy blow in the ring', whom they fought for territory in Birmingham and its surrounding districts.",
# "Britain is a mixture of despair and hedonism in 1919 in the aftermath of the Great War. Returning soldiers, newly minted revolutions and criminal gangs are fighting for survival in a nation rocked by economic upheaval."
# ]
# outline = "1. Introduction\n2. Polly Gray\n3. The Blinders\n4. Britain in 1919"
# result = evaluate_chunks(query, chunks[1], outline)
# print(result)
# for tool in result.choices[0].message.tool_calls:
# print("true" in tool.function.arguments)
# print("---")


def evaluate_chunks_with_step(query, chunk_id, step, chunks_to_keep, is_visited):
if step > 4:
return
if chunk_id in is_visited:
print("already visited")
return
is_visited[chunk_id] = True
step += 1

print(chunk_id)
context_data, current_context, outline = get_context_given_contextID(chunk_id)
completion = evaluate_chunks(query, current_context, outline)

if completion is None:
return
# print(completion)

for tool in completion.choices[0].message.tool_calls:
if tool.function.name == "keep_current_chunk" and "true" in tool.function.arguments:
print("Keeping current chunk")
chunks_to_keep.append(current_context)
if tool.function.name == "check_previous_chunk" and "true" in tool.function.arguments:
previous_context_id = get_previous_context_id(context_data, chunk_id)
if previous_context_id is not None:
print("Checking previous chunk")
evaluate_chunks_with_step(query, previous_context_id, step, chunks_to_keep, is_visited)
if tool.function.name == "check_next_chunk" and "true" in tool.function.arguments:
next_context_id = get_next_context_id(context_data, chunk_id)
if next_context_id is not None:
print("Checking next chunk")
evaluate_chunks_with_step(query, next_context_id, step, chunks_to_keep, is_visited)
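
The loop above decides what to do by substring-matching "true" inside tool.function.arguments. Because the OpenAI SDK returns those arguments as a JSON string, decoding them first is a little more robust. A minimal sketch; the helper name wants_action is illustrative and not part of this PR:

import json


def wants_action(tool_call, flag_name: str) -> bool:
    """Return the boolean value of one flag inside a tool call's JSON arguments."""
    try:
        args = json.loads(tool_call.function.arguments)
    except json.JSONDecodeError:
        return False
    return bool(args.get(flag_name, False))


# Usage inside the loop over completion.choices[0].message.tool_calls:
# if tool.function.name == "keep_current_chunk" and wants_action(tool, "keep"):
#     chunks_to_keep.append(current_context)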
31 changes: 31 additions & 0 deletions llm-guided-pdf-parsing/llm-guided-retrival/guided_retrival.py
@@ -0,0 +1,31 @@
from evaluate_chunks import evaluate_chunks, evaluate_chunks_with_step

# Example chunks, to be updated with the actual retrieval
query = "How many studies in total met inclusion criteria for analysis of FEV 1?"
# 1, 10 ,15
chunks = {
"DjkRDVj9YwkfmxAFfgXgJ":
"Despite reduced resting lung volumes and D LCO , patients with long COVID and dyspnoea have similar physiological response to exercise to healthy subjects. D LCO impairment can marginally explain heterogeneity of complex syndromes such as long COVID. https://bit.ly/40j4aX6",
"e7d6C_gDSEQt5fb6rj8Nt":
"Studies were identified using the systematic review methods described previously by KOTECHA et al. [4, 13] which followed the Preferred Reporting Items for Systematic Reviews and Meta-analysis (PRISMA) guidelines [14] . Briefly, 86 studies in total met inclusion criteria for analysis of FEV 1 . Although this systematic review was designed to capture studies to answer questions specifically related to FEV 1 , the search criteria were subsequently deemed acceptable to capture appropriately other spirometry measures including FVC, FEV 1 /FVC ratio and FEF 25-75 . Studies were included for this analysis if they fulfilled the following criteria: 1) FEV 1 /FVC reported in survivors of preterm birth (with or without BPD) and those born healthy at term; or if 2) FEV 1 /FVC were reported separately in survivors of preterm birth with and without BPD.",
"Fv2taY_bTmimdP-LE8ldk":
"Publication bias was observed when comparing Preterm (All) with Term groups subjectively with an asymmetrical distribution noted on funnel plots, and objectively with Egger's test reaching significance ( p<0.01). When preterm groups were separated into those with and without BPD, however, a symmetrical distribution was noted on all funnel plots and Egger's test did not reach significance (supplementary figure S1 ). This could imply that asymmetry seen in the combined preterm group may be due to the heterogeneity of having two different disease populations defined by the presence or absence of BPD."
}
outline = {
"DjkRDVj9YwkfmxAFfgXgJ":
"0: Abstract\n1: ",
"e7d6C_gDSEQt5fb6rj8Nt":
"0: Abstract\n1: Introduction\n10: Meta-regression\n11: Discussion\n12: \n2: Research questions\n3: Study identification and selection\n4: Publication bias and study quality\n5: Data collection\n6: Data analysis\n7: Study selection and study quality\n8: Publication bias\n9: Meta-analysis",
"Fv2taY_bTmimdP-LE8ldk":
"0: Abstract\n1: Introduction\n10: Meta-regression\n11: Discussion\n12: \n2: Research questions\n3: Study identification and selection\n4: Publication bias and study quality\n5: Data collection\n6: Data analysis\n7: Study selection and study quality\n8: Publication bias\n9: Meta-analysis"
}

chunks_to_keep = []
is_visited = {}

for chunk_id in chunks:
# evaluate_chunks_with_step appends relevant chunks to chunks_to_keep in place and returns None
evaluate_chunks_with_step(query, chunk_id, 0, chunks_to_keep, is_visited)

print(chunks_to_keep)
81 changes: 81 additions & 0 deletions llm-guided-pdf-parsing/llm-guided-retrival/read_sql.py
@@ -0,0 +1,81 @@
import sqlite3

db = '/home/guest/ai-ta-backend/UIUC_Chat/pdf-parsing/articles-test.db'


def get_context_given_contextID(context_id):
conn = sqlite3.connect(db)
cursor = conn.cursor()

query_article_id = """
SELECT articles.ID, articles.Outline
FROM contexts
JOIN sections_contexts ON contexts.ID = sections_contexts.Context_ID
JOIN article_sections ON sections_contexts.Section_ID = article_sections.Section_ID
JOIN articles ON article_sections.Article_ID = articles.ID
WHERE contexts.ID = ?
"""
cursor.execute(query_article_id, (context_id,))
article_id_result = cursor.fetchone()

if article_id_result is None:
conn.close()
raise Exception(f"Context ID {context_id} not found in the database")

article_id = article_id_result[0]
outline = article_id_result[1]

query_contexts = """
SELECT contexts.Section_Num, contexts.num_tokens, contexts.Section_Title, contexts.text, contexts.ID
FROM contexts
JOIN sections_contexts ON contexts.ID = sections_contexts.Context_ID
JOIN article_sections ON sections_contexts.Section_ID = article_sections.Section_ID
WHERE article_sections.Article_ID = ?
"""
cursor.execute(query_contexts, (article_id,))
all_contexts = cursor.fetchall()

query_current_context = """
SELECT contexts.Section_Title, contexts.text
FROM contexts
WHERE contexts.ID = ?
"""
cursor.execute(query_current_context, (context_id,))
current_context = cursor.fetchone()

conn.close()

return all_contexts, current_context, outline


# Given a list of context rows, return the ID of the previous context (or None if this is the first one)
def get_previous_context_id(context_data, current_context_id):
for i in range(len(context_data)):
if context_data[i][4] == current_context_id:
if i == 0:
return None
else:
return context_data[i - 1][4]


def get_next_context_id(context_data, current_context_id):
for i in range(len(context_data)):
if context_data[i][4] == current_context_id:
if i == len(context_data) - 1:
return None
else:
return context_data[i + 1][4]


# context_id = "DjkRDVj9YwkfmxAFfgXgJ"
# context_data, current_context, outline = get_context_given_contextID(context_id)
# for row in context_data:
# print(f"Section Number: {row[0]}")
# print(f"Number of Tokens: {row[1]}")
# print(f"Section Title: {row[2]}")
# print(f"ID: {row[4]}")
# print("---")
# print(current_context)
# print(outline)
# print(get_previous_context_id(context_data, "DjkRDVj9YwkfmxAFfgXgJ"))
# print(get_next_context_id(context_data, "lcP1p88Pu0hIP-rgWzf2u"))
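
The joins in get_context_given_contextID imply a link-table layout between articles, sections, and contexts. The PR does not include the DDL itself, so the sketch below is an assumed reconstruction from the column names used in the queries above, not the schema that actually ships with this branch:

# Assumed schema, reconstructed from the joins above; the real tables may differ.
import sqlite3

conn = sqlite3.connect("articles-test.db")
conn.executescript("""
CREATE TABLE IF NOT EXISTS articles (
    ID       TEXT PRIMARY KEY,
    Outline  TEXT
);
CREATE TABLE IF NOT EXISTS article_sections (
    Section_ID  TEXT PRIMARY KEY,
    Article_ID  TEXT REFERENCES articles(ID)
);
CREATE TABLE IF NOT EXISTS contexts (
    ID             TEXT PRIMARY KEY,
    Section_Num    INTEGER,
    Section_Title  TEXT,
    num_tokens     INTEGER,
    text           TEXT
);
CREATE TABLE IF NOT EXISTS sections_contexts (
    Section_ID  TEXT REFERENCES article_sections(Section_ID),
    Context_ID  TEXT REFERENCES contexts(ID)
);
""")
conn.close()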