Skip to content

Commit

Permalink
added .md ingest and fixed .py ingest
Browse files Browse the repository at this point in the history
  • Loading branch information
star-nox committed Aug 15, 2023
1 parent 54e3fb0 commit 07238a2
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 12 deletions.
56 changes: 56 additions & 0 deletions ai_ta_backend/data_logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import os
import nomic
from nomic import atlas
from langchain.embeddings import OpenAIEmbeddings
import numpy as np

class DataLog():
  """Logs user search queries and their retrieved contexts to a Nomic Atlas
  project ("Search Query Viz") so query/context pairs can be visualized."""

  def __init__(self):
    # Authenticate with Nomic once per instance; NOMIC_API_KEY must be set
    # in the environment.
    self.login = nomic.login(os.getenv('NOMIC_API_KEY'))

  def nomic_log(self, course_name: str, search_query: str, retrieved_contexts) -> str:
    """
    Logs user query and retrieved contexts to Nomic.

    Args:
      course_name: Course the query was issued against.
      search_query: The user's raw search query.
      retrieved_contexts: Iterable of dicts, each with a 'text' key holding
        one retrieved context chunk (shape assumed from usage — confirm with caller).

    Returns:
      "WIP" — placeholder status string while this logger is a work in progress.
    """
    print("course_name: ", course_name)
    print("search_query: ", search_query)
    print("retrieved_contexts: ", len(retrieved_contexts))

    # Concatenate all retrieved context chunks into one string so the query
    # and its combined context are embedded as a pair of documents.
    context_string = ""
    for context in retrieved_contexts:
      context_string += context['text'] + " "

    print("context_string: ", context_string)

    # Embed the query and its combined context with OpenAI.
    # FIX: this previously uploaded np.random.rand(2, 1536) placeholder
    # vectors, so the Atlas map showed normally-distributed noise instead
    # of real embeddings.
    embeddings_model = OpenAIEmbeddings()
    embeddings = np.array(embeddings_model.embed_documents([search_query, context_string]))

    data = [{'course': course_name, 'id': i} for i in range(len(embeddings))]
    print("len of data: ", len(data))
    print("len of embeddings: ", embeddings.shape)

    # Append to the existing Atlas project instead of creating a new map per call.
    project = atlas.AtlasProject(name="Search Query Viz", add_datums_if_exists=True)
    print(project.name)

    with project.wait_for_project_lock() as project:
      project.add_embeddings(embeddings=embeddings, data=data)
      project.rebuild_maps()

    print("done")
    return "WIP"
5 changes: 5 additions & 0 deletions ai_ta_backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from ai_ta_backend.vector_database import Ingest
from ai_ta_backend.web_scrape import main_crawler, mit_course_download
from ai_ta_backend.canvas import CanvasAPI
from ai_ta_backend.data_logging import DataLog

app = Flask(__name__)
CORS(app)
Expand Down Expand Up @@ -132,6 +133,10 @@ def getTopContexts():
ingester = Ingest()
found_documents = ingester.getTopContexts(search_query, course_name, token_limit)

# add nomic log function here
logger = DataLog()
result = logger.nomic_log(course_name, search_query, found_documents)

response = jsonify(found_documents)
response.headers.add('Access-Control-Allow-Origin', '*')
return response
Expand Down
111 changes: 111 additions & 0 deletions ai_ta_backend/nomic.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import nomic\n",
"from nomic import atlas\n",
"import supabase\n",
"import os\n",
"\n",
"nomic.login(os.getenv(\"NOMIC_API_KEY\"))  # secret redacted: load the API key from the environment, never commit it"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://twzwfuydgnnjcaopyfdv.supabase.co\n"
]
}
],
"source": [
"# create supabase client\n",
"url = \"https://twzwfuydgnnjcaopyfdv.supabase.co\"\n",
"key = os.environ[\"SUPABASE_API_KEY\"]  # service-role key redacted: this JWT grants full DB access and must never be committed\n",
"\n",
"client = supabase.create_client(supabase_url=url, supabase_key=key)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"response = client.table(\"documents\").select(\"*\").eq(\"course_name\", \"canvas\").limit(2).execute()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data=[{'id': 46461, 'created_at': '2023-08-10T20:32:20.802466+00:00', 's3_path': 'courses/canvas/How_businesses_and_websites_can_use_third-party_data_to_target_advertising_through_LinkedIn_|_LinkedIn_Help.html', 'readable_filename': '8/10/23 How businesses and websites can use third-party data to target advertising through LinkedIn | LinkedIn Help', 'course_name': 'canvas', 'url': 'https://www.linkedin.com/help/linkedin/answer/a426264?trk=microsites-frontend_legal_privacy-policy&lang=en', 'contexts': [{'text': 'How businesses and websites can use third-party data to target advertising through LinkedIn | LinkedIn Help\\nDue to high support volume, it may take longer than usual to hear back from our Support Agents. \\xa0dismiss this messageAttention screen reader users, you are in a mobile optimized view and content may not appear where you expect it to be. To return the screen to its desktop view, please maximize your browser.Skip to contentSkip to searchClose jump menuClose menuGet help with:LinkedInSales NavigatorTalent HubRecruiterSales InsightsMarketing SolutionsTalent InsightsCorporate BillingLearningGo to LinkedInSign inHelpSign inContact usEnglish (English)Čeština (Czech)Deutsch (German)Bahasa Indonesia (Indonesian)Español (Spanish)Türkçe (Turkish)Français (French)हिंदी (Hindi)Italiano (Italian)日本語 (Japanese)Nederlands (Dutch)Português (Portuguese)Svenska (Swedish)Polski (Polish)Bahasa Malaysia (Malay)한국어 (Korean)Dansk (Danish)Norsk (Norwegian)Română (Romanian)Русский (Russian)Українська (Ukrainian)简体中文 (Chinese (Simplified))正體中文 (Chinese (Traditional))ภาษาไทย (Thai)العربية (Arabic)LinkedIn Corporation © 2023AboutTransparency CenterPrivacy and TermsCookiesCopyrightTermsPrivacyGuest controlsYour California Privacy ChoicesDismiss privacy menuLinkedIn Corporation © 2023', 'embedding': None, 'timestamp': None, 'pagenumber': ''}, {'text': 'How businesses and websites can use third-party data to target advertising through LinkedIn | LinkedIn Help\\nDue to high 
support volume, it may take longer than usual to hear back from our Support Agents. \\xa0dismiss this messageAttention screen reader users, you are in a mobile optimized view and content may not appear where you expect it to be. To return the screen to its desktop view, please maximize your browser.Skip to contentSkip to searchClose jump menuClose menuGet help with:LinkedInLearningCorporate BillingTalent InsightsMarketing SolutionsSales InsightsTalent HubSales NavigatorRecruiterGo to LinkedInSign inHelpSign inContact usEnglish (English)Čeština (Czech)Deutsch (German)Bahasa Indonesia (Indonesian)Español (Spanish)Türkçe (Turkish)Français (French)हिंदी (Hindi)Italiano (Italian)日本語 (Japanese)Nederlands (Dutch)Português (Portuguese)Svenska (Swedish)Polski (Polish)Bahasa Malaysia (Malay)한국어 (Korean)Dansk (Danish)Norsk (Norwegian)Română (Romanian)Русский (Russian)Українська (Ukrainian)简体中文 (Chinese (Simplified))正體中文 (Chinese (Traditional))ภาษาไทย (Thai)العربية (Arabic)LinkedIn Corporation © 2023AboutTransparency CenterPrivacy and TermsCookiesCopyrightTermsPrivacyGuest controlsYour California Privacy ChoicesDismiss privacy menuLinkedIn Corporation © 2023', 'embedding': None, 'timestamp': None, 'pagenumber': ''}], 'base_url': None}, {'id': 46519, 'created_at': '2023-08-10T20:32:30.368445+00:00', 's3_path': 'courses/canvas/Off-LinkedIn_Visibility_|_LinkedIn_Help.html', 'readable_filename': '8/10/23 Off-LinkedIn Visibility | LinkedIn Help', 'course_name': 'canvas', 'url': 'https://www.linkedin.com/help/linkedin/answer/a1340507?trk=microsites-frontend_legal_privacy-policy&lang=en', 'contexts': [{'text': 'Off-LinkedIn Visibility | LinkedIn Help\\nDue to high support volume, it may take longer than usual to hear back from our Support Agents. \\xa0dismiss this messageAttention screen reader users, you are in a mobile optimized view and content may not appear where you expect it to be. 
To return the screen to its desktop view, please maximize your browser.Skip to contentSkip to searchClose jump menuClose menuGet help with:LinkedInLearningRecruiterSales NavigatorTalent HubSales InsightsMarketing SolutionsCorporate BillingTalent InsightsGo to LinkedInSign inHelpSign inContact usEnglish (English)Čeština (Czech)Deutsch (German)Bahasa Indonesia (Indonesian)Español (Spanish)Türkçe (Turkish)Français (French)हिंदी (Hindi)Italiano (Italian)日本語 (Japanese)Nederlands (Dutch)Português (Portuguese)Svenska (Swedish)Polski (Polish)Bahasa Malaysia (Malay)한국어 (Korean)Dansk (Danish)Norsk (Norwegian)Română (Romanian)Русский (Russian)Українська (Ukrainian)简体中文 (Chinese (Simplified))正體中文 (Chinese (Traditional))ภาษาไทย (Thai)العربية (Arabic)LinkedIn Corporation © 2023AboutTransparency CenterPrivacy and TermsCookiesCopyrightTermsPrivacyGuest controlsYour California Privacy ChoicesDismiss privacy menuLinkedIn Corporation © 2023', 'embedding': None, 'timestamp': None, 'pagenumber': ''}, {'text': 'Off-LinkedIn Visibility | LinkedIn Help\\nDue to high support volume, it may take longer than usual to hear back from our Support Agents. \\xa0dismiss this messageAttention screen reader users, you are in a mobile optimized view and content may not appear where you expect it to be. 
To return the screen to its desktop view, please maximize your browser.Skip to contentSkip to searchClose jump menuClose menuGet help with:LinkedInSales NavigatorCorporate BillingTalent HubRecruiterMarketing SolutionsTalent InsightsLearningSales InsightsGo to LinkedInSign inHelpSign inContact usEnglish (English)Čeština (Czech)Deutsch (German)Bahasa Indonesia (Indonesian)Español (Spanish)Türkçe (Turkish)Français (French)हिंदी (Hindi)Italiano (Italian)日本語 (Japanese)Nederlands (Dutch)Português (Portuguese)Svenska (Swedish)Polski (Polish)Bahasa Malaysia (Malay)한국어 (Korean)Dansk (Danish)Norsk (Norwegian)Română (Romanian)Русский (Russian)Українська (Ukrainian)简体中文 (Chinese (Simplified))正體中文 (Chinese (Traditional))ภาษาไทย (Thai)العربية (Arabic)LinkedIn Corporation © 2023AboutTransparency CenterPrivacy and TermsCookiesCopyrightTermsPrivacyGuest controlsYour California Privacy ChoicesDismiss privacy menuLinkedIn Corporation © 2023', 'embedding': None, 'timestamp': None, 'pagenumber': ''}], 'base_url': None}] count=None\n"
]
}
],
"source": [
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Document(page_content='import os\\nfrom canvasapi import Canvas\\nimport requests\\nfrom zipfile import ZipFile\\nfrom ai_ta_backend.aws import upload_data_files_to_s3\\nfrom ai_ta_backend.vector_database import Ingest\\n\\n\\nclass CanvasAPI():\\n def __init__(self):\\n self.canvas_client = Canvas(\"https://canvas.illinois.edu\", \\n os.getenv(\\'CANVAS_ACCESS_TOKEN\\'))\\n \\n def add_users(self, canvas_course_id: str, course_name: str):\\n \"\"\"\\n Get all users in a course\\n \"\"\"\\n course = self.canvas_client.get_course(canvas_course_id)\\n users = course.get_users()\\n user_emails = []\\n for user in users:\\n net_id = user.sis_user_id\\n email_id = net_id + \"@illinois.edu\"\\n user_emails.append(email_id)\\n \\n print(user_emails)\\n \\n if len(user_emails) > 0:\\n return \"Success\"\\n else:\\n return \"Failed\"\\n \\n def ingest_course_content(self, canvas_course_id: str, course_name: str):\\n \"\"\"\\n Ingests all Canvas course materials through the course ID.\\n \"\"\"\\n print(\"In ingest_course_content\")\\n\\n api_path = \"https://canvas.illinois.edu/api/v1/courses/\" + str(canvas_course_id)\\n headers = {\"Authorization\": \"Bearer \" + os.getenv(\\'CANVAS_ACCESS_TOKEN\\')}\\n\\n try:\\n # Start the content export\\n content_export_api_path = api_path + \"/content_exports?export_type=zip\"\\n start_content_export = requests.post(content_export_api_path, headers=headers)\\n content_export_id = start_content_export.json()[\\'id\\']\\n progress_url = start_content_export.json()[\\'progress_url\\']\\n\\n # Wait for the content export to finish\\n export_progress = requests.get(progress_url, headers=headers)\\n while export_progress.json()[\\'workflow_state\\'] != \\'completed\\':\\n export_progress = requests.get(progress_url, headers=headers)\\n \\n # View content export and get download URL\\n show_content_export_api_path = api_path + \"/content_exports/\" + str(content_export_id)\\n print(\"Show export path: \", 
show_content_export_api_path)\\n\\n show_content_export = requests.get(show_content_export_api_path, headers=headers)\\n download_url = show_content_export.json()[\\'attachment\\'][\\'url\\']\\n file_name = show_content_export.json()[\\'attachment\\'][\\'filename\\']\\n\\n # Create a directory for the content\\n directory = os.path.join(os.getcwd(), \"course_content\")\\n if not os.path.exists(directory):\\n os.mkdir(directory)\\n\\n # Download zip and save to directory\\n download = requests.get(download_url, headers=headers)\\n with open(os.path.join(directory, file_name), \\'wb\\') as f:\\n f.write(download.content)\\n print(\"Downloaded!\")\\n\\n # Extract and read from zip file\\n filepath = \"course_content/\" + file_name\\n with ZipFile(filepath, \\'r\\') as zip:\\n zip.printdir()\\n zip.extractall(\"course_content\")\\n print(\\'Done!\\')\\n os.remove(filepath)\\n\\n # Upload files to S3 and call bulk_ingest\\n s3_paths = upload_data_files_to_s3(course_name, \"course_content\")\\n ingest = Ingest()\\n canvas_ingest = ingest.bulk_ingest(s3_paths, course_name=course_name)\\n \\n return canvas_ingest\\n \\n except Exception as e:\\n print(e)\\n return \"Failed\"\\n\\n\\n ', metadata={'source': 'canvas.py'})]\n"
]
}
],
"source": [
"from langchain.document_loaders import PythonLoader\n",
"\n",
"file = \"canvas.py\"\n",
"\n",
"loader = PythonLoader(file)\n",
"data = loader.load()\n",
"\n",
"print(data)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
54 changes: 42 additions & 12 deletions ai_ta_backend/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# # from arize.utils.types import (Embedding, EmbeddingColumnNames, Environments,
# # Metrics, ModelTypes, Schema)
from langchain.document_loaders import (Docx2txtLoader, PythonLoader,
SRTLoader,
SRTLoader, UnstructuredFileLoader,
UnstructuredPowerPointLoader, TextLoader, GitLoader)
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
Expand Down Expand Up @@ -243,7 +243,7 @@ def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwarg
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
elif s3_path.endswith('.txt'):
elif s3_path.endswith('.txt') or s3_path.endswith('.md'):
ret = self._ingest_single_txt(s3_path, course_name)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
Expand Down Expand Up @@ -280,21 +280,51 @@ def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwarg

def _ingest_single_py(self, s3_path: str, course_name: str):
  """Ingest a single .py file from S3.

  Downloads the file to a local 'media/' working directory (PythonLoader
  needs a real path on disk), loads it with LangChain's PythonLoader, and
  uploads the resulting texts + metadata via split_and_upload.

  Args:
    s3_path (str): Key of a .py file in the S3 bucket.
    course_name (str): The name of the course.

  Returns:
    str: whatever split_and_upload returns on success (expected "Success"),
    or "Failed" on any exception.
  """
  file_path = None
  try:
    print("in ingest_py")

    file_name = s3_path.split("/")[-1]
    # FIX: ensure the working directory exists instead of assuming "media/" is present.
    os.makedirs("media", exist_ok=True)
    file_path = os.path.join("media", file_name)

    self.s3_client.download_file(os.getenv('S3_BUCKET_NAME'), s3_path, file_path)
    loader = PythonLoader(file_path)
    documents = loader.load()

    texts = [doc.page_content for doc in documents]

    metadatas: List[Dict[str, Any]] = [{
        'course_name': course_name,
        's3_path': s3_path,
        'readable_filename': Path(s3_path).name,
        'pagenumber_or_timestamp': '',
    } for doc in documents]

    return self.split_and_upload(texts=texts, metadatas=metadatas)
  except Exception as e:
    # FIX: return an explicit failure marker instead of implicitly returning
    # None (callers compare the result against "Success").
    print(f"ERROR IN py READING {e}")
    return "Failed"
  finally:
    # FIX: always remove the downloaded temp file, even when ingestion fails.
    if file_path and os.path.exists(file_path):
      os.remove(file_path)

Expand Down Expand Up @@ -566,7 +596,7 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
return "Success"

def _ingest_single_txt(self, s3_path: str, course_name: str) -> str:
"""Ingest a single .txt file from S3.
"""Ingest a single .txt or .md file from S3.
Args:
s3_path (str): A path to a .txt file in S3
course_name (str): The name of the course
Expand Down

0 comments on commit 07238a2

Please sign in to comment.