Adding various Canvas functions #33

Merged: 24 commits, Oct 23, 2023
Viewing changes from 6 of the 24 commits.

Commits:
c42f606 added the add_users() for Canvas (star-nox, Aug 10, 2023)
6854205 added canvas course ingest (star-nox, Aug 13, 2023)
54e3fb0 updated requirements (star-nox, Aug 13, 2023)
07238a2 added .md ingest and fixed .py ingest (star-nox, Aug 15, 2023)
deceb15 deleted test ipynb file (star-nox, Aug 15, 2023)
27383e1 added nomic viz (star-nox, Aug 16, 2023)
6f08340 added canvas file update function (Aug 21, 2023)
34cbbdc completed update function (star-nox, Aug 25, 2023)
efd9048 updated course export to include all contents (star-nox, Aug 25, 2023)
bf3726b modified to handle diff file structures of downloaded content (star-nox, Aug 25, 2023)
93646ac modified canvas update (Aug 30, 2023)
05ab444 modified add_users() and ingest_course_content() functions (Sep 21, 2023)
f5655ab modified ingest function (star-nox, Sep 21, 2023)
6f80b96 modified update_files() for file replacement (star-nox, Sep 22, 2023)
0223a22 removed the extra os.remove() (star-nox, Sep 22, 2023)
2e10cc8 fix underscore to dash in for pip (KastanDay, Sep 29, 2023)
a38fb90 removed json import and added abort to canvas functions (star-nox, Oct 2, 2023)
79142c5 Merge branch 'main' into canvas (star-nox, Oct 2, 2023)
7a9c21d removed file update functions from this PR (star-nox, Oct 2, 2023)
65b2ba1 added ability to receive canvas checkbox data (Maxwell-Lindsey, Oct 23, 2023)
acd61e7 Merge branch 'main' into canvas (Maxwell-Lindsey, Oct 23, 2023)
7c4a839 removed unused import (Maxwell-Lindsey, Oct 23, 2023)
8de8900 added comment for debugging (Maxwell-Lindsey, Oct 23, 2023)
46646c7 removing pypandoc because it appears to be unused (KastanDay, Oct 23, 2023)
ai_ta_backend/canvas.py (93 additions, 0 deletions)

@@ -0,0 +1,93 @@
import os
from canvasapi import Canvas
import requests
from zipfile import ZipFile
from ai_ta_backend.aws import upload_data_files_to_s3
from ai_ta_backend.vector_database import Ingest


class CanvasAPI():

  def __init__(self):
    self.canvas_client = Canvas("https://canvas.illinois.edu",
                                os.getenv('CANVAS_ACCESS_TOKEN'))

  def add_users(self, canvas_course_id: str, course_name: str):
    """
    Get all users in a course
    """
    course = self.canvas_client.get_course(canvas_course_id)
    users = course.get_users()
    user_emails = []
    for user in users:
      net_id = user.sis_user_id
      email_id = net_id + "@illinois.edu"
      user_emails.append(email_id)

    print(user_emails)

    if len(user_emails) > 0:
      return "Success"
    else:
      return "Failed"

  def ingest_course_content(self, canvas_course_id: str, course_name: str):
    """
    Ingests all Canvas course materials through the course ID.
    """
    print("In ingest_course_content")

    api_path = "https://canvas.illinois.edu/api/v1/courses/" + str(canvas_course_id)
    headers = {"Authorization": "Bearer " + os.getenv('CANVAS_ACCESS_TOKEN')}

    try:
      # Start the content export
      content_export_api_path = api_path + "/content_exports?export_type=zip"
      start_content_export = requests.post(content_export_api_path, headers=headers)
      content_export_id = start_content_export.json()['id']
      progress_url = start_content_export.json()['progress_url']

      # Wait for the content export to finish
      export_progress = requests.get(progress_url, headers=headers)
      while export_progress.json()['workflow_state'] != 'completed':
        export_progress = requests.get(progress_url, headers=headers)

      # View content export and get download URL
      show_content_export_api_path = api_path + "/content_exports/" + str(content_export_id)
      print("Show export path: ", show_content_export_api_path)

      show_content_export = requests.get(show_content_export_api_path, headers=headers)
      download_url = show_content_export.json()['attachment']['url']
      file_name = show_content_export.json()['attachment']['filename']

      # Create a directory for the content
      directory = os.path.join(os.getcwd(), "course_content")
      if not os.path.exists(directory):
        os.mkdir(directory)

      # Download zip and save to directory
      download = requests.get(download_url, headers=headers)
      with open(os.path.join(directory, file_name), 'wb') as f:
        f.write(download.content)
      print("Downloaded!")

      # Extract and read from zip file
      filepath = "course_content/" + file_name
      with ZipFile(filepath, 'r') as zip:
        zip.printdir()
        zip.extractall("course_content")
      print('Done!')
      os.remove(filepath)

      # Upload files to S3 and call bulk_ingest
      s3_paths = upload_data_files_to_s3(course_name, "course_content")
      ingest = Ingest()
      canvas_ingest = ingest.bulk_ingest(s3_paths, course_name=course_name)

      return canvas_ingest

    except Exception as e:
      print(e)
      return "Failed"
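For context, a minimal usage sketch of the new class. The course ID "1234" and course name "ece120" are hypothetical, and CANVAS_ACCESS_TOKEN is assumed to be set in the environment:

# Usage sketch only -- "1234" and "ece120" are made-up values.
from ai_ta_backend.canvas import CanvasAPI

canvas = CanvasAPI()  # reads CANVAS_ACCESS_TOKEN from the environment

# Collect user emails for a course; returns "Success" or "Failed".
print(canvas.add_users("1234", "ece120"))

# Export the course as a zip, download and extract it, upload the
# contents to S3, then hand the S3 paths to bulk_ingest.
print(canvas.ingest_course_content("1234", "ece120"))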
ai_ta_backend/data_logging.py (49 additions, 0 deletions)

@@ -0,0 +1,49 @@
import os
import nomic
from nomic import atlas
from langchain.embeddings import OpenAIEmbeddings
import numpy as np
import time


class DataLog():

  def __init__(self):
    self.login = nomic.login(os.getenv('NOMIC_API_KEY'))

  def nomic_log(self, course_name: str, search_query: str, retrieved_contexts) -> str:
    """
    Logs user query and retrieved contexts to Nomic.
    """
    print("course_name: ", course_name)
    print("search_query: ", search_query)
    print("retrieved_contexts: ", len(retrieved_contexts))

    # concat all retrieved contexts into one string
    context_string = ""
    for context in retrieved_contexts:
      context_string += context['text'] + " "

    #print("context_string: ", context_string)

    # convert query and context to embeddings
    embeddings_model = OpenAIEmbeddings()
    embeddings = embeddings_model.embed_documents([search_query, context_string])

    data = [{'course_name': course_name, 'query': search_query, 'id': time.time()},
            {'course_name': course_name, 'query': context_string, 'id': time.time()}]

    print("len of data: ", len(data))
    print("len of embeddings: ", len(embeddings))
    print(data)

    project = atlas.AtlasProject(name="User Query Text Viz", add_datums_if_exists=True)
    map = project.get_map('Search Query Viz')
    print(project.name)
    print(map)

    with project.wait_for_project_lock() as project:
      project.add_embeddings(embeddings=np.array(embeddings), data=data)
      project.rebuild_maps()

    print("done")
    # log to Nomic
    return "WIP"
ai_ta_backend/main.py (41 additions, 1 deletion)

@@ -12,6 +12,8 @@

from ai_ta_backend.vector_database import Ingest
from ai_ta_backend.web_scrape import main_crawler, mit_course_download
from ai_ta_backend.canvas import CanvasAPI
from ai_ta_backend.data_logging import DataLog

app = Flask(__name__)
CORS(app)
@@ -131,6 +133,10 @@ def getTopContexts():
  ingester = Ingest()
  found_documents = ingester.getTopContexts(search_query, course_name, token_limit)

  # add nomic log function here
  logger = DataLog()
  result = logger.nomic_log(course_name, search_query, found_documents)

  response = jsonify(found_documents)
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

@@ -267,7 +273,6 @@ def log():
  """
  todo
  """
  print("In /log")

  ingester = Ingest()

@@ -312,6 +317,41 @@ def mit_download_course():
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

@app.route('/addCanvasUsers', methods=['GET'])
def add_canvas_users():
  """
  Add users from canvas to the course
  """
  print("In /addCanvasUsers")

  canvas = CanvasAPI()
  canvas_course_id: str = request.args.get('course_id')
  course_name: str = request.args.get('course_name')

  success_or_failure = canvas.add_users(canvas_course_id, course_name)

  response = jsonify({"outcome": success_or_failure})

  response.headers.add('Access-Control-Allow-Origin', '*')
  return response


@app.route('/ingestCanvas', methods=['GET'])
def ingest_canvas():
  """
  Ingest course content from Canvas
  """
  canvas = CanvasAPI()
  canvas_course_id: str = request.args.get('course_id')
  course_name: str = request.args.get('course_name')

  success_or_failure = canvas.ingest_course_content(canvas_course_id, course_name)

  response = jsonify({"outcome": success_or_failure})

  response.headers.add('Access-Control-Allow-Origin', '*')
  return response


# TODO: add a way to delete items from course based on base_url

if __name__ == '__main__':
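For reference, both new routes can be exercised with plain GET requests. A sketch, assuming the Flask app is running locally on port 8000 (the host, port, and query values here are all hypothetical):

# Sketch only -- hypothetical host, port, and query values.
import requests

base_url = "http://localhost:8000"
params = {"course_id": "1234", "course_name": "ece120"}

# Enroll the Canvas roster; the route wraps CanvasAPI.add_users().
r = requests.get(base_url + "/addCanvasUsers", params=params)
print(r.json())  # {"outcome": "Success"} or {"outcome": "Failed"}

# Trigger the export/download/ingest pipeline.
r = requests.get(base_url + "/ingestCanvas", params=params)
print(r.json())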
ai_ta_backend/vector_database.py (42 additions, 12 deletions)

@@ -25,7 +25,7 @@
 # # from arize.utils.types import (Embedding, EmbeddingColumnNames, Environments,
 # #                                Metrics, ModelTypes, Schema)
 from langchain.document_loaders import (Docx2txtLoader, PythonLoader,
-                                        SRTLoader,
+                                        SRTLoader, UnstructuredFileLoader,
                                         UnstructuredPowerPointLoader, TextLoader, GitLoader)
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.schema import Document
@@ -243,7 +243,7 @@ def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwargs):
           success_status['failure_ingest'].append(s3_path)
         else:
           success_status['success_ingest'].append(s3_path)
-      elif s3_path.endswith('.txt'):
+      elif s3_path.endswith('.txt') or s3_path.endswith('.md'):
         ret = self._ingest_single_txt(s3_path, course_name)
         if ret != "Success":
           success_status['failure_ingest'].append(s3_path)
@@ -280,21 +280,51 @@ def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwargs):

   def _ingest_single_py(self, s3_path: str, course_name: str):
     try:
-      with NamedTemporaryFile() as tmpfile:
-        # download from S3 into vtt_tmpfile
-        self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile)
-        loader = PythonLoader(tmpfile.name)
-        documents = loader.load()
-        texts = [doc.page_content for doc in documents]
-        metadatas: List[Dict[str, Any]] = [{
+      print("in ingest_py")
+
+      file_name = s3_path.split("/")[-1]
+      file_path = "media/" + file_name
+
+      self.s3_client.download_file(os.getenv('S3_BUCKET_NAME'), s3_path, file_path)
+      loader = PythonLoader(file_path)
+      documents = loader.load()
+
+      texts = [doc.page_content for doc in documents]
+
+      metadatas: List[Dict[str, Any]] = [{
           'course_name': course_name,
           's3_path': s3_path,
           'readable_filename': Path(s3_path).name,
           'pagenumber_or_timestamp': '',
       } for doc in documents]
+      #print(texts)
+      os.remove(file_path)

-        success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas)
-        return success_or_failure
+      success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas)
+      return success_or_failure
+
+      # with NamedTemporaryFile() as tmpfile:
+      #   # download from S3 into tmpfile
+      #   self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile)
+
+      #   print("filename: ", file_name)
+      #   loader = PythonLoader("media/" + file_name)
+      #   print("file: ", tmpfile)
+
+      #   documents = loader.load()
+      #   texts = [doc.page_content for doc in documents]
+
+      #   metadatas: List[Dict[str, Any]] = [{
+      #     'course_name': course_name,
+      #     's3_path': s3_path,
+      #     'readable_filename': Path(s3_path).name,
+      #     'pagenumber_or_timestamp': '',
+      #   } for doc in documents]
+
+      #   print(documents)
+
+      #   success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas)
+      #   return success_or_failure
     except Exception as e:
       print(f"ERROR IN py READING {e}")
@@ -566,7 +596,7 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
     return "Success"

   def _ingest_single_txt(self, s3_path: str, course_name: str) -> str:
-    """Ingest a single .txt file from S3.
+    """Ingest a single .txt or .md file from S3.
     Args:
       s3_path (str): A path to a .txt file in S3
       course_name (str): The name of the course
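The net effect of the bulk_ingest change is that Markdown files now share the plain-text ingest path. A simplified sketch of that routing, for illustration only (the real method dispatches on many more extensions):

# Simplified illustration of the new extension routing in bulk_ingest.
def route(s3_path: str) -> str:
  if s3_path.endswith('.py'):
    return '_ingest_single_py'
  elif s3_path.endswith('.txt') or s3_path.endswith('.md'):
    return '_ingest_single_txt'  # .md reuses the .txt ingest path
  else:
    return 'other handler or unsupported'

assert route('lecture_notes.md') == '_ingest_single_txt'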
Binary file not shown.
requirements.txt (3 additions, 0 deletions)

@@ -33,6 +33,9 @@ ffmpeg-python
 ffprobe
 ffmpeg
 beautifulsoup4
+pypandoc
+pypandoc_binary
+canvasapi
 cs-dlp @ git+https://github.com/raffaem/[email protected] # previously called coursera-dl

 # No arize for now, huge build size with these additions.
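Of the three new requirements, canvasapi is what backs the CanvasAPI class above. A quick smoke test that the dependency resolves (the token below is a fake placeholder; canvasapi makes no network call at construction time):

# Sketch only -- the token is a fake placeholder, purely for illustration.
from canvasapi import Canvas

client = Canvas("https://canvas.illinois.edu", "fake-token-for-illustration")
print(type(client).__name__)  # Canvas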