Web Scrape Updates #163

Draft · wants to merge 26 commits into main

Commits (26)
9ba6e9d
Updated Nomic in requirements.txt
star-nox Nov 6, 2023
ca806eb
fix openai version to pre 1.0
KastanDay Nov 7, 2023
17a7779
upgrade python from 3.8 to 3.10
KastanDay Nov 7, 2023
8562de7
trying to fix tesseract // pdfminer requirements for image ingest
KastanDay Nov 7, 2023
be34f01
adding strict versions to all requirements
KastanDay Nov 7, 2023
d4b4e8f
Bump pymupdf from 1.22.5 to 1.23.6 (#136)
dependabot[bot] Nov 7, 2023
170ed79
compatible wheel version
KastanDay Nov 7, 2023
6b94aac
upgrade pip during image startup
KastanDay Nov 7, 2023
c084960
properly upgrade pip
KastanDay Nov 7, 2023
f4b8bd9
Fully lock ALL requirements. Hopefully speed up build times, too
KastanDay Nov 7, 2023
4e80002
Limit unstructured dependencies, image ballooned from 700MB to 6GB. Ho…
KastanDay Nov 7, 2023
abf1fc2
Lock version of pip
KastanDay Nov 7, 2023
8a8eac2
Lock (correct) version of pip
KastanDay Nov 7, 2023
cf78800
add libgl1 for cv2 in Docker (for unstructured)
KastanDay Nov 7, 2023
62883e8
adding proper error logging to image ingest
KastanDay Nov 7, 2023
fcfa485
Installing unstructured requirements individually to hopefully reduce…
KastanDay Nov 7, 2023
97bbbd9
Reduce use of unstructured, hopefully the install is much smaller now
KastanDay Nov 7, 2023
2103336
Guarantee Unique S3 Upload paths (#137)
KastanDay Nov 7, 2023
e547a94
Bump typing-extensions from 4.7.1 to 4.8.0 (#90)
dependabot[bot] Nov 7, 2023
27a6680
Bump flask from 2.3.3 to 3.0.0 (#101)
dependabot[bot] Nov 7, 2023
a5b418c
Guard against kwargs failures during webscrape
KastanDay Nov 8, 2023
0d371ba
HOTFIX: kwargs in html and pdf ingest for /webscrape
KastanDay Nov 8, 2023
ba1cbb1
Export conversation history on /analysis page (#141)
star-nox Nov 20, 2023
cfca31c
added option for extending one URL out when on baseurl or to opt out …
jkmin3 Dec 5, 2023
36f7f90
removed depth search for now
jkmin3 Dec 5, 2023
95247b9
made filetypes a variable
jkmin3 Dec 5, 2023
Files changed

1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@ coursera-dl/
 *parsed.json
 wandb
 *.ipynb
+*.pem
 
 # don't expose env files
 .env
66 changes: 66 additions & 0 deletions ai_ta_backend/export_data.py
@@ -0,0 +1,66 @@
import os
import uuid
import pandas as pd
import supabase
from flask import send_file

def export_convo_history_csv(course_name: str, from_date='', to_date=''):
  """
  Export conversation history to csv file.
  Optional args: from_date, to_date
  """
  print("Exporting conversation history to csv file...")
  supabase_client = supabase.create_client(  # type: ignore
      supabase_url=os.getenv('SUPABASE_URL'),  # type: ignore
      supabase_key=os.getenv('SUPABASE_API_KEY'))  # type: ignore

  if from_date == '' and to_date == '':
    # Get all data
    print("No dates")
    response = supabase_client.table("llm-convo-monitor").select("id", count='exact').eq("course_name", course_name).order('id', desc=False).execute()
  elif from_date != '' and to_date == '':
    print("only from_date")
    # Get data from from_date to now
    response = supabase_client.table("llm-convo-monitor").select("id", count='exact').eq("course_name", course_name).gte('created_at', from_date).order('id', desc=False).execute()
  elif from_date == '' and to_date != '':
    print("only to_date")
    # Get data from beginning to to_date
    response = supabase_client.table("llm-convo-monitor").select("id", count='exact').eq("course_name", course_name).lte('created_at', to_date).order('id', desc=False).execute()
  else:
    print("both from_date and to_date")
    # Get data from from_date to to_date
    response = supabase_client.table("llm-convo-monitor").select("id", count='exact').eq("course_name", course_name).gte('created_at', from_date).lte('created_at', to_date).order('id', desc=False).execute()

  # Fetch data
  if response.count > 0:
    print("id count greater than zero")
    first_id = response.data[0]['id']
    last_id = response.data[-1]['id']

    filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.csv'
    file_path = os.path.join(os.getcwd(), filename)
    # Fetch data in batches of 25 from first_id to last_id
    while first_id <= last_id:
      print("Fetching data from id: ", first_id)
      response = supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte('id', first_id).lte('id', last_id).order('id', desc=False).limit(25).execute()
      # Convert to pandas dataframe
      df = pd.DataFrame(response.data)
      # Append to csv file; write the header only when creating the file
      if not os.path.isfile(file_path):
        df.to_csv(file_path, mode='a', header=True, index=False)
      else:
        df.to_csv(file_path, mode='a', header=False, index=False)

      # Advance first_id past the last row fetched
      first_id = response.data[-1]['id'] + 1
      print("updated first_id: ", first_id)

    # Return the file details so the route can serve the download
    try:
      return (file_path, filename, os.getcwd())
    except Exception as e:
      print(e)
      return "Error downloading file"
  else:
    return "No data found between the dates"

35 changes: 32 additions & 3 deletions ai_ta_backend/main.py
@@ -5,15 +5,16 @@
 from typing import List
 
 from dotenv import load_dotenv
-from flask import Flask, Response, abort, jsonify, request
+from flask import Flask, Response, abort, jsonify, request, send_file, make_response, send_from_directory
 from flask_cors import CORS
 from flask_executor import Executor
 from sqlalchemy import JSON
 
-from ai_ta_backend.canvas import CanvasAPI
 from ai_ta_backend.nomic_logging import get_nomic_map, log_convo_to_nomic
 from ai_ta_backend.vector_database import Ingest
 from ai_ta_backend.web_scrape import WebScrape, mit_course_download
+from ai_ta_backend.canvas import CanvasAPI
+from ai_ta_backend.export_data import export_convo_history_csv
 
 app = Flask(__name__)
 CORS(app)
@@ -209,6 +210,7 @@ def ingest() -> Response:
       str: Success or Failure message. Failure message if any failures. TODO: email on failure.
   """
   s3_paths: List[str] | str = request.args.get('s3_paths', default='')
+  readable_filename: List[str] | str = request.args.get('readable_filename', default='')
   course_name: List[str] | str = request.args.get('course_name', default='')
   print(f"In top of /ingest route. course: {course_name}, s3paths: {s3_paths}")
 
@@ -221,7 +223,10 @@
   )
 
   ingester = Ingest()
-  success_fail_dict = ingester.bulk_ingest(s3_paths, course_name)
+  if readable_filename == '':
+    success_fail_dict = ingester.bulk_ingest(s3_paths, course_name)
+  else:
+    success_fail_dict = ingester.bulk_ingest(s3_paths, course_name, readable_filename=readable_filename)
   print(f"Bottom of /ingest route. success or fail dict: {success_fail_dict}")
   del ingester
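
A hedged client-side sketch of the extended /ingest call; the host, course name, and S3 path below are illustrative assumptions, not values from this PR.

# Sketch only: passes the new optional readable_filename query parameter.
import requests

resp = requests.get(
    "http://localhost:8000/ingest",  # assumes a local dev server on port 8000
    params={
        "course_name": "example-course",
        "s3_paths": "courses/example-course/some_upload.pdf",
        "readable_filename": "Lecture 1 Notes.pdf",
    },
    timeout=120,
)
print(resp.text)  # success/fail dict from bulk_ingest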

@@ -466,6 +471,30 @@ def logToNomic():
   response.headers.add('Access-Control-Allow-Origin', '*')
   return response
 
+@app.route('/export-convo-history-csv', methods=['GET'])
+def export_convo_history():
+  course_name: str = request.args.get('course_name', default='', type=str)
+  from_date: str = request.args.get('from_date', default='', type=str)
+  to_date: str = request.args.get('to_date', default='', type=str)
+
+  if course_name == '':
+    # proper web error "400 Bad request"
+    abort(
+        400,
+        description=
+        f"Missing required parameter: 'course_name' must be provided. Course name: `{course_name}`"
+    )
+
+  export_status = export_convo_history_csv(course_name, from_date, to_date)
+  print("EXPORT FILE LINKS: ", export_status)
+
+  response = make_response(send_from_directory(export_status[2], export_status[1], as_attachment=True))
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  response.headers["Content-Disposition"] = f"attachment; filename={export_status[1]}"
+
+  os.remove(export_status[0])
+  return response
+
 
 if __name__ == '__main__':
   app.run(debug=True, port=int(os.getenv("PORT", default=8000)))
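
Likewise, a hedged sketch for downloading an export from the new route; the localhost URL, course name, and dates are assumptions, and from_date/to_date may be omitted to export the full history.

# Sketch only: saves the CSV attachment served by /export-convo-history-csv.
import requests

resp = requests.get(
    "http://localhost:8000/export-convo-history-csv",  # assumed local server
    params={
        "course_name": "example-course",
        "from_date": "2023-11-01",
        "to_date": "2023-12-01",
    },
    timeout=60,
)
resp.raise_for_status()
with open("convo_history.csv", "wb") as f:
  f.write(resp.content)  # the route deletes its temporary copy after responding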