diff --git a/final_project/README.md b/final_project/README.md new file mode 100644 index 000000000..038644454 --- /dev/null +++ b/final_project/README.md @@ -0,0 +1,42 @@ +# Final Assignment: Lecture Q&A Summarizer + +This directory contains a simple prototype system for summarizing a large number of questions collected during a lecture. The goal is to help instructors answer related questions together and reduce their workload while keeping student satisfaction high. + +## Overview +1. Questions are clustered by semantic similarity using sentence embeddings. +2. Each cluster is summarized using Google's Gemini API to produce a representative question or summary. +3. These summaries can then be answered by the lecturer in bulk. + +The system is designed to handle up to around 1000 questions in a single run. + +## Requirements +- Python 3.10 or later +- See `requirements.txt` for required packages + +Install dependencies with: +```bash +pip install -r requirements.txt +``` + +Set your Gemini API key in the environment: +```bash +export GOOGLE_API_KEY="YOUR_API_KEY" +``` + +## Usage +Prepare a text file containing one question per line (see `sample_questions.txt` for an example), then run: +```bash +python summarize.py questions.txt +``` +The script outputs summaries for each cluster of related questions. Summaries are generated using Gemini, so an internet connection and a valid API key are required. + +## Notes +- This is a minimal prototype. In a production setting you may want a more advanced clustering algorithm and better control over the summarization model. +- Gemini API calls may be subject to latency and quota limits depending on your account. + +## Using Google Sheets +Questions can also be fetched directly from a Google Sheet. 
Provide a service account credentials JSON and run: +```bash +python sheets_summarize.py SHEET_ID "Sheet1!A:A" path/to/credentials.json +``` +This will read the specified column from the sheet, cluster the questions, and output representative topics generated with Gemini. diff --git a/final_project/requirements.txt b/final_project/requirements.txt new file mode 100644 index 000000000..156e463e8 --- /dev/null +++ b/final_project/requirements.txt @@ -0,0 +1,5 @@ +sentence-transformers +scikit-learn +google-generativeai +google-api-python-client +google-auth diff --git a/final_project/sample_questions.txt b/final_project/sample_questions.txt new file mode 100644 index 000000000..047676c98 --- /dev/null +++ b/final_project/sample_questions.txt @@ -0,0 +1,6 @@ +What is the difference between supervised and unsupervised learning? +How do I choose the right evaluation metric for my model? +Can you explain cross-validation? +What is overfitting and how can we avoid it? +How is unsupervised learning different from supervised? +What are some common evaluation metrics for classification problems? 
diff --git a/final_project/sheets_summarize.py b/final_project/sheets_summarize.py
new file mode 100644
index 000000000..85b37ea92
--- /dev/null
+++ b/final_project/sheets_summarize.py
@@ -0,0 +1,125 @@
+"""Cluster lecture questions from a Google Sheet and summarize each cluster with Gemini."""
+
+import argparse
+import os
+from pathlib import Path
+from typing import Dict, List
+
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+import google.generativeai as genai
+from sklearn.cluster import KMeans
+import numpy as np
+
+
+def fetch_questions(sheet_id: str, range_: str, creds_file: str) -> List[str]:
+    """Fetch questions from a Google Sheet range.
+
+    Args:
+        sheet_id: Spreadsheet ID.
+        range_: A1-notation range such as ``Sheet1!A:A``.
+        creds_file: Path to a service account credentials JSON file.
+
+    Returns:
+        The stripped, non-empty first cell of each returned row.
+    """
+    scopes = ["https://www.googleapis.com/auth/spreadsheets.readonly"]
+    creds = service_account.Credentials.from_service_account_file(creds_file, scopes=scopes)
+    service = build("sheets", "v4", credentials=creds)
+    resp = service.spreadsheets().values().get(spreadsheetId=sheet_id, range=range_).execute()
+    values = resp.get("values", [])
+    # Only the first column is read; empty rows and blank cells are dropped.
+    first_cells = (row[0].strip() for row in values if row)
+    return [cell for cell in first_cells if cell]
+
+
+def embed_questions(questions: List[str], model: str = "models/embedding-001") -> np.ndarray:
+    """Get embeddings for each question using Gemini.
+
+    One ``genai.embed_content`` request is issued per question.
+    """
+    return np.array([
+        genai.embed_content(model=model, content=q)["embedding"]
+        for q in questions
+    ])
+
+
+def cluster_questions(questions: List[str], n_clusters: int) -> Dict[int, List[str]]:
+    """Cluster questions using KMeans on their Gemini embeddings.
+
+    Args:
+        questions: Questions to group.
+        n_clusters: Requested number of clusters, clamped to the range
+            1..len(questions) since KMeans rejects more clusters than samples.
+
+    Returns:
+        Mapping from cluster label to the questions assigned to it; empty
+        when ``questions`` is empty.
+    """
+    if not questions:
+        return {}
+    n_clusters = max(1, min(n_clusters, len(questions)))
+    embeddings = embed_questions(questions)
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
+    labels = kmeans.fit_predict(embeddings)
+    clusters: Dict[int, List[str]] = {}
+    for label, question in zip(labels, questions):
+        # fit_predict yields NumPy integers; store plain ints as annotated.
+        clusters.setdefault(int(label), []).append(question)
+    return clusters
+
+
+def summarize_cluster(model, questions: List[str]) -> str:
+    """Ask the model for a representative question and a short answer.
+
+    Args:
+        model: Object exposing ``generate_content(prompt)``.
+        questions: Questions belonging to one cluster.
+
+    Returns:
+        The stripped response text, or a placeholder string when the
+        response carries no text.
+    """
+    text = " \n".join(questions)
+    prompt = (
+        "あなたは講義担当教員です。以下の質問をまとめて代表質問を作成し、その回答を日本語で200字以内で出力してください:\n" + text
+    )
+    resp = model.generate_content(prompt)
+    try:
+        return resp.text.strip()
+    except ValueError:
+        # The ``text`` accessor raises ValueError when the response has no
+        # text part (e.g. a blocked prompt); do not abort the remaining topics.
+        return "(no summary generated)"
+
+
+def process_sheet(sheet_id: str, range_: str, creds_file: str) -> None:
+    """Fetch questions from a sheet, cluster them, and print one summary per cluster.
+
+    Raises:
+        EnvironmentError: If ``GOOGLE_API_KEY`` is not set.
+    """
+    api_key = os.getenv("GOOGLE_API_KEY")
+    if not api_key:
+        raise EnvironmentError("GOOGLE_API_KEY not set")
+    genai.configure(api_key=api_key)
+    questions = fetch_questions(sheet_id, range_, creds_file)
+    if not questions:
+        print("No questions found.")
+        return
+    n_clusters = max(1, int(len(questions) ** 0.5))
+    clusters = cluster_questions(questions, n_clusters)
+    model = genai.GenerativeModel("gemini-pro")
+    # Labels are keyed in first-seen order; sort so topics print as 1, 2, 3, ...
+    for topic_no, label in enumerate(sorted(clusters), start=1):
+        summary = summarize_cluster(model, clusters[label])
+        print(f"\n### Topic {topic_no}\n{summary}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Summarize questions from Google Sheets")
+    parser.add_argument("sheet_id", help="Spreadsheet ID")
+    parser.add_argument("range", help="Range like Sheet1!A:A")
+    parser.add_argument("credentials", type=Path, help="Path to service account JSON")
+    args = parser.parse_args()
+    process_sheet(args.sheet_id, args.range, str(args.credentials))
diff --git a/final_project/summarize.py b/final_project/summarize.py
new file mode 100644
index 000000000..fc0036bda
--- /dev/null
+++ b/final_project/summarize.py
@@ -0,0 +1,105 @@
+"""Cluster lecture questions from a text file and summarize each cluster with Gemini."""
+
+import argparse
+import math
+import os
+from pathlib import Path
+from typing import Dict, List
+
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import KMeans
+import google.generativeai as genai
+
+
+def load_questions(path: Path) -> List[str]:
+    """Read one question per line from ``path``, skipping blank lines."""
+    with open(path, "r", encoding="utf-8") as f:
+        return [line.strip() for line in f if line.strip()]
+
+
+def cluster_questions(questions: List[str], model_name: str = "all-MiniLM-L6-v2") -> Dict[int, List[str]]:
+    """Cluster questions with KMeans on sentence-transformer embeddings.
+
+    The number of clusters is the integer square root of the question count
+    (at least 1).
+
+    Args:
+        questions: Questions to group.
+        model_name: SentenceTransformer model used for the embeddings.
+
+    Returns:
+        Mapping from cluster label to the questions assigned to it; empty
+        when ``questions`` is empty.
+    """
+    if not questions:
+        # Nothing to embed; skip loading the model and fitting KMeans.
+        return {}
+    model = SentenceTransformer(model_name)
+    embeddings = model.encode(questions)
+    n_clusters = max(1, math.isqrt(len(questions)))
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
+    labels = kmeans.fit_predict(embeddings)
+    clusters: Dict[int, List[str]] = {}
+    for label, question in zip(labels, questions):
+        # fit_predict yields NumPy integers; store plain ints as annotated.
+        clusters.setdefault(int(label), []).append(question)
+    return clusters
+
+
+def summarize_clusters(clusters: Dict[int, List[str]]) -> List[str]:
+    """Summarize every cluster into one representative question or summary.
+
+    Args:
+        clusters: Mapping from cluster label to its questions.
+
+    Returns:
+        One summary per cluster, in the mapping's iteration order.
+
+    Raises:
+        EnvironmentError: If ``GOOGLE_API_KEY`` is not set.
+    """
+    api_key = os.getenv("GOOGLE_API_KEY")
+    if not api_key:
+        raise EnvironmentError("GOOGLE_API_KEY not set")
+
+    genai.configure(api_key=api_key)
+    model = genai.GenerativeModel("gemini-pro")
+
+    summaries = []
+    for questions in clusters.values():
+        # One question per line keeps question boundaries visible in the prompt.
+        text = "\n".join(questions)
+        prompt = (
+            "Summarize the following questions into one representative question or short summary:\n"
+            + text
+        )
+        response = model.generate_content(prompt)
+        try:
+            summaries.append(response.text.strip())
+        except ValueError:
+            # The ``text`` accessor raises ValueError when the response has no
+            # text part (e.g. a blocked prompt); keep one entry per cluster.
+            summaries.append("(no summary generated)")
+    return summaries
+
+
+def main():
+    """Parse the command line, then cluster and summarize the questions."""
+    parser = argparse.ArgumentParser(description="Summarize lecture questions")
+    parser.add_argument("input", type=Path, help="Text file with one question per line")
+    args = parser.parse_args()
+
+    questions = load_questions(args.input)
+    if not questions:
+        print("No questions found.")
+        return
+
+    clusters = cluster_questions(questions)
+    summaries = summarize_clusters(clusters)
+
+    for i, summary in enumerate(summaries, 1):
+        print(f"\n### Topic {i}\n{summary}")
+
+
+if __name__ == "__main__":
+    main()