Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions final_project/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Final Assignment: Lecture Q&A Summarizer

This directory contains a simple prototype system for summarizing a large number of questions collected during a lecture. The goal is to help instructors answer related questions together and reduce their workload while keeping student satisfaction high.

## Overview
1. Questions are clustered by semantic similarity using sentence embeddings.
2. Each cluster is summarized using Google's Gemini API to produce a representative question or summary.
3. These summaries can then be answered by the lecturer in bulk.

The system is designed to handle up to around 1000 questions in a single run.

## Requirements
- Python 3.10 or later
- See `requirements.txt` for required packages

Install dependencies with:
```bash
pip install -r requirements.txt
```

Set your Gemini API key in the environment:
```bash
export GOOGLE_API_KEY="<YOUR_API_KEY>"
```

## Usage
Prepare a text file containing one question per line (see `sample_questions.txt` for an example), then run:
```bash
python summarize.py questions.txt
```
The script outputs summaries for each cluster of related questions. Summaries are generated using Gemini, so an internet connection and a valid API key are required.

## Notes
- This is a minimal prototype. In a production setting you may want a more advanced clustering algorithm and better control over the summarization model.
- Gemini API calls add network latency and are subject to the rate and quota limits of your account.

## Using Google Sheets
Questions can also be fetched directly from a Google Sheet. With `GOOGLE_API_KEY` set as described above, provide a service account credentials JSON and run:
```bash
python sheets_summarize.py SHEET_ID "Sheet1!A:A" path/to/credentials.json
```
This will read the first column of the specified range, cluster the questions, and print for each cluster a representative question together with a short answer (in Japanese, at most 200 characters) generated with Gemini.
5 changes: 5 additions & 0 deletions final_project/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
sentence-transformers
scikit-learn
google-generativeai
google-api-python-client
google-auth
6 changes: 6 additions & 0 deletions final_project/sample_questions.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
What is the difference between supervised and unsupervised learning?
How do I choose the right evaluation metric for my model?
Can you explain cross-validation?
What is overfitting and how can we avoid it?
How is unsupervised learning different from supervised?
What are some common evaluation metrics for classification problems?
75 changes: 75 additions & 0 deletions final_project/sheets_summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import argparse
from pathlib import Path
from typing import List, Dict
import os

from google.oauth2 import service_account
from googleapiclient.discovery import build
import google.generativeai as genai
from sklearn.cluster import KMeans
import numpy as np


def fetch_questions(sheet_id: str, range_: str, creds_file: str) -> List[str]:
    """Read the non-empty questions from a Google Sheets range.

    Args:
        sheet_id: Spreadsheet ID passed to the Sheets API as ``spreadsheetId``.
        range_: A1-notation range to read, e.g. ``"Sheet1!A:A"``.
        creds_file: Path to a service account credentials JSON file.

    Returns:
        The stripped text of the first cell of every row whose first cell
        is non-blank, in sheet order.
    """
    # Read-only scope: this function never writes to the spreadsheet.
    credentials = service_account.Credentials.from_service_account_file(
        creds_file,
        scopes=["https://www.googleapis.com/auth/spreadsheets.readonly"],
    )
    sheets = build("sheets", "v4", credentials=credentials)
    request = sheets.spreadsheets().values().get(spreadsheetId=sheet_id, range=range_)
    rows = request.execute().get("values", [])

    questions: List[str] = []
    for row in rows:
        if not row:
            # Row with no cells at all; nothing to read.
            continue
        cell = row[0].strip()
        if cell:
            questions.append(cell)
    return questions


def embed_questions(questions: List[str], model: str = "models/embedding-001") -> np.ndarray:
    """Get embeddings for each question using Gemini.

    Args:
        questions: Question texts to embed.
        model: Name of the Gemini embedding model to use.

    Returns:
        An array holding one embedding per question, in input order. An
        empty array is returned for an empty input without calling the API.
    """
    if not questions:
        # Nothing to embed; avoid a pointless API round-trip.
        return np.array([])
    # Passing the whole list lets the client library batch the requests
    # instead of issuing one blocking HTTP call per question.
    response = genai.embed_content(model=model, content=list(questions))
    return np.array(response["embedding"])


def cluster_questions(questions: List[str], n_clusters: int) -> Dict[int, List[str]]:
    """Cluster questions using KMeans over their embeddings.

    Args:
        questions: Question texts to group.
        n_clusters: Requested number of clusters. It is capped at
            ``len(questions)`` because KMeans cannot fit more clusters
            than samples.

    Returns:
        A mapping from cluster label to the questions assigned to it.
        Keys are inserted in order of first appearance in ``questions``.
        An empty input yields an empty mapping.
    """
    if not questions:
        # KMeans cannot be fitted on zero samples.
        return {}
    embeddings = embed_questions(questions)
    effective_clusters = min(n_clusters, len(questions))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42, n_init="auto")
    labels = kmeans.fit_predict(embeddings)
    clusters: Dict[int, List[str]] = {}
    for label, question in zip(labels, questions):
        # fit_predict yields numpy integers; store plain ints so the keys
        # match the declared return type.
        clusters.setdefault(int(label), []).append(question)
    return clusters


def summarize_cluster(model, questions: List[str]) -> str:
    """Ask the model for a representative question and answer for a cluster.

    The prompt (in Japanese) tells the model to act as the lecturer, merge
    the given questions into one representative question, and answer it in
    Japanese within 200 characters.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result
            has a ``text`` attribute.
        questions: The questions belonging to one cluster.

    Returns:
        The model's response text with surrounding whitespace removed.
    """
    instruction = (
        "あなたは講義担当教員です。以下の質問をまとめて代表質問を作成し、その回答を日本語で200字以内で出力してください:\n"
    )
    joined_questions = " \n".join(questions)
    response = model.generate_content(instruction + joined_questions)
    return response.text.strip()


def process_sheet(sheet_id: str, range_: str, creds_file: str) -> None:
    """Fetch questions from a sheet, cluster them, and print a summary per cluster.

    Args:
        sheet_id: Spreadsheet ID to read from.
        range_: A1-notation range containing the questions.
        creds_file: Path to a service account credentials JSON file.

    Raises:
        EnvironmentError: If the ``GOOGLE_API_KEY`` environment variable is
            missing or empty.
    """
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise EnvironmentError("GOOGLE_API_KEY not set")
    genai.configure(api_key=api_key)
    questions = fetch_questions(sheet_id, range_, creds_file)
    if not questions:
        print("No questions found.")
        return
    # Square-root heuristic: roughly sqrt(N) clusters for N questions.
    n_clusters = max(1, int(len(questions) ** 0.5))
    clusters = cluster_questions(questions, n_clusters)
    model = genai.GenerativeModel("gemini-pro")
    # The mapping is ordered by first appearance of each label, so iterate
    # in label order to print "Topic 1", "Topic 2", ... in sequence.
    for label, cluster in sorted(clusters.items(), key=lambda item: item[0]):
        summary = summarize_cluster(model, cluster)
        print(f"\n### Topic {label + 1}\n{summary}")


if __name__ == "__main__":
    # Command-line entry point: three positional arguments identify the
    # sheet, the range to read, and the service account credentials file.
    cli = argparse.ArgumentParser(description="Summarize questions from Google Sheets")
    cli.add_argument("sheet_id", help="Spreadsheet ID")
    cli.add_argument("range", help="Range like Sheet1!A:A")
    cli.add_argument("credentials", type=Path, help="Path to service account JSON")
    options = cli.parse_args()
    # process_sheet expects the credentials path as a plain string.
    credentials_path = str(options.credentials)
    process_sheet(options.sheet_id, options.range, credentials_path)
67 changes: 67 additions & 0 deletions final_project/summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import argparse
import math
from pathlib import Path
from typing import List, Dict

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import os
import google.generativeai as genai


def load_questions(path: Path) -> List[str]:
    """Load questions from a text file containing one question per line.

    Args:
        path: Location of the UTF-8 encoded text file.

    Returns:
        Every non-blank line with surrounding whitespace removed, in file
        order.
    """
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order
    # mark, which str.strip() would otherwise leave glued to the first
    # question.
    with open(path, "r", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        return [question for question in stripped_lines if question]


def cluster_questions(questions: List[str], model_name: str = "all-MiniLM-L6-v2") -> Dict[int, List[str]]:
    """Group questions by semantic similarity.

    Questions are embedded with a SentenceTransformer model and clustered
    with KMeans, using ``max(1, floor(sqrt(len(questions))))`` clusters.

    Args:
        questions: Question texts to group.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        A mapping from cluster label to the questions assigned to it.
        Keys are inserted in order of first appearance in ``questions``.
        An empty input yields an empty mapping.
    """
    if not questions:
        # Avoid loading the embedding model only to have KMeans fail on
        # zero samples.
        return {}
    model = SentenceTransformer(model_name)
    embeddings = model.encode(questions)
    n_clusters = max(1, int(math.sqrt(len(questions))))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
    labels = kmeans.fit_predict(embeddings)
    clusters: Dict[int, List[str]] = {}
    for label, question in zip(labels, questions):
        # fit_predict yields numpy integers; store plain ints so the keys
        # match the declared return type.
        clusters.setdefault(int(label), []).append(question)
    return clusters


def summarize_clusters(clusters: Dict[int, List[str]]) -> List[str]:
    """Produce one Gemini-generated summary per cluster.

    Args:
        clusters: Mapping from cluster label to the questions in it.

    Returns:
        The stripped response text for each cluster, in the iteration
        order of ``clusters``.

    Raises:
        EnvironmentError: If the ``GOOGLE_API_KEY`` environment variable is
            missing or empty.
    """
    key = os.environ.get("GOOGLE_API_KEY")
    if not key:
        raise EnvironmentError("GOOGLE_API_KEY not set")

    genai.configure(api_key=key)
    gemini = genai.GenerativeModel("gemini-pro")

    instruction = (
        "Summarize the following questions into one representative question or short summary:\n"
    )

    def _summarize(group: List[str]) -> str:
        # The questions of a cluster are sent as one space-separated string.
        reply = gemini.generate_content(instruction + " ".join(group))
        return reply.text.strip()

    return [_summarize(group) for group in clusters.values()]


def main():
    """Command-line entry point: cluster a question file and print summaries.

    Reads the file given as the single positional argument, clusters its
    questions, and prints one numbered "Topic" section per summary. Prints
    a notice and returns when the file holds no questions.
    """
    cli = argparse.ArgumentParser(description="Summarize lecture questions")
    cli.add_argument("input", type=Path, help="Text file with one question per line")
    input_path = cli.parse_args().input

    questions = load_questions(input_path)
    if questions:
        topic_summaries = summarize_clusters(cluster_questions(questions))
        for number, text in enumerate(topic_summaries, start=1):
            print(f"\n### Topic {number}\n{text}")
    else:
        print("No questions found.")


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()