Shohei965 · Shohei965 · Jun 19, 2025 · Jun 19, 2025
diff --git a/final_project/README.md b/final_project/README.md
@@ -0,0 +1,42 @@
+# Final Assignment: Lecture Q&A Summarizer
+
+This directory contains a simple prototype system for summarizing a large number of questions collected during a lecture. The goal is to help instructors answer related questions together and reduce their workload while keeping student satisfaction high.
+
+## Overview
+1. Questions are clustered by semantic similarity using sentence embeddings.
+2. Each cluster is summarized using Google's Gemini API to produce a representative question or summary.
+3. These summaries can then be answered by the lecturer in bulk.
+
+The system is designed to handle up to around 1000 questions in a single run.
+
+## Requirements
+- Python 3.10 or later
+- See `requirements.txt` for required packages
+
+Install dependencies with:
+```bash
+pip install -r requirements.txt
+```
+
+Set your Gemini API key in the environment:
+```bash
+export GOOGLE_API_KEY="<YOUR_API_KEY>"
+```
+
+## Usage
+Prepare a text file containing one question per line (see `sample_questions.txt` for an example), then run:
+```bash
+python summarize.py questions.txt
+```
+The script outputs summaries for each cluster of related questions. Summaries are generated using Gemini, so an internet connection and a valid API key are required.
+
+## Notes
+- This is a minimal prototype. In a production setting you may want a more advanced clustering algorithm and better control over the summarization model.
+- Gemini API calls may incur latency or quota limits depending on your account.
+
+## Using Google Sheets
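+Questions can also be fetched directly from a Google Sheet. With `GOOGLE_API_KEY` set as described above, provide the path to a service account credentials JSON file (the service account needs read access to the sheet) and run: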
+```bash
+python sheets_summarize.py SHEET_ID "Sheet1!A:A" path/to/credentials.json
+```
+This reads the first column of the specified range, clusters the questions using Gemini embeddings, and prints, for each cluster, a representative question together with a short answer in Japanese generated with Gemini.
diff --git a/final_project/requirements.txt b/final_project/requirements.txt
@@ -0,0 +1,5 @@
+sentence-transformers
+scikit-learn
+google-generativeai
+google-api-python-client
+google-auth
diff --git a/final_project/sample_questions.txt b/final_project/sample_questions.txt
@@ -0,0 +1,6 @@
+What is the difference between supervised and unsupervised learning?
+How do I choose the right evaluation metric for my model?
+Can you explain cross-validation?
+What is overfitting and how can we avoid it?
+How is unsupervised learning different from supervised?
+What are some common evaluation metrics for classification problems?
diff --git a/final_project/sheets_summarize.py b/final_project/sheets_summarize.py
@@ -0,0 +1,75 @@
+import argparse
+from pathlib import Path
+from typing import List, Dict
+import os
+
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+import google.generativeai as genai
+from sklearn.cluster import KMeans
+import numpy as np
+
+
+def fetch_questions(sheet_id: str, range_: str, creds_file: str) -> List[str]:
+    """Read the first cell of every row in a Google Sheet range.
+
+    Args:
+        sheet_id: ID of the spreadsheet to read.
+        range_: Range to fetch, e.g. ``Sheet1!A:A``.
+        creds_file: Path to a service account credentials JSON file.
+
+    Returns:
+        The stripped first-cell values, skipping empty rows and cells that
+        are blank after stripping.
+    """
+    # Read-only scope: the script never writes back to the sheet.
+    credentials = service_account.Credentials.from_service_account_file(
+        creds_file,
+        scopes=["https://www.googleapis.com/auth/spreadsheets.readonly"],
+    )
+    sheets = build("sheets", "v4", credentials=credentials)
+    request = sheets.spreadsheets().values().get(spreadsheetId=sheet_id, range=range_)
+    rows = request.execute().get("values", [])
+
+    questions: List[str] = []
+    for row in rows:
+        if not row:
+            continue
+        cell = row[0].strip()
+        if cell:
+            questions.append(cell)
+    return questions
+
+
+def embed_questions(questions: List[str], model: str = "models/embedding-001") -> np.ndarray:
+    """Get embeddings for each question using Gemini.
+
+    Args:
+        questions: Question texts to embed.
+        model: Name of the Gemini embedding model to use.
+
+    Returns:
+        An array with one embedding row per question, in input order. An
+        empty input yields an empty array without calling the API.
+    """
+    if not questions:
+        return np.array([])
+    # Passing the whole list lets the client send batched requests instead of
+    # one HTTP round trip per question; the result holds one embedding per
+    # input string under the "embedding" key.
+    response = genai.embed_content(model=model, content=list(questions))
+    return np.array(response["embedding"])
+
+
+def cluster_questions(questions: List[str], n_clusters: int) -> Dict[int, List[str]]:
+    """Cluster questions using KMeans on their Gemini embeddings.
+
+    Args:
+        questions: Question texts to group.
+        n_clusters: Requested number of clusters. Values outside
+            ``1..len(questions)`` are clamped into that range.
+
+    Returns:
+        A mapping from cluster label to the questions assigned to it, each
+        list in input order. Empty input yields an empty mapping.
+    """
+    if not questions:
+        return {}
+    # KMeans rejects a cluster count below one or above the sample count.
+    n_clusters = max(1, min(n_clusters, len(questions)))
+    embeddings = embed_questions(questions)
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
+    labels = kmeans.fit_predict(embeddings)
+    clusters: Dict[int, List[str]] = {}
+    for label, question in zip(labels, questions):
+        # fit_predict yields NumPy integers; convert so the keys are plain
+        # ints as the return annotation promises.
+        clusters.setdefault(int(label), []).append(question)
+    return clusters
+
+
+def summarize_cluster(model, questions: List[str]) -> str:
+    """Ask the model for a representative question and answer for one cluster.
+
+    The prompt (in Japanese) casts the model as the lecturer and asks it to
+    merge the questions into a representative question and answer it in
+    Japanese within 200 characters.
+
+    Args:
+        model: Object exposing ``generate_content(prompt)`` whose result has
+            a ``text`` attribute.
+        questions: Questions belonging to a single cluster.
+
+    Returns:
+        The model's reply text with surrounding whitespace removed.
+    """
+    instruction = "あなたは講義担当教員です。以下の質問をまとめて代表質問を作成し、その回答を日本語で200字以内で出力してください:\n"
+    joined = " \n".join(questions)
+    reply = model.generate_content(instruction + joined)
+    return reply.text.strip()
+
+
+def process_sheet(sheet_id: str, range_: str, creds_file: str) -> None:
+    """Fetch questions from a sheet, cluster them, and print one summary per cluster.
+
+    Args:
+        sheet_id: ID of the spreadsheet to read.
+        range_: Range to fetch, e.g. ``Sheet1!A:A``.
+        creds_file: Path to a service account credentials JSON file.
+
+    Raises:
+        EnvironmentError: If ``GOOGLE_API_KEY`` is not set.
+    """
+    # Check the key first so a missing key fails before any sheet access.
+    api_key = os.getenv("GOOGLE_API_KEY")
+    if not api_key:
+        raise EnvironmentError("GOOGLE_API_KEY not set")
+    genai.configure(api_key=api_key)
+
+    questions = fetch_questions(sheet_id, range_, creds_file)
+    if not questions:
+        print("No questions found.")
+        return
+
+    # Square-root heuristic for the cluster count, with at least one cluster.
+    n_clusters = max(1, int(len(questions) ** 0.5))
+    clusters = cluster_questions(questions, n_clusters)
+    model = genai.GenerativeModel("gemini-pro")
+    # The mapping is keyed in first-seen order; walk the labels in sorted
+    # order so the printed topic numbers ascend instead of appearing shuffled.
+    for label in sorted(clusters):
+        summary = summarize_cluster(model, clusters[label])
+        print(f"\n### Topic {label + 1}\n{summary}")
+
+
+def main() -> None:
+    """Parse the command-line arguments and summarize the given sheet range."""
+    parser = argparse.ArgumentParser(description="Summarize questions from Google Sheets")
+    parser.add_argument("sheet_id", help="Spreadsheet ID")
+    parser.add_argument("range", help="Range like Sheet1!A:A")
+    parser.add_argument("credentials", type=Path, help="Path to service account JSON")
+    args = parser.parse_args()
+    process_sheet(args.sheet_id, args.range, str(args.credentials))
+
+
+# Keep the entry point in a function so importing this module has no side
+# effects and no parser/args globals are created.
+if __name__ == "__main__":
+    main()
diff --git a/final_project/summarize.py b/final_project/summarize.py
@@ -0,0 +1,67 @@
+import argparse
+import math
+from pathlib import Path
+from typing import List, Dict
+
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import KMeans
+import os
+import google.generativeai as genai
+
+
+def load_questions(path: Path) -> List[str]:
+    """Load questions from a text file containing one question per line.
+
+    Args:
+        path: Path to the input text file.
+
+    Returns:
+        Each line with surrounding whitespace removed; lines that are empty
+        after stripping are skipped.
+    """
+    # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading BOM,
+    # which strip() does not remove and which would otherwise end up inside
+    # the first question.
+    with open(path, "r", encoding="utf-8-sig") as f:
+        stripped_lines = (line.strip() for line in f)
+        return [line for line in stripped_lines if line]
+
+
+def cluster_questions(questions: List[str], model_name: str = "all-MiniLM-L6-v2") -> Dict[int, List[str]]:
+    """Group questions by KMeans clustering of their sentence embeddings.
+
+    The number of clusters is the integer square root of the number of
+    questions, with a minimum of one.
+
+    Args:
+        questions: Question texts to group.
+        model_name: Name of the SentenceTransformer model used for encoding.
+
+    Returns:
+        A mapping from cluster label to the questions assigned to it, each
+        list in input order. Empty input yields an empty mapping.
+    """
+    # Nothing to cluster: return early rather than loading the embedding
+    # model and then failing inside KMeans on zero samples.
+    if not questions:
+        return {}
+    model = SentenceTransformer(model_name)
+    embeddings = model.encode(questions)
+    n_clusters = max(1, int(math.sqrt(len(questions))))
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
+    labels = kmeans.fit_predict(embeddings)
+    clusters: Dict[int, List[str]] = {}
+    for label, question in zip(labels, questions):
+        # fit_predict yields NumPy integers; convert so the keys are plain
+        # ints as the return annotation promises.
+        clusters.setdefault(int(label), []).append(question)
+    return clusters
+
+
+def summarize_clusters(clusters: Dict[int, List[str]]) -> List[str]:
+    """Produce one Gemini-generated summary per cluster.
+
+    Args:
+        clusters: Mapping from cluster label to the questions in that cluster.
+
+    Returns:
+        One stripped summary string per cluster, in the mapping's iteration
+        order.
+
+    Raises:
+        EnvironmentError: If ``GOOGLE_API_KEY`` is not set.
+    """
+    key = os.getenv("GOOGLE_API_KEY")
+    if not key:
+        raise EnvironmentError("GOOGLE_API_KEY not set")
+
+    genai.configure(api_key=key)
+    gemini = genai.GenerativeModel("gemini-pro")
+
+    instruction = "Summarize the following questions into one representative question or short summary:\n"
+
+    def summarize(group: List[str]) -> str:
+        # The questions of one cluster are sent as a single space-joined string.
+        reply = gemini.generate_content(instruction + " ".join(group))
+        return reply.text.strip()
+
+    return [summarize(group) for group in clusters.values()]
+
+
+def main():
+    """Command-line entry point: load, cluster, and summarize the questions."""
+    cli = argparse.ArgumentParser(description="Summarize lecture questions")
+    cli.add_argument("input", type=Path, help="Text file with one question per line")
+    input_path = cli.parse_args().input
+
+    questions = load_questions(input_path)
+    if questions:
+        topics = summarize_clusters(cluster_questions(questions))
+        # Topics are numbered from 1 in the order the summaries were produced.
+        for number, topic in enumerate(topics, start=1):
+            print(f"\n### Topic {number}\n{topic}")
+    else:
+        print("No questions found.")
+
+
+if __name__ == "__main__":
+    main()