Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions final_project/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Final Assignment: Lecture Q&A Summarizer

This directory contains a simple prototype system for summarizing a large number of questions collected during a lecture. The goal is to help instructors answer related questions together and reduce their workload while keeping student satisfaction high.

## Overview
1. Questions are clustered by semantic similarity using sentence embeddings.
2. Each cluster is summarized using Google's Gemini API to produce a representative question or summary.
3. These summaries can then be answered by the lecturer in bulk.

The system is designed to handle up to around 1000 questions in a single run.

## Requirements
- Python 3.10 or later
- See `requirements.txt` for required packages

Install dependencies with:
```bash
pip install -r requirements.txt
```

Set your Gemini API key in the environment:
```bash
export GOOGLE_API_KEY="<YOUR_API_KEY>"
```

## Usage
Prepare a text file containing one question per line (see `sample_questions.txt` for an example), then run:
```bash
python summarize.py questions.txt
```
The script outputs summaries for each cluster of related questions. Summaries are generated using Gemini, so an internet connection and a valid API key are required.

## Notes
- This is a minimal prototype. In a production setting you may want a more advanced clustering algorithm and better control over the summarization model.
- Gemini API calls may incur latency or quota limits depending on your account.

## Using Google Sheets
Questions can also be fetched directly from a Google Sheet. Provide a service account credentials JSON, make sure `GOOGLE_API_KEY` is set as described above, and run:
```bash
python sheets_summarize.py SHEET_ID "Sheet1!A:A" path/to/credentials.json
```
This will read the specified column from the sheet, cluster the questions, and output representative topics generated with Gemini.

## Google Apps Script Version
For integration directly within Google Sheets, copy the `apps_script.gs` file into
an Apps Script project bound to your spreadsheet. Set the script property
`GEMINI_API_KEY` with your Gemini API key and reload the sheet. A new **Q&A Tools**
menu will appear with a **Summarize Questions** item.

When invoked, it reads all questions from column B of the first sheet (starting
at row 2, so row 1 is treated as a header), clusters them via Gemini embeddings,
prioritizes clusters by size, and writes a summary with counts to a new sheet
named `QA Summary`. A word frequency chart is also inserted for a quick visual
overview of common terms.
122 changes: 122 additions & 0 deletions final_project/apps_script.gs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// Google Apps Script for summarizing lecture questions with Gemini
// Place this script in the Google Sheets Apps Script editor.

// Base URL of the Gemini REST API (v1beta); embedText() and generateText()
// append the model path and API key to it.
const API_BASE = 'https://generativelanguage.googleapis.com/v1beta';

/**
 * Looks up the Gemini API key stored in the script properties.
 *
 * @return {string} Value of the `GEMINI_API_KEY` script property.
 * @throws {Error} If the property is missing or empty.
 */
function getApiKey() {
  const scriptProps = PropertiesService.getScriptProperties();
  const apiKey = scriptProps.getProperty('GEMINI_API_KEY');
  if (apiKey) {
    return apiKey;
  }
  throw new Error('Set GEMINI_API_KEY in script properties.');
}

/**
 * Registers the "Q&A Tools" menu on the active spreadsheet.
 *
 * The menu has a single "Summarize Questions" entry that invokes
 * summarizeQuestions().
 */
function onOpen() {
  const menuEntries = [
    {name: 'Summarize Questions', functionName: 'summarizeQuestions'},
  ];
  const spreadsheet = SpreadsheetApp.getActiveSpreadsheet();
  spreadsheet.addMenu('Q&A Tools', menuEntries);
}

/**
 * Clusters the questions in column B of the first sheet and writes one
 * Gemini-generated summary per cluster to a fresh "QA Summary" sheet.
 *
 * Steps:
 *   1. Read column B from row 2 downwards (row 1 is treated as a header).
 *   2. Embed every question and group the embeddings with k-means, using
 *      k = ceil(sqrt(number of questions)).
 *   3. Order clusters by size (largest first) and ask Gemini for a
 *      representative question/answer for each one.
 *   4. Append a word-frequency chart to the summary sheet.
 *
 * Returns without touching the spreadsheet when there are no questions.
 */
function summarizeQuestions() {
  const ss = SpreadsheetApp.getActiveSpreadsheet();
  const sheet = ss.getSheets()[0];
  const lastRow = sheet.getLastRow();
  // With only a header row (or an empty sheet) the range below would be asked
  // for fewer than one row, which getRange() rejects.
  if (lastRow < 2) return;

  // Cells may hold numbers or dates; normalise everything to trimmed strings
  // so the later string handling (joining, word splitting) is safe.
  const questions = sheet.getRange(2, 2, lastRow - 1).getValues() // column B
      .map(row => String(row[0]).trim())
      .filter(q => q);
  if (!questions.length) return;

  const embeddings = questions.map(q => embedText(q));
  const k = Math.ceil(Math.sqrt(questions.length));
  const result = kmeans(embeddings, k, 6);

  // Group the original question texts by their assigned cluster label.
  const clusters = {};
  result.labels.forEach((label, i) => {
    (clusters[label] = clusters[label] || []).push(questions[i]);
  });

  // Largest clusters first: they represent the most frequently asked topics.
  const ordered = Object.values(clusters)
      .sort((a, b) => b.length - a.length);

  // Always start from a clean summary sheet.
  let summarySheet = ss.getSheetByName('QA Summary');
  if (summarySheet) ss.deleteSheet(summarySheet);
  summarySheet = ss.insertSheet('QA Summary');
  summarySheet.appendRow(['Priority', 'Representative QA', 'Count']);

  ordered.forEach((qs, idx) => {
    const text = qs.join('\n');
    const prompt = '以下の学生質問をまとめて代表質問と簡潔な回答を日本語で作成してください。\n' + text;
    const summary = generateText(prompt);
    summarySheet.appendRow([idx + 1, summary, qs.length]);
  });

  addWordCloud(summarySheet, questions);
}

/**
 * Requests an embedding vector for a piece of text from the Gemini
 * `embedding-001` model.
 *
 * @param {string} text Text to embed.
 * @return {number[]} The `embedding.values` array from the API response.
 */
function embedText(text) {
  const endpoint = `${API_BASE}/models/embedding-001:embedContent?key=${getApiKey()}`;
  const requestBody = JSON.stringify({content: {parts: [{text: text}]}});
  const options = {
    method: 'post',
    contentType: 'application/json',
    payload: requestBody,
  };
  const response = UrlFetchApp.fetch(endpoint, options);
  const parsed = JSON.parse(response.getContentText());
  return parsed.embedding.values;
}

/**
 * Sends a prompt to the Gemini `gemini-pro` model and returns the generated
 * text of the first candidate.
 *
 * @param {string} prompt Prompt text to send.
 * @return {string} Generated text with surrounding whitespace removed.
 * @throws {Error} If the response contains no candidate text.
 */
function generateText(prompt) {
  const url = `${API_BASE}/models/gemini-pro:generateContent?key=${getApiKey()}`;
  const payload = {contents: [{parts: [{text: prompt}]}]};
  const res = UrlFetchApp.fetch(url, {
    method: 'post',
    contentType: 'application/json',
    payload: JSON.stringify(payload),
  });
  const data = JSON.parse(res.getContentText());

  // A successful HTTP response can still carry no usable candidate (for
  // example when the prompt or the answer was blocked). Fail with a readable
  // message instead of a TypeError from indexing into undefined.
  const candidate = (data.candidates || [])[0];
  const parts = candidate && candidate.content && candidate.content.parts;
  if (!parts || !parts.length || typeof parts[0].text !== 'string') {
    const reason = (candidate && candidate.finishReason) ||
        (data.promptFeedback && data.promptFeedback.blockReason) ||
        'unknown';
    throw new Error('Gemini returned no text (reason: ' + reason + ').');
  }
  return parts[0].text.trim();
}

/**
 * Adds a bar chart of word frequencies across all questions to a sheet.
 *
 * Words are obtained by splitting each question on whitespace, stripping
 * punctuation and symbol characters, and lower-casing the remainder.
 *
 * @param {SpreadsheetApp.Sheet} sheet Sheet that receives the chart.
 * @param {Array} questions Question texts; non-string values are stringified.
 */
function addWordCloud(sheet, questions) {
  // A Map (rather than a plain object) keeps words such as "constructor" or
  // "toString" from colliding with inherited Object.prototype members, which
  // would turn the count into a string.
  const freq = new Map();
  questions.forEach(q => String(q).split(/\s+/).forEach(token => {
    const word = token.replace(/[\p{P}\p{S}]/gu, '').toLowerCase();
    if (word) freq.set(word, (freq.get(word) || 0) + 1);
  }));
  if (!freq.size) return; // nothing to plot

  const data = Charts.newDataTable()
      .addColumn(Charts.ColumnType.STRING, 'Word')
      .addColumn(Charts.ColumnType.NUMBER, 'Count');
  freq.forEach((count, word) => data.addRow([word, count]));

  const chart = Charts.newBarChart()
      .setDataTable(data)
      .setTitle('Word Frequencies')
      .setDimensions(600, 400)
      .build();

  // Sheet.insertChart() is documented to take an EmbeddedChart created via
  // sheet.newChart(); a chart built with the Charts service is rendered to an
  // image blob instead. Column 5 places it to the right of the summary table.
  sheet.insertImage(chart.getBlob(), 5, 1);
}

/**
 * Clusters vectors with Lloyd's k-means using squared Euclidean distance.
 *
 * @param {number[][]} vectors Points to cluster; all must share one length.
 * @param {number} k Requested number of clusters. Clamped to the range
 *     [1, vectors.length].
 * @param {number} iters Maximum number of assign/update iterations.
 * @return {{centroids: number[][], labels: number[]}} Final centroids and,
 *     for every input vector, the index of its nearest centroid. Both arrays
 *     are empty when `vectors` is empty.
 */
function kmeans(vectors, k, iters) {
  const n = vectors.length;
  if (n === 0) return {centroids: [], labels: []};
  const dims = vectors[0].length;
  k = Math.max(1, Math.min(k, n));

  // Pick k distinct input points as initial centroids (partial Fisher-Yates
  // shuffle). Sampling with replacement could pick the same point twice and
  // leave a cluster permanently empty.
  const order = vectors.map((_, i) => i);
  const centroids = [];
  for (let i = 0; i < k; i++) {
    const j = i + Math.floor(Math.random() * (n - i));
    [order[i], order[j]] = [order[j], order[i]];
    centroids.push(vectors[order[i]].slice());
  }

  const labels = new Array(n).fill(0);
  for (let t = 0; t < iters; t++) {
    // Assignment step: attach every point to its nearest centroid.
    let changed = false;
    for (let i = 0; i < n; i++) {
      let best = 0;
      let minD = Infinity;
      for (let j = 0; j < k; j++) {
        const d = distance2(vectors[i], centroids[j]);
        if (d < minD) { minD = d; best = j; }
      }
      if (labels[i] !== best) { labels[i] = best; changed = true; }
    }
    // Once assignments stop changing the centroids cannot move any more. The
    // first pass always runs the update, because the labels start as all
    // zeros while the centroids are still raw sample points.
    if (!changed && t > 0) break;

    // Update step: move each centroid to the mean of its members.
    const sums = Array.from({length: k}, () => Array(dims).fill(0));
    const counts = Array(k).fill(0);
    for (let i = 0; i < n; i++) {
      counts[labels[i]]++;
      for (let d = 0; d < dims; d++) sums[labels[i]][d] += vectors[i][d];
    }
    for (let j = 0; j < k; j++) {
      // An empty cluster keeps its previous centroid.
      if (counts[j]) centroids[j] = sums[j].map(x => x / counts[j]);
    }
  }
  return {centroids, labels};
}

/**
 * Computes the squared Euclidean distance between two vectors.
 *
 * @param {number[]} a First vector; its length determines how many
 *     components are compared.
 * @param {number[]} b Second vector.
 * @return {number} Sum of squared component differences.
 */
function distance2(a, b) {
  return a.reduce((total, component, idx) => {
    const delta = component - b[idx];
    return total + delta * delta;
  }, 0);
}
5 changes: 5 additions & 0 deletions final_project/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
sentence-transformers
scikit-learn
google-generativeai
google-api-python-client
google-auth
6 changes: 6 additions & 0 deletions final_project/sample_questions.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
What is the difference between supervised and unsupervised learning?
How do I choose the right evaluation metric for my model?
Can you explain cross-validation?
What is overfitting and how can we avoid it?
How is unsupervised learning different from supervised?
What are some common evaluation metrics for classification problems?
75 changes: 75 additions & 0 deletions final_project/sheets_summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import argparse
from pathlib import Path
from typing import List, Dict
import os

from google.oauth2 import service_account
from googleapiclient.discovery import build
import google.generativeai as genai
from sklearn.cluster import KMeans
import numpy as np


def fetch_questions(sheet_id: str, range_: str, creds_file: str) -> List[str]:
    """Read the questions stored in a Google Sheet range.

    Args:
        sheet_id: ID of the spreadsheet to read.
        range_: Range to fetch, e.g. ``Sheet1!A:A``.
        creds_file: Path to a service account credentials JSON file.

    Returns:
        The stripped first cell of every returned row, skipping rows that
        are empty or whose first cell is blank.
    """
    credentials = service_account.Credentials.from_service_account_file(
        creds_file,
        scopes=["https://www.googleapis.com/auth/spreadsheets.readonly"],
    )
    sheets_api = build("sheets", "v4", credentials=credentials)
    request = sheets_api.spreadsheets().values().get(
        spreadsheetId=sheet_id, range=range_
    )
    rows = request.execute().get("values", [])

    questions: List[str] = []
    for row in rows:
        if not row:
            continue
        cell = row[0].strip()
        if cell:
            questions.append(cell)
    return questions


def embed_questions(questions: List[str], model: str = "models/embedding-001") -> np.ndarray:
    """Embed every question with the Gemini embedding API.

    Args:
        questions: Question texts to embed.
        model: Name of the embedding model to call.

    Returns:
        An array built from the ``"embedding"`` entry of each response, in
        the same order as ``questions``.
    """
    vectors = []
    for question in questions:
        response = genai.embed_content(model=model, content=question)
        vectors.append(response["embedding"])
    return np.array(vectors)


def cluster_questions(questions: List[str], n_clusters: int) -> Dict[int, List[str]]:
    """Group questions into clusters with KMeans over their embeddings.

    Args:
        questions: Question texts to cluster.
        n_clusters: Requested number of clusters. Clamped to the range
            ``[1, len(questions)]`` because KMeans cannot fit more clusters
            than there are samples.

    Returns:
        Mapping from cluster label (a plain ``int``) to the questions
        assigned to that cluster. Empty when ``questions`` is empty.
    """
    if not questions:
        # Nothing to embed or fit; avoid calling the API and KMeans.
        return {}

    n_clusters = max(1, min(n_clusters, len(questions)))
    embeddings = embed_questions(questions)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
    labels = kmeans.fit_predict(embeddings)

    clusters: Dict[int, List[str]] = {}
    for label, question in zip(labels, questions):
        # fit_predict yields numpy integers; convert so keys match the
        # annotated Dict[int, ...] type.
        clusters.setdefault(int(label), []).append(question)
    return clusters


def summarize_cluster(model, questions: List[str]) -> str:
    """Ask the model for a representative question and answer for a cluster.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.
        questions: Questions belonging to one cluster.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    instruction = "あなたは講義担当教員です。以下の質問をまとめて代表質問を作成し、その回答を日本語で200字以内で出力してください:\n"
    joined_questions = " \n".join(questions)
    response = model.generate_content(instruction + joined_questions)
    return response.text.strip()


def process_sheet(sheet_id: str, range_: str, creds_file: str) -> None:
    """Fetch questions from a sheet, cluster them and print a summary per cluster.

    Args:
        sheet_id: ID of the spreadsheet to read.
        range_: Range containing the questions, e.g. ``Sheet1!A:A``.
        creds_file: Path to a service account credentials JSON file.

    Raises:
        EnvironmentError: If ``GOOGLE_API_KEY`` is not set in the environment.
    """
    gemini_key = os.getenv("GOOGLE_API_KEY")
    if not gemini_key:
        raise EnvironmentError("GOOGLE_API_KEY not set")
    genai.configure(api_key=gemini_key)

    questions = fetch_questions(sheet_id, range_, creds_file)
    if not questions:
        print("No questions found.")
        return

    # Roughly sqrt(N) clusters, but never fewer than one.
    cluster_count = max(1, int(len(questions) ** 0.5))
    grouped = cluster_questions(questions, cluster_count)

    summarizer = genai.GenerativeModel("gemini-pro")
    for label in grouped:
        summary = summarize_cluster(summarizer, grouped[label])
        print(f"\n### Topic {label + 1}\n{summary}")


if __name__ == "__main__":
    # Command-line entry point: parse the three positional arguments and
    # hand them to process_sheet().
    cli = argparse.ArgumentParser(description="Summarize questions from Google Sheets")
    cli.add_argument("sheet_id", help="Spreadsheet ID")
    cli.add_argument("range", help="Range like Sheet1!A:A")
    cli.add_argument("credentials", type=Path, help="Path to service account JSON")
    options = cli.parse_args()
    process_sheet(options.sheet_id, options.range, str(options.credentials))
67 changes: 67 additions & 0 deletions final_project/summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import argparse
import math
from pathlib import Path
from typing import List, Dict

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import os
import google.generativeai as genai


def load_questions(path: Path) -> List[str]:
    """Load questions from a UTF-8 text file with one question per line.

    Args:
        path: File to read.

    Returns:
        Every line with surrounding whitespace stripped, omitting lines that
        are blank after stripping.
    """
    questions: List[str] = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                questions.append(cleaned)
    return questions


def cluster_questions(questions: List[str], model_name: str = "all-MiniLM-L6-v2") -> Dict[int, List[str]]:
    """Cluster questions by KMeans over sentence-transformer embeddings.

    Args:
        questions: Question texts to cluster.
        model_name: Name of the SentenceTransformer model used for encoding.

    Returns:
        Mapping from cluster label to the questions assigned to it. The
        number of clusters is ``max(1, int(sqrt(len(questions))))``.
    """
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(questions)

    cluster_count = max(1, int(math.sqrt(len(questions))))
    estimator = KMeans(n_clusters=cluster_count, random_state=42, n_init="auto")
    assignments = estimator.fit_predict(vectors)

    grouped: Dict[int, List[str]] = {}
    for label, question in zip(assignments, questions):
        if label not in grouped:
            grouped[label] = []
        grouped[label].append(question)
    return grouped


def summarize_clusters(clusters: Dict[int, List[str]]) -> List[str]:
    """Produce one Gemini-generated summary per cluster.

    Args:
        clusters: Mapping from cluster label to its questions.

    Returns:
        Stripped summary texts, one per cluster, in the iteration order of
        ``clusters``.

    Raises:
        EnvironmentError: If ``GOOGLE_API_KEY`` is not set in the environment.
    """
    gemini_key = os.getenv("GOOGLE_API_KEY")
    if not gemini_key:
        raise EnvironmentError("GOOGLE_API_KEY not set")

    genai.configure(api_key=gemini_key)
    summarizer = genai.GenerativeModel("gemini-pro")

    instruction = (
        "Summarize the following questions into one representative question or short summary:\n"
    )
    results: List[str] = []
    for cluster_questions_ in clusters.values():
        prompt = instruction + " ".join(cluster_questions_)
        reply = summarizer.generate_content(prompt)
        results.append(reply.text.strip())
    return results


def main():
    """Command-line entry point: cluster and summarize questions from a file.

    Reads the path given as the single positional argument, prints
    "No questions found." when the file yields no questions, and otherwise
    prints one numbered topic summary per cluster.
    """
    cli = argparse.ArgumentParser(description="Summarize lecture questions")
    cli.add_argument("input", type=Path, help="Text file with one question per line")
    options = cli.parse_args()

    questions = load_questions(options.input)
    if not questions:
        print("No questions found.")
        return

    summaries = summarize_clusters(cluster_questions(questions))

    topic_number = 1
    for summary in summaries:
        print(f"\n### Topic {topic_number}\n{summary}")
        topic_number += 1


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()