From 28f62f03c464c110464574cb10d3ef963769826a Mon Sep 17 00:00:00 2001
From: Shohei965 <shohei.yamaguchi@hdydmedia.com>
Date: Thu, 19 Jun 2025 20:36:13 +0900
Subject: [PATCH] Add Apps Script for question clustering

---
 final_project/README.md            |  53 +++++++++++++
 final_project/apps_script.gs       | 122 +++++++++++++++++++++++++++++
 final_project/requirements.txt     |   5 ++
 final_project/sample_questions.txt |   6 ++
 final_project/sheets_summarize.py  |  75 ++++++++++++++++++
 final_project/summarize.py         |  67 ++++++++++++++++
 6 files changed, 328 insertions(+)
 create mode 100644 final_project/README.md
 create mode 100644 final_project/apps_script.gs
 create mode 100644 final_project/requirements.txt
 create mode 100644 final_project/sample_questions.txt
 create mode 100644 final_project/sheets_summarize.py
 create mode 100644 final_project/summarize.py

diff --git a/final_project/README.md b/final_project/README.md
new file mode 100644
index 000000000..3b3bcef4f
--- /dev/null
+++ b/final_project/README.md
@@ -0,0 +1,53 @@
+# Final Assignment: Lecture Q&A Summarizer
+
+This directory contains a simple prototype system for summarizing a large number of questions collected during a lecture. The goal is to help instructors answer related questions together and reduce their workload while keeping student satisfaction high.
+
+## Overview
+1. Questions are clustered by semantic similarity using sentence embeddings.
+2. Each cluster is summarized using Google's Gemini API to produce a representative question or summary.
+3. These summaries can then be answered by the lecturer in bulk.
+
+The system is designed to handle up to around 1000 questions in a single run.
+
+## Requirements
+- Python 3.10 or later
+- See `requirements.txt` for required packages
+
+Install dependencies with:
+```bash
+pip install -r requirements.txt
+```
+
+Set your Gemini API key in the environment:
+```bash
+export GOOGLE_API_KEY="<YOUR_API_KEY>"
+```
+
+## Usage
+Prepare a text file containing one question per line (see `sample_questions.txt` for an example), then run:
+```bash
+python summarize.py questions.txt
+```
+The script outputs summaries for each cluster of related questions. Summaries are generated using Gemini, so an internet connection and a valid API key are required.
+
+## Notes
+- This is a minimal prototype. In a production setting you may want a more advanced clustering algorithm and better control over the summarization model.
+- Gemini API calls may incur latency or quota limits depending on your account.
+
+## Using Google Sheets
+Questions can also be fetched directly from a Google Sheet. Make sure `GOOGLE_API_KEY` is set as described above, provide a service account credentials JSON, and run:
+```bash
+python sheets_summarize.py SHEET_ID "Sheet1!A:A" path/to/credentials.json
+```
+This will read the specified column from the sheet, cluster the questions, and output representative topics generated with Gemini.
+
+## Google Apps Script Version
+For integration directly within Google Sheets, copy the `apps_script.gs` file into
+an Apps Script project bound to your spreadsheet. Set the script property
+`GEMINI_API_KEY` with your Gemini API key and reload the sheet. A new **Q&A Tools**
+menu will appear with a **Summarize Questions** item.
+
+When invoked, it reads the questions from column B of the first sheet (row 1 is
+treated as a header), clusters them via Gemini embeddings, prioritizes clusters by
+size, and writes a summary with counts to a new sheet named `QA Summary` (replacing
+any existing one). A word frequency chart is also inserted for a quick visual overview of common terms.
diff --git a/final_project/apps_script.gs b/final_project/apps_script.gs
new file mode 100644
index 000000000..aba378d05
--- /dev/null
+++ b/final_project/apps_script.gs
@@ -0,0 +1,233 @@
+// Google Apps Script for summarizing lecture questions with Gemini
+// Place this script in the Google Sheets Apps Script editor.
+
+const API_BASE = 'https://generativelanguage.googleapis.com/v1beta';
+const SUMMARY_SHEET_NAME = 'QA Summary';
+
+/**
+ * Returns the Gemini API key stored in the GEMINI_API_KEY script property.
+ * @return {string} The configured API key.
+ * @throws {Error} If the property is unset or empty.
+ */
+function getApiKey() {
+  const key = PropertiesService.getScriptProperties().getProperty('GEMINI_API_KEY');
+  if (!key) throw new Error('Set GEMINI_API_KEY in script properties.');
+  return key;
+}
+
+/** Adds the "Q&A Tools" menu with its "Summarize Questions" item to the spreadsheet. */
+function onOpen() {
+  SpreadsheetApp.getActiveSpreadsheet()
+    .addMenu('Q&A Tools', [{name: 'Summarize Questions', functionName: 'summarizeQuestions'}]);
+}
+
+/**
+ * Clusters the questions in column B of the first sheet and writes one Gemini
+ * summary per cluster, largest cluster first, to the "QA Summary" sheet.
+ * Does nothing when the sheet holds no questions below the header row.
+ */
+function summarizeQuestions() {
+  const ss = SpreadsheetApp.getActiveSpreadsheet();
+  const sheet = ss.getSheets()[0];
+  const lastRow = sheet.getLastRow();
+  // Row 1 is the header. With no data rows the range below would have a row
+  // count below 1, which getRange rejects.
+  if (lastRow < 2) return;
+  const values = sheet.getRange(2, 2, lastRow - 1).getValues().flat(); // column B
+  // Cells may hold numbers or dates; convert to strings because the word
+  // counter calls split() on every question.
+  const questions = values.map(v => String(v).trim()).filter(q => q);
+  if (!questions.length) return;
+
+  const embeddings = questions.map(q => embedText(q));
+  const k = Math.ceil(Math.sqrt(questions.length));
+  const result = kmeans(embeddings, k, 6);
+  const clusters = {};
+  result.labels.forEach((label, i) => {
+    (clusters[label] = clusters[label] || []).push(questions[i]);
+  });
+
+  const ordered = Object.values(clusters).sort((a, b) => b.length - a.length);
+
+  // Generate every summary before touching the old summary sheet so that a
+  // failed API call does not leave the spreadsheet without any summary.
+  const rows = ordered.map((qs, idx) => {
+    const prompt = '以下の学生質問をまとめて代表質問と簡潔な回答を日本語で作成してください。\n' + qs.join('\n');
+    return [idx + 1, generateText(prompt), qs.length];
+  });
+
+  let summarySheet = ss.getSheetByName(SUMMARY_SHEET_NAME);
+  if (summarySheet) ss.deleteSheet(summarySheet);
+  summarySheet = ss.insertSheet(SUMMARY_SHEET_NAME);
+  const table = [['Priority', 'Representative QA', 'Count']].concat(rows);
+  // One batched write instead of one appendRow call per cluster.
+  summarySheet.getRange(1, 1, table.length, 3).setValues(table);
+
+  addWordCloud(summarySheet, questions);
+}
+
+/**
+ * POSTs a JSON payload to a Gemini endpoint and returns the parsed response.
+ * The API key travels in the x-goog-api-key header, which keeps it out of the
+ * request URL.
+ * @param {string} path Endpoint path relative to API_BASE.
+ * @param {Object} payload Request body, serialized with JSON.stringify.
+ * @return {Object} The parsed JSON response body.
+ * @throws {Error} If the response status is not 2xx.
+ */
+function callGemini(path, payload) {
+  const res = UrlFetchApp.fetch(`${API_BASE}/${path}`, {
+    method: 'post',
+    contentType: 'application/json',
+    headers: {'x-goog-api-key': getApiKey()},
+    payload: JSON.stringify(payload),
+    muteHttpExceptions: true, // inspect the status ourselves for a clearer error
+  });
+  const status = res.getResponseCode();
+  if (status < 200 || status >= 300) {
+    throw new Error(`Gemini request ${path} failed with HTTP ${status}: ${res.getContentText()}`);
+  }
+  return JSON.parse(res.getContentText());
+}
+
+/**
+ * Requests the embedding vector of a text from the embedding-001 model.
+ * @param {string} text Text to embed.
+ * @return {number[]} The `embedding.values` array of the response.
+ * @throws {Error} If the request fails or the response has no embedding.
+ */
+function embedText(text) {
+  const data = callGemini('models/embedding-001:embedContent', {content: {parts: [{text}]}});
+  if (!data.embedding || !Array.isArray(data.embedding.values)) {
+    throw new Error('Gemini embedding response has no embedding.values.');
+  }
+  return data.embedding.values;
+}
+
+/**
+ * Sends a prompt to the gemini-pro model and returns the generated text.
+ * @param {string} prompt Prompt text.
+ * @return {string} Concatenated text parts of the first candidate, trimmed.
+ * @throws {Error} If the request fails or the response carries no text, for
+ *     example because the response has no candidates.
+ */
+function generateText(prompt) {
+  const data = callGemini('models/gemini-pro:generateContent', {contents: [{parts: [{text: prompt}]}]});
+  const candidate = data.candidates && data.candidates[0];
+  const parts = candidate && candidate.content && candidate.content.parts;
+  if (!parts || !parts.length) {
+    const reason = (data.promptFeedback && data.promptFeedback.blockReason) ||
+      (candidate && candidate.finishReason) || 'unknown reason';
+    throw new Error(`Gemini returned no text (${reason}).`);
+  }
+  return parts.map(p => p.text || '').join('').trim();
+}
+
+/**
+ * Counts whitespace-separated words across all questions and inserts a bar
+ * chart of the most frequent ones into the sheet. The word table backing the
+ * chart is written two columns to the right of the existing sheet content.
+ * Text without whitespace between words is counted as one token.
+ * @param {Sheet} sheet Sheet that receives the table and the chart.
+ * @param {string[]} questions Question texts to count words in.
+ * @param {number=} maxWords Maximum number of words to chart (default 30).
+ */
+function addWordCloud(sheet, questions, maxWords = 30) {
+  // A Map instead of {}: words such as "constructor" would otherwise collide
+  // with properties inherited from Object.prototype.
+  const freq = new Map();
+  questions.forEach(q => String(q).split(/\s+/).forEach(raw => {
+    const w = raw.replace(/[\p{P}\p{S}]/gu, '').toLowerCase();
+    if (w) freq.set(w, (freq.get(w) || 0) + 1);
+  }));
+  const top = Array.from(freq.entries())
+    .sort((a, b) => b[1] - a[1])
+    .slice(0, maxWords);
+  if (!top.length) return;
+
+  // An embedded chart is built from a sheet range, so store the table first.
+  const firstCol = sheet.getLastColumn() + 2;
+  const table = [['Word', 'Count']].concat(top);
+  const range = sheet.getRange(1, firstCol, table.length, 2);
+  range.setValues(table);
+
+  // insertChart expects an EmbeddedChart from sheet.newChart(); a chart built
+  // with Charts.newBarChart() is a different type.
+  const chart = sheet.newChart()
+    .setChartType(Charts.ChartType.BAR)
+    .addRange(range)
+    .setNumHeaders(1)
+    .setPosition(1, firstCol + 3, 0, 0)
+    .setOption('title', 'Word Frequencies')
+    .setOption('width', 600)
+    .setOption('height', 400)
+    .build();
+  sheet.insertChart(chart);
+}
+
+/**
+ * k-means clustering on plain number arrays using squared Euclidean distance.
+ * @param {number[][]} vectors Input vectors, all of the same length.
+ * @param {number} k Requested cluster count; clamped to 1..vectors.length.
+ * @param {number} iters Maximum number of assign/update iterations.
+ * @return {{centroids: number[][], labels: number[]}} Final centroids and,
+ *     for each input vector, the index of its centroid. Both are empty when
+ *     `vectors` is empty.
+ */
+function kmeans(vectors, k, iters) {
+  const n = vectors.length;
+  if (n === 0) return {centroids: [], labels: []};
+  k = Math.max(1, Math.min(k, n));
+  const dims = vectors[0].length;
+
+  // Seed with k distinct input vectors (partial Fisher-Yates shuffle); drawing
+  // with replacement could start two centroids on the same point.
+  const order = Array.from({length: n}, (_, i) => i);
+  for (let i = 0; i < k; i++) {
+    const j = i + Math.floor(Math.random() * (n - i));
+    [order[i], order[j]] = [order[j], order[i]];
+  }
+  const centroids = order.slice(0, k).map(i => vectors[i].slice());
+
+  const labels = new Array(n).fill(0);
+  for (let t = 0; t < iters; t++) {
+    // The first pass always updates the centroids; later passes stop early
+    // once no vector changes cluster.
+    let changed = t === 0;
+    for (let i = 0; i < n; i++) {
+      let best = 0; let minD = Infinity;
+      for (let j = 0; j < k; j++) {
+        const d = distance2(vectors[i], centroids[j]);
+        if (d < minD) { minD = d; best = j; }
+      }
+      if (labels[i] !== best) { labels[i] = best; changed = true; }
+    }
+    if (!changed) break;
+    const sums = Array.from({length: k}, () => Array(dims).fill(0));
+    const counts = Array(k).fill(0);
+    for (let i = 0; i < n; i++) {
+      counts[labels[i]]++;
+      for (let d = 0; d < dims; d++) sums[labels[i]][d] += vectors[i][d];
+    }
+    for (let j = 0; j < k; j++) {
+      // An empty cluster keeps its previous centroid.
+      if (counts[j]) centroids[j] = sums[j].map(x => x / counts[j]);
+    }
+  }
+  return {centroids, labels};
+}
+
+/**
+ * Squared Euclidean distance between two vectors.
+ * @param {number[]} a First vector; its length bounds the loop.
+ * @param {number[]} b Second vector.
+ * @return {number} Sum of squared component differences.
+ */
+function distance2(a, b) {
+  let sum = 0;
+  for (let i = 0; i < a.length; i++) {
+    const diff = a[i] - b[i];
+    sum += diff * diff;
+  }
+  return sum;
+}
diff --git a/final_project/requirements.txt b/final_project/requirements.txt
new file mode 100644
index 000000000..156e463e8
--- /dev/null
+++ b/final_project/requirements.txt
@@ -0,0 +1,5 @@
+sentence-transformers
+scikit-learn
+google-generativeai
+google-api-python-client
+google-auth
diff --git a/final_project/sample_questions.txt b/final_project/sample_questions.txt
new file mode 100644
index 000000000..047676c98
--- /dev/null
+++ b/final_project/sample_questions.txt
@@ -0,0 +1,6 @@
+What is the difference between supervised and unsupervised learning?
+How do I choose the right evaluation metric for my model?
+Can you explain cross-validation?
+What is overfitting and how can we avoid it?
+How is unsupervised learning different from supervised?
+What are some common evaluation metrics for classification problems?
diff --git a/final_project/sheets_summarize.py b/final_project/sheets_summarize.py
new file mode 100644
index 000000000..85b37ea92
--- /dev/null
+++ b/final_project/sheets_summarize.py
@@ -0,0 +1,127 @@
+"""Cluster questions read from a Google Sheet and summarize each cluster with Gemini."""
+import argparse
+import os
+from pathlib import Path
+from typing import Dict, List
+
+import google.generativeai as genai
+import numpy as np
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+from sklearn.cluster import KMeans
+
+
+def fetch_questions(sheet_id: str, range_: str, creds_file: str) -> List[str]:
+    """Fetch questions from a Google Sheet range.
+
+    Args:
+        sheet_id: Spreadsheet ID passed to the Sheets API.
+        range_: Range string passed to the Sheets API, e.g. ``Sheet1!A:A``.
+        creds_file: Path to the service account JSON file.
+
+    Returns:
+        The stripped first cell of each returned row; empty rows and rows
+        whose first cell is blank are skipped.
+    """
+    scopes = ["https://www.googleapis.com/auth/spreadsheets.readonly"]
+    creds = service_account.Credentials.from_service_account_file(creds_file, scopes=scopes)
+    service = build("sheets", "v4", credentials=creds)
+    resp = service.spreadsheets().values().get(spreadsheetId=sheet_id, range=range_).execute()
+    values = resp.get("values", [])
+    # flatten and filter empty strings
+    return [row[0].strip() for row in values if row and row[0].strip()]
+
+
+def embed_questions(questions: List[str], model: str = "models/embedding-001") -> np.ndarray:
+    """Get embeddings for each question using Gemini.
+
+    One ``genai.embed_content`` call is made per question.
+
+    Returns:
+        Array built from the ``"embedding"`` entry of each response, one row
+        per question.
+    """
+    return np.array([
+        genai.embed_content(model=model, content=q)["embedding"]
+        for q in questions
+    ])
+
+
+def cluster_questions(questions: List[str], n_clusters: int) -> Dict[int, List[str]]:
+    """Cluster questions using KMeans.
+
+    Args:
+        questions: Question texts to embed and cluster.
+        n_clusters: Requested number of clusters, clamped to the range
+            ``1..len(questions)`` (KMeans rejects more clusters than samples).
+
+    Returns:
+        Mapping from cluster label to the questions assigned to it; empty
+        when ``questions`` is empty.
+    """
+    if not questions:
+        return {}
+    n_clusters = max(1, min(n_clusters, len(questions)))
+    embeddings = embed_questions(questions)
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
+    labels = kmeans.fit_predict(embeddings)
+    clusters: Dict[int, List[str]] = {}
+    for label, question in zip(labels, questions):
+        # fit_predict yields NumPy integers; store plain ints as annotated.
+        clusters.setdefault(int(label), []).append(question)
+    return clusters
+
+
+def summarize_cluster(model, questions: List[str]) -> str:
+    """Ask the model for a representative question and a short answer in Japanese.
+
+    Args:
+        model: Object with a ``generate_content(prompt)`` method whose result
+            has a ``text`` attribute.
+        questions: Questions belonging to one cluster.
+
+    Returns:
+        The response text with surrounding whitespace stripped.
+    """
+    text = " \n".join(questions)
+    prompt = (
+        "あなたは講義担当教員です。以下の質問をまとめて代表質問を作成し、その回答を日本語で200字以内で出力してください:\n" + text
+    )
+    resp = model.generate_content(prompt)
+    return resp.text.strip()
+
+
+def process_sheet(sheet_id: str, range_: str, creds_file: str) -> None:
+    """Fetch, cluster and summarize the questions of a sheet range and print each topic.
+
+    Topics are printed largest cluster first and numbered in that order.
+
+    Raises:
+        EnvironmentError: If ``GOOGLE_API_KEY`` is not set.
+    """
+    api_key = os.getenv("GOOGLE_API_KEY")
+    if not api_key:
+        raise EnvironmentError("GOOGLE_API_KEY not set")
+    genai.configure(api_key=api_key)
+    questions = fetch_questions(sheet_id, range_, creds_file)
+    if not questions:
+        print("No questions found.")
+        return
+    n_clusters = max(1, int(len(questions) ** 0.5))
+    clusters = cluster_questions(questions, n_clusters)
+    model = genai.GenerativeModel("gemini-pro")
+    # Iterating the dict directly would print topics in first-seen label order
+    # with non-sequential numbers; sort by size so common topics come first.
+    ordered = sorted(clusters.values(), key=len, reverse=True)
+    for i, qs in enumerate(ordered, 1):
+        summary = summarize_cluster(model, qs)
+        print(f"\n### Topic {i}\n{summary}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Summarize questions from Google Sheets")
+    parser.add_argument("sheet_id", help="Spreadsheet ID")
+    parser.add_argument("range", help="Range like Sheet1!A:A")
+    parser.add_argument("credentials", type=Path, help="Path to service account JSON")
+    args = parser.parse_args()
+    process_sheet(args.sheet_id, args.range, str(args.credentials))
diff --git a/final_project/summarize.py b/final_project/summarize.py
new file mode 100644
index 000000000..fc0036bda
--- /dev/null
+++ b/final_project/summarize.py
@@ -0,0 +1,98 @@
+"""Cluster lecture questions from a text file and summarize each cluster with Gemini."""
+import argparse
+import math
+import os
+from pathlib import Path
+from typing import Dict, List
+
+import google.generativeai as genai
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import KMeans
+
+
+def load_questions(path: Path) -> List[str]:
+    """Read ``path`` as UTF-8 and return its non-blank lines, stripped."""
+    with open(path, "r", encoding="utf-8") as f:
+        return [line.strip() for line in f if line.strip()]
+
+
+def cluster_questions(questions: List[str], model_name: str = "all-MiniLM-L6-v2") -> Dict[int, List[str]]:
+    """Embed the questions with a SentenceTransformer model and group them with KMeans.
+
+    The cluster count is ``int(sqrt(len(questions)))``, but at least 1.
+
+    Returns:
+        Mapping from cluster label to the questions assigned to it; empty
+        when ``questions`` is empty.
+    """
+    if not questions:
+        return {}
+    model = SentenceTransformer(model_name)
+    embeddings = model.encode(questions)
+    n_clusters = max(1, int(math.sqrt(len(questions))))
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
+    labels = kmeans.fit_predict(embeddings)
+    clusters: Dict[int, List[str]] = {}
+    for label, question in zip(labels, questions):
+        # fit_predict yields NumPy integers; store plain ints as annotated.
+        clusters.setdefault(int(label), []).append(question)
+    return clusters
+
+
+def _require_api_key() -> str:
+    """Return ``GOOGLE_API_KEY`` from the environment, raising EnvironmentError if unset."""
+    api_key = os.getenv("GOOGLE_API_KEY")
+    if not api_key:
+        raise EnvironmentError("GOOGLE_API_KEY not set")
+    return api_key
+
+
+def summarize_clusters(clusters: Dict[int, List[str]]) -> List[str]:
+    """Summarize every cluster with the ``gemini-pro`` model.
+
+    Returns:
+        One summary per cluster, in the iteration order of ``clusters``.
+
+    Raises:
+        EnvironmentError: If ``GOOGLE_API_KEY`` is not set.
+    """
+    genai.configure(api_key=_require_api_key())
+    model = genai.GenerativeModel("gemini-pro")
+
+    summaries = []
+    for questions in clusters.values():
+        # One question per line so that adjacent questions do not run together.
+        text = "\n".join(questions)
+        prompt = (
+            "Summarize the following questions into one representative question or short summary:\n"
+            + text
+        )
+        response = model.generate_content(prompt)
+        summaries.append(response.text.strip())
+    return summaries
+
+
+def main():
+    """Parse the command line, then load, cluster and summarize the questions."""
+    parser = argparse.ArgumentParser(description="Summarize lecture questions")
+    parser.add_argument("input", type=Path, help="Text file with one question per line")
+    args = parser.parse_args()
+
+    questions = load_questions(args.input)
+    if not questions:
+        print("No questions found.")
+        return
+
+    # Check the key before the embedding model is loaded and run; otherwise a
+    # missing key is only reported after clustering has finished.
+    _require_api_key()
+
+    clusters = cluster_questions(questions)
+    summaries = summarize_clusters(clusters)
+
+    for i, summary in enumerate(summaries, 1):
+        print(f"\n### Topic {i}\n{summary}")
+
+
+if __name__ == "__main__":
+    main()