From 121559566b1f5287b228e3b18ad98ec8bfc4b7ac Mon Sep 17 00:00:00 2001 From: Oleg Korznikov <54912398+sudoleg@users.noreply.github.com> Date: Sun, 9 Jun 2024 15:39:59 +0200 Subject: [PATCH] Add basic tests (#56) * refactor: minor project structure changes * chore: git ignore data * chore: adjust gitignore transcripts dir * test: add some basic tests * test: execute tests on every branch * test: rm redundant comment --- .github/workflows/test.yaml | 22 + .gitignore | 5 +- Dockerfile | 3 +- data/chroma/.gitkeep | 0 main.py | 3 +- modules/__init__.py | 0 requirements.txt | 3 +- scripts/chat.ipynb | 829 ++++++++++++++++++++++++++++++++++++ tests/.gitkeep | 0 tests/__init__.py | 0 tests/test_main.py | 41 ++ 11 files changed, 902 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/test.yaml create mode 100644 data/chroma/.gitkeep create mode 100644 modules/__init__.py create mode 100644 scripts/chat.ipynb create mode 100644 tests/.gitkeep create mode 100644 tests/__init__.py create mode 100644 tests/test_main.py diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 0000000..e088017 --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,22 @@ +# .github/workflows/test.yaml +name: Test streamlit app + on: + push: + branches: + - '**' + +permissions: + contents: read + +jobs: + test_streamlit_app: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12.3" + - uses: streamlit/streamlit-app-action@v0.0.3 + with: + app-path: main.py \ No newline at end of file diff --git a/.gitignore b/.gitignore index 58203f3..722432a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,10 @@ design/ test/ certs/ *.ipynb -transcripts/ +transcripts*/ +/data/chroma/* +!/data/chroma/.gitkeep +*.sqlite3 # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Dockerfile b/Dockerfile index ddcf6c0..5b292be 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Base stage for shared environment setup -FROM python:3.12-slim +FROM python:3.12.3-slim # Set working directory WORKDIR /app @@ -14,6 +14,7 @@ RUN pip3 install -r requirements.txt COPY . 
/app/ # Streamlit's configuration options +ENV PYTHONPATH="/app" ENV STREAMLIT_CLIENT_TOOLBAR_MODE="viewer" ENV STREAMLIT_SERVER_PORT=8501 diff --git a/data/chroma/.gitkeep b/data/chroma/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/main.py b/main.py index 093ab4e..4180283 100644 --- a/main.py +++ b/main.py @@ -121,9 +121,10 @@ def main(): ) custom_prompt = st.text_area( "Enter a custom prompt if you want:", + key="custom_prompt_input", help=get_default_config_value("help_texts.custom_prompt"), ) - summarize_button = st.button("Summarize") + summarize_button = st.button("Summarize", key="summarize_button") if url != "": try: diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index 9e805aa..0c5ac7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ youtube-transcript-api==0.6.2 langchain-community==0.2.4 langchain-openai==0.1.8 streamlit==1.35.0 -watchdog==4.0.1 \ No newline at end of file +watchdog==4.0.1 +pytest==8.2.2 \ No newline at end of file diff --git a/scripts/chat.ipynb b/scripts/chat.ipynb new file mode 100644 index 0000000..4dafb37 --- /dev/null +++ b/scripts/chat.ipynb @@ -0,0 +1,829 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dotenv import load_dotenv\n", + "\n", + "# for PYTHONPATH\n", + "# set PYTHONPATH to the root directory of this project on your system\n", + "load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import textwrap\n", + "\n", + "def print_wrapped(text: str, width: int = 128):\n", + " wrapped_content = textwrap.fill(text=text, width=width)\n", + " print(wrapped_content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set up SQLite\n", + "\n", + "It's useful to save some metadata about the video for future use." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from peewee import SqliteDatabase, Model, CharField, BooleanField, IntegerField, DateTimeField\n", + "sql_db = SqliteDatabase('../data/videos.sqlite3')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "class Video(Model):\n", + " yt_video_id = CharField(unique=True)\n", + " title = CharField()\n", + " language = CharField(null=True)\n", + " channel = CharField(null=True)\n", + " saved_on = DateTimeField(null=True)\n", + " preprocessed = BooleanField(null=True)\n", + " chunk_size = IntegerField(null=True)\n", + " transcript_token_num = IntegerField(null=True)\n", + "\n", + " class Meta:\n", + " database = sql_db" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql_db.connect(reuse_if_open=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['video']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql_db.get_tables()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Table already exists\n" + ] + } + ], + "source": [ + "if not sql_db.table_exists('video'):\n", + " sql_db.create_tables([Video])\n", + "else:\n", + " print(\"Table already exists\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fetch transcript" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "from modules.youtube import fetch_youtube_transcript, extract_youtube_video_id\n", + "from modules.helpers import save_response_as_file\n", + "from modules.helpers import num_tokens_from_string\n", + "\n", + "video_url = \"https://youtu.be/qe6dSDq5GV0?si=wN13pRWZPqzeyrCz\"\n", + "video_id = extract_youtube_video_id(video_url)\n", + "transcript = fetch_youtube_transcript(video_url)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "How Fasting & Caloric Restriction Impact Health | Dr. Satchin Panda & Dr. 
Andrew Huberman\n" + ] + } + ], + "source": [ + "from modules.youtube import get_video_metadata\n", + "\n", + "meta = get_video_metadata(video_url)\n", + "video_title = meta['name']\n", + "print(video_title)\n", + "save_response_as_file(\"../transcripts\", video_title, transcript)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "video = Video.create(\n", + " yt_video_id = video_id,\n", + " title = video_title,\n", + " channel = meta['channel']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datetime import datetime\n", + "\n", + "Video.update({Video.saved_on: datetime.now()}).where(Video.yt_video_id == video_id).execute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Split the yet-unprocessed transcript into chunks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A relatively small chunk size is used because the model tends to ignore the middle part of the transcript if it's too long, probably due to the \"Lost in the middle\" effect:\n", + "\n", + "- https://arxiv.org/abs/2307.03172" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "CHUNK_SIZE_FOR_UNPROCESSED_TRANSCRIPT = 932" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split unprocessed transcript into 4 chunks.\n" + ] + } + ], + "source": [ + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "\n", + "text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=CHUNK_SIZE_FOR_UNPROCESSED_TRANSCRIPT,\n", + " chunk_overlap=32,\n", + " length_function=num_tokens_from_string,\n", + " is_separator_regex=False,\n", + ")\n", + "\n", + "# Split the transcript into chunks \n", + "transcript_excerpts = text_splitter.create_documents([transcript])\n", + "print(f\"Split unprocessed transcript into {len(transcript_excerpts)} chunks.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The unprocessed transcript has 2845 tokens.\n" + ] + } + ], + "source": [ + "num_tokens_transcript = num_tokens_from_string(transcript, encoding_name=\"cl100k_base\")\n", + "print_wrapped(f\"The unprocessed transcript has {num_tokens_transcript} tokens.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Initialize LLM and prompts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Option 1: OpenAI (GPT-3.5-turbo)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "from os import getenv\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(\n", + " api_key=getenv(\"OPENAI_API_KEY\"),\n", + " temperature=0.3,\n", + " model=\"gpt-3.5-turbo\",\n", + " max_tokens=2048\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.prompts.chat import SystemMessage, HumanMessagePromptTemplate\n", + "\n", + "user_prompt = HumanMessagePromptTemplate.from_template(\n", + "\"\"\"Here is part {number}, delimited by ---\n",
"\n", + " ---\n", + " {transcript_excerpt}\n", + " ---\n", + " \"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token number in system prompt: 85\n" + ] + } + ], + "source": [ + "system_prompt = \"You are giong to receive excerpts from an automatically generated video transcript. Your task is to convert every excerpt into structured text. Ensure that the content of the excerpts remains unchanged. Add appropriate punctuation, correct any grammatical errors, remove filler words and divide the text into logical paragraphs, separating them with a single new line. The final output should be in plain text and only include the modified transcript excerpt without any prelude.\"\n", + "print(\"Token number in system prompt: \" + str(num_tokens_from_string(system_prompt)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Process transcript" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "batch_messages = []\n", + "for num, excerpt in enumerate(transcript_excerpts):\n", + " batch_messages.append([\n", + " SystemMessage(content=system_prompt),\n", + " user_prompt.format(number=num, transcript_excerpt=excerpt.page_content)\n", + " ])\n", + "response = llm.generate(batch_messages)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "result = \"\\n\\n\".join(gen[0].text for gen in response.generations)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The initial transcript has 2845 tokens.\n", + "The response has 2320 tokens.\n" + ] + } + ], + "source": [ + "num_tokens_response = num_tokens_from_string(result, encoding_name=\"cl100k_base\")\n", + "print(f\"The initial transcript has {num_tokens_transcript} tokens.\")\n", + "print(f\"The response has {num_tokens_response} tokens.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "save_response_as_file(dir_name=\"../transcripts_processed\", filename=video_title, file_content=result)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Video.update(\n", + " {\n", + " Video.preprocessed: True,\n", + " Video.transcript_token_num: num_tokens_transcript\n", + " }).where(Video.yt_video_id == video_id).execute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Split the processed transcript" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "CHUNK_SIZE_FOR_PROCESSED_TRANSCRIPT = 1024" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Video.update(\n", + " {\n", + " Video.chunk_size: CHUNK_SIZE_FOR_PROCESSED_TRANSCRIPT\n", + " }).where(Video.yt_video_id == video_id).execute()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "So 
there's a famous experiment that was published last year by Joe Takahashi's lab, and it came out in Science, and that relates\n", + "to caloric restriction. We kind of started with this idea, discussing that the rat experiments were done with caloric\n", + "restriction. Researchers gave reduced calorie conjunction by 20% or 30%, and the rats, and subsequently mice, all lived longer.\n", + "What is interesting is, in all those experiments, the researchers gave a bolus of food at one time, whereas the ad libitum fed\n", + "mice or rats had access to food all the time, eating constantly. The rats given 20% less food consumed it all within two to four\n", + "hours, following an OMAD diet, one meal a day, concept. They either finished eating in three to four hours or had a four-hour\n", + "eating period followed by 20 hours of fasting.\n", + "----------------\n", + "The question arose whether the benefit of caloric restriction is due to reduced calories or time-restricted feeding. There is a\n", + "timing component to it, as the animals consumed all the food within three to four hours, followed by a long fasting period. This\n", + "posed a challenge as splitting the food into multiple small portions for frequent feeding intervals would be cumbersome. Joe\n", + "Takahashi, who published the first paper in 2017 on caloric restriction, showed that the protocol used in the caloric\n", + "restriction field created a condition of time-restriction. He collaborated with engineers to develop a programmable feeding\n", + "system to control the timing and amount of food given to mice throughout the day and night.\n", + "----------------\n", + "For example, if an ad libitum fed mouse eats five grams of chow in a day, reducing calories by 20% would mean the CR mouse\n", + "should receive four grams of food divided into 9 or 10 meals given every 90 minutes. This feeding schedule eliminated fasting\n", + "periods, ensuring the mice received food at regular intervals. The study measured the lifespan of mice under different feeding\n", + "conditions. The ad libitum fed mice lived a certain number of days, while the caloric restricted mice, who snacked throughout\n", + "the day and night without super fasting, lived 10% longer. This indicated that caloric restriction extended lifespan by 10%. So\n", + "that means caloric restriction extended lifespan by 10%. I've wondered about this because recently, you know, there were a bunch\n", + "of news headlines about intermittent fasting, and frankly I was frustrated. If you looked at one major news outlet, they would\n", + "say time-restricted feeding affords no additional benefit beyond caloric restriction for weight loss.\n", + "----------------\n", + "SATCHIN: Yeah. Then another popular press venue, let's call it that, same study, described as time-restricted feeding doesn't\n", + "work. Yeah. Right, and then another one, it may be someplace even more extreme, you know, time-restricted feeding only\n", + "beneficial because of caloric restriction, or something like that. So what you've essentially got are three different\n", + "interpretations of the same data, all of which are, well, two of which are true, one of which is false, in my opinion.\n", + "----------------\n", + "But, what I think people take away from that is, \"Oh, time-restricted feeding isn't valuable,\" which is not the case. 
I think\n", + "for many people it's a convenient way to eat because, at least for people like me, it's simpler to designate between portions of\n", + "my day when I'm eating and portions of my day when I'm not eating, as opposed to portion control. For other people, portion\n", + "control can work, but all of that is related to either maintenance or loss of weight. None of it deals with the potential health\n", + "benefits independent of weight loss. Yeah. And so I think that if we can segment those out, obviously in humans it's hard to\n", + "know if a given treatment or experiment is extending life because you don't really know how long people would live anyway.\n", + "Right, whereas with mice, you have some sense of when the mortality was likely to occur.\n", + "----------------\n", + "So, what can we say about time-restricted feeding and longevity in terms of biomarkers or in terms of any other indication that\n", + "people who start and stop their feeding window at a consistent time somewhere between 8 and 12 hours per 24-hour cycle are\n", + "tilting the scales towards living longer as opposed to living shorter. This example of this news article that you mentioned is\n", + "really interesting because that relates to Joe Takahashi's study, because I described that if you split calories and eat\n", + "throughout the day, throughout day and night, then the mice lived 10% extra. But if you now give mouse the same caloric\n", + "restricted diet and fit them during daytime, whether within 12 hours or 2 hours, then the mice live 10% extra. Beyond that. Yes.\n", + "So 20%.\n", + "----------------\n", + "So, okay, so let me make sure I understand so that I make sure I understand. If you take a certain number of calories and you\n", + "distribute them throughout the 24-hour cycle, it's caloric restriction, the mice will live 10% longer. If you however restrict\n", + "that to the active cycle, so for humans, the daytime, then they live 20% longer. 20% longer. So, it's not just total caloric\n", + "intake. Meaning it's not just important to be sub-maintenance in calories for sake of longevity. It also is important as to when\n", + "in the 24-hour cycle you eat those calories. Do I have that right?\n", + "----------------\n", + "So now, still, the story is not over because these mice are fed during daytime when they're not supposed to eat. That's right.\n", + "So for us it would be the equivalent of being on the night shift and only eating at night, but a sub-maintenance calorie diet, I\n", + "guess, is the right way to say it. But when he fed mice during nighttime, when they're supposed to eat, and they're getting the\n", + "same number of calories within 12 hours or 2 hours, then the mice lived 35% longer than they control. 35% longer. So, scale to\n", + "human lifespan,\n", + "----------------\n", + "Then the mice lived 35% longer than the control, 35% longer. Scaling to human lifespan, which we don't know, but 35% longer\n", + "would mean that - and again, no one knows - but humans, now, what is the average mortality in the United States? Somewhere\n", + "around 80? Yeah, so it's around 80, it used to be 80. Now reduced a little bit because of COVID, but let's take 80. Okay, so\n", + "people are then now living somewhere between 25 and 35 years longer. I'm putting some error bars on there. Yeah, yeah, yeah.\n", + "----------------\n", + "Amazing. So that was really profound. 
But now, you pointed out biomarkers and other stuff, so now if you look at any given time\n", + "within that experiment, and actually Joe went back and had a separate cohort of mice, very similar, and so that he could take\n", + "tissue samples and of course in this case you have to sacrifice the mouse. He looked for, he did a lot of molecular analysis\n", + "with known markers. For example, hemoglobin A1C equivalent or glucose control, cholesterol, all this stuff. He could not find\n", + "anything that predicted the benefit of caloric restriction. So that means in this experiment, whatever we know so far, the\n", + "predictor of longevity, none of them could predict whether this CR-only mouse, throughout day and night, that mouse is going to\n", + "live less than the night-fed mouse that was going to live 25% extra.\n", + "----------------\n", + "Does that mean that there are biomarkers related to longevity that we just haven't discovered yet? Yeah, so that's exactly. So\n", + "that means whatever we know so far about biomarkers, he could not use to predict. Maybe there was a lot of noise. Maybe he had\n", + "to use more number of mice to get that because you know, biomarkers are not going to predict in every instance, or there is some\n", + "error. What is also very interesting is, if you look at the body weight and body composition of all these mice, there is no\n", + "difference in body weight and body composition across all these different groups, across all these groups. So it doesn't matter\n", + "when they ate, provided they were sub-maintenance calorie intake, so fewer calories than is required to maintain their weight.\n", + "Didn't matter what pattern of eating, they were the same weight.\n", + "----------------\n", + "So that, in many ways, seems to mimic the human studies where they say, \"Look, it doesn't really matter whether or not you use\n", + "caloric restriction, or you start your feeding window in the morning or start your feeding window in the evening, or you portion\n", + "control for the sake of weight loss.\" Because we're taking a snapshot of that. And then another thing with the human study that\n", + "we are referring to here, in that human study, people are actually already eating within a 10-hour window habitually when they\n", + "selected these people to have them enroll in the study. So they were already eating for 10 hours and fasting for 14 hours. All\n", + "participants had to reduce their caloric intake, and they reduced by almost 25%. The CR group continued with a 10-hour eating\n", + "window and the CR plus time-restricted group had to eat the same number of calories within eight hours. So it's just a two-hour\n", + "difference. It's just a two-hour difference.\n", + "----------------\n", + "Okay, so I just want to make sure people can understand. So, in this human study, which is the one that I felt that the popular\n", + "press venues, all except one venue, got either semi-wisely or badly wrong in terms of their conclusion, that was my\n", + "interpretation anyway, was that either people came into the study eating basically in a 10-hour feeding window, which goes back\n", + "to my first question, which is that most people are not eating in the middle of the night. Yeah, yeah. Or if they're on shift\n", + "work and they are. In the middle of the night. Yeah. Or if they're on shift work and they are, then they're sleeping during\n", + "the day anyway. 
So they're eating in a 10 to 12-hour feeding window anyway.\n", + "----------------\n", + "So you're saying they either did caloric restriction portion control within a 10-hour window, or another group within the study\n", + "ate sub-maintenance calories, so caloric restriction, CR, as we're calling it, the acronym CR, but restricted that to an 8-hour\n", + "feeding window. And they didn't see any difference in terms of weight loss. Yeah. But it's not all that surprising, right? I\n", + "mean if it's just a 2-hour difference. Yeah, exactly. So, we have done that experiment in mice and we don't see difference in\n", + "not only weight loss, many other markers. And I was telling you about Joe Takahashi's paper where I told you that he allowed\n", + "this mice to eat within 2 hours or 12 hours, sub-caloric diet. 2 or 12. 2 or 12, yeah. That's dramatic.\n", + "----------------\n", + "But still he did not see change in longevity even within those 2. So that means when you do caloric restriction and then, at\n", + "least with mouse, and you are within 12-hours window, that is giving the mice the best benefit, the optimum benefit. And 2, 3,\n", + "or 5, or 12 per mouse doesn't matter, at least for longevity. [MUSIC PLAYING]\n", + "----------------\n" + ] + } + ], + "source": [ + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "\n", + "splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)\n", + "chunks = splitter.create_documents([result])\n", + "for chunk in chunks:\n", + " print_wrapped(chunk.page_content)\n", + " print(\"----------------\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create a vector DB" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Option 1: OpenAI embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "# https://platform.openai.com/docs/models/embeddings\n", + "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "import chromadb\n", + "from langchain_chroma import Chroma\n", + "from chromadb.config import Settings\n", + "\n", + "chroma_settings = Settings(allow_reset=True)\n", + "chroma_client = chromadb.HttpClient(settings=chroma_settings)\n", + "\n", + "db = Chroma(\n", + " client=chroma_client, collection_name=f\"{video_id}_{CHUNK_SIZE_FOR_PROCESSED_TRANSCRIPT}\", embedding_function=embeddings\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "\n", + "collection = chroma_client.get_or_create_collection(name=f\"{video_id}_{CHUNK_SIZE_FOR_PROCESSED_TRANSCRIPT}\")\n", + "if collection.count() <= 0:\n", + " for d in chunks:\n", + " response = embeddings.embed_query(d.page_content)\n", + " collection.add(\n", + " ids=[str(uuid.uuid1())],\n", + " embeddings=[response],\n", + " documents=[d.page_content],\n", + " #metadatas=[d.metadata]\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test generation - answer a question" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"What did the caloric restriction experiment in mice and rats show?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, 
+ "outputs": [], + "source": [ + "retriever = db.as_retriever(search_kwargs={\"k\": 3})\n", + "relevant_docs = retriever.invoke(input=question)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "So there's a famous experiment that was published last year by Joe Takahashi's lab, and it came out in Science, and that relates\n", + "to caloric restriction. We kind of started with this idea, discussing that the rat experiments were done with caloric\n", + "restriction. Researchers gave reduced calorie conjunction by 20% or 30%, and the rats, and subsequently mice, all lived longer.\n", + "What is interesting is, in all those experiments, the researchers gave a bolus of food at one time, whereas the ad libitum fed\n", + "mice or rats had access to food all the time, eating constantly. The rats given 20% less food consumed it all within two to four\n", + "hours, following an OMAD diet, one meal a day, concept. They either finished eating in three to four hours or had a four-hour\n", + "eating period followed by 20 hours of fasting.\n", + "----------------------\n", + "The question arose whether the benefit of caloric restriction is due to reduced calories or time-restricted feeding. There is a\n", + "timing component to it, as the animals consumed all the food within three to four hours, followed by a long fasting period. This\n", + "posed a challenge as splitting the food into multiple small portions for frequent feeding intervals would be cumbersome. Joe\n", + "Takahashi, who published the first paper in 2017 on caloric restriction, showed that the protocol used in the caloric\n", + "restriction field created a condition of time-restriction. He collaborated with engineers to develop a programmable feeding\n", + "system to control the timing and amount of food given to mice throughout the day and night.\n", + "----------------------\n", + "For example, if an ad libitum fed mouse eats five grams of chow in a day, reducing calories by 20% would mean the CR mouse\n", + "should receive four grams of food divided into 9 or 10 meals given every 90 minutes. This feeding schedule eliminated fasting\n", + "periods, ensuring the mice received food at regular intervals. The study measured the lifespan of mice under different feeding\n", + "conditions. The ad libitum fed mice lived a certain number of days, while the caloric restricted mice, who snacked throughout\n", + "the day and night without super fasting, lived 10% longer. This indicated that caloric restriction extended lifespan by 10%. So\n", + "that means caloric restriction extended lifespan by 10%. I've wondered about this because recently, you know, there were a bunch\n", + "of news headlines about intermittent fasting, and frankly I was frustrated. If you looked at one major news outlet, they would\n", + "say time-restricted feeding affords no additional benefit beyond caloric restriction for weight loss.\n", + "----------------------\n" + ] + } + ], + "source": [ + "for doc in relevant_docs:\n", + " print_wrapped(doc.page_content)\n", + " print(\"----------------------\")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.prompts import PromptTemplate\n", + "\n", + "rag_prompt = PromptTemplate.from_template(\"\"\"Context: {context}\n", + " \n", + "Answer the question based on the context provided above. 
Keep your answer grounded in the facts of the context.\n", + "If the context does not contain the facts to answer the question, apologize and say that you don't know the answer.\n", + " \n", + "Here is the question: {question}\n", + "\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "def format_docs_for_context(docs):\n", + " return \"\\n\\n---\\n\\n\".join(doc.page_content for doc in docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.output_parsers import StrOutputParser\n", + "\n", + "rag_chain = rag_prompt | llm | StrOutputParser()\n", + "\n", + "answer = rag_chain.invoke({\"question\": question, \"context\": format_docs_for_context(relevant_docs)})" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The caloric restriction experiment in mice and rats showed that reducing calorie intake by 20% or 30% led to longer lifespans\n", + "compared to ad libitum fed mice or rats. The animals given reduced calories consumed their food within a few hours, followed by\n", + "a long fasting period, resembling an OMAD diet. Additionally, a study by Joe Takahashi's lab demonstrated that caloric\n", + "restriction extended lifespan by 10% when compared to mice who snacked throughout the day and night without extended fasting\n", + "periods.\n" + ] + } + ], + "source": [ + "print_wrapped(answer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/.gitkeep b/tests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000..9a20c03 --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,41 @@ +import os + +import pytest +from streamlit.testing.v1 import AppTest + +from modules.youtube import ( + InvalidUrlException, + fetch_youtube_transcript, + get_video_metadata, +) + + +def test_initial_session_state(): + # Set a dummy API key, then initialize the simulated app and execute the first script run + os.environ["OPENAI_API_KEY"] = "sk-proj-xyz" + at = AppTest("main.py", default_timeout=60.0).run() + assert at.session_state.model == "gpt-3.5-turbo" + assert at.session_state.temperature == 1.0 + assert at.session_state.top_p == 1.0 + assert at.session_state.save_responses is False + assert at.session_state.openai_api_key is not None + + +def test_checking_save_responses(): + at = AppTest("main.py", default_timeout=60.0).run() + assert at.session_state.save_responses is False + assert at.checkbox(key="save_responses").value is False + at.checkbox(key="save_responses").check().run() + assert at.session_state.save_responses is True + assert at.checkbox(key="save_responses").value is True + + +def test_invalid_video_urls(): + invalid_urls = ["", "Hello World", "https://platform.openai.com/docs/overview"] + 
for url in invalid_urls: + with pytest.raises(InvalidUrlException): + get_video_metadata(url) + + for url in invalid_urls: + with pytest.raises(InvalidUrlException): + fetch_youtube_transcript(url)
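-- 
Note on the invalid-URL tests: modules/youtube.py is not touched by this patch, so the behaviour that test_invalid_video_urls relies on is not visible above. A minimal sketch of URL validation consistent with these tests (hypothetical: only the names InvalidUrlException, extract_youtube_video_id, fetch_youtube_transcript, and get_video_metadata come from the imports in tests/test_main.py; the regex and the helpers' internals are assumptions):

    import re

    class InvalidUrlException(Exception):
        """Raised when a string is not a recognizable YouTube video URL."""

    # Accepts both youtube.com/watch?v=<id> and youtu.be/<id> forms (assumed pattern).
    _YT_URL_RE = re.compile(r"(?:youtube\.com/watch\?v=|youtu\.be/)([A-Za-z0-9_-]{11})")

    def extract_youtube_video_id(url: str) -> str:
        match = _YT_URL_RE.search(url)
        if match is None:
            # Rejects "", plain text like "Hello World", and non-YouTube URLs alike.
            raise InvalidUrlException(f"Not a valid YouTube video URL: {url!r}")
        return match.group(1)

If get_video_metadata and fetch_youtube_transcript each validate the URL this way before doing any network work, both loops in the test raise InvalidUrlException without ever hitting the YouTube API. Locally, the tests added here can be run with, e.g., PYTHONPATH=. pytest tests/ (mirroring the ENV PYTHONPATH="/app" added to the Dockerfile).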