From fa3517cc5f40c8675c1d60a4fb8c3d8ef874685c Mon Sep 17 00:00:00 2001
From: Farid Taba
Date: Tue, 30 Apr 2024 11:16:44 -0700
Subject: [PATCH 1/9] Blogger v0.1 (Work in Progress)

---
 demo/blog_writer/actions.py       |  55 ++++++++
 demo/blog_writer/agent_config.yml |  49 +++++++
 demo/blog_writer/main.py          |  82 ++++++++++++
 demo/blog_writer/outliner.py      | 214 ++++++++++++++++++++++++++++++
 demo/blog_writer/requirements.txt |   4 +
 5 files changed, 404 insertions(+)
 create mode 100644 demo/blog_writer/actions.py
 create mode 100644 demo/blog_writer/agent_config.yml
 create mode 100644 demo/blog_writer/main.py
 create mode 100644 demo/blog_writer/outliner.py
 create mode 100644 demo/blog_writer/requirements.txt

diff --git a/demo/blog_writer/actions.py b/demo/blog_writer/actions.py
new file mode 100644
index 00000000..c3f64383
--- /dev/null
+++ b/demo/blog_writer/actions.py
@@ -0,0 +1,55 @@
+from langchain.document_loaders import PDFMinerLoader
+from langchain.text_splitter import SentenceTransformersTokenTextSplitter
+from langchain.vectorstores.chroma import Chroma
+from loguru import logger
+
+from sherpa_ai.actions.base import BaseAction
+
+
+class DocumentSearch(BaseAction):
+    def __init__(self, filename, embedding_function, k=4):
+        # file name of the pdf
+        self.filename = filename
+        # the embedding function to use
+        self.embedding_function = embedding_function
+        # number of results to return in search
+        self.k = k
+
+        # load the pdf and create the vector store
+        self.chroma = Chroma(embedding_function=embedding_function)
+        documents = PDFMinerLoader(self.filename).load()
+        documents = SentenceTransformersTokenTextSplitter(chunk_overlap=0).split_documents(documents)
+
+        logger.info(f"Adding {len(documents)} documents to the vector store")
+        self.chroma.add_documents(documents)
+        logger.info("Finished adding documents to the vector store")
+
+    def execute(self, query):
+        """
+        Execute the action by searching the document store for the query.
+
+        Args:
+            query (str): The query to search for
+
+        Returns:
+            str: The search results combined into a single string
+        """
+
+        results = self.chroma.search(query, search_type="mmr", k=self.k)
+        return "\n\n".join([result.page_content for result in results])
+
+    @property
+    def name(self) -> str:
+        """
+        The name of the action, used to describe the action to the agent.
+        """
+        return "DocumentSearch"
+
+    @property
+    def args(self) -> dict:
+        """
+        The arguments that the action takes, used to describe the action to the agent.
+        """
+        return {
+            "query": "string"
+        }

diff --git a/demo/blog_writer/agent_config.yml b/demo/blog_writer/agent_config.yml
new file mode 100644
index 00000000..82f76afa
--- /dev/null
+++ b/demo/blog_writer/agent_config.yml
@@ -0,0 +1,49 @@
+shared_memory:
+  _target_: sherpa_ai.memory.shared_memory.SharedMemory # The absolute path to the shared memory class in the library
+  objective: Answer the question # The objective for the agent; since this is a question-answering agent, the objective is to answer questions
+
+agent_config: # For the demo, the default configuration is used. 
You can change the configuration as per your requirements
+  _target_: sherpa_ai.config.task_config.AgentConfig
+
+
+llm: # Configuration for the LLM; here we are using the OpenAI GPT-3.5-turbo model
+  _target_: langchain.chat_models.ChatOpenAI
+  model_name: gpt-3.5-turbo
+  temperature: 0
+
+embedding_func:
+  _target_: langchain.embeddings.SentenceTransformerEmbeddings
+  model_name: sentence-transformers/all-mpnet-base-v2
+
+doc_search:
+  _target_: actions.DocumentSearch
+  filename: transcript.pdf
+  embedding_function: ${embedding_func}
+  k: 4
+
+google_search:
+  _target_: sherpa_ai.actions.GoogleSearch
+  role_description: Act as a question answering agent
+  task: Question answering
+  llm: ${llm}
+  include_metadata: true
+  config: ${agent_config}
+
+citation_validation: # The tool used to validate and add citation to the answer
+  _target_: sherpa_ai.output_parsers.citation_validation.CitationValidation
+  sequence_threshold: 0.5
+  jaccard_threshold: 0.5
+  token_overlap: 0.5
+
+qa_agent:
+  _target_: sherpa_ai.agents.qa_agent.QAAgent
+  llm: ${llm}
+  shared_memory: ${shared_memory}
+  name: QA Sherpa
+  description: You are a question-answering assistant helping users to find answers based on the document. For each question, first try to collect relevant information by DocumentSearch. Then, use Google Search to find the answer in the next step.
+  agent_config: ${agent_config}
+  num_runs: 2
+  validation_steps: 1
+  actions:
+    - ${doc_search}
+    - ${google_search}

diff --git a/demo/blog_writer/main.py b/demo/blog_writer/main.py
new file mode 100644
index 00000000..52ca207c
--- /dev/null
+++ b/demo/blog_writer/main.py
@@ -0,0 +1,82 @@
+import json
+import os
+from argparse import ArgumentParser
+
+import pypandoc
+from hydra.utils import instantiate
+from omegaconf import OmegaConf
+from sherpa_ai.agents import QAAgent
+from sherpa_ai.events import EventType
+
+from outliner import Outliner
+
+
+def get_qa_agent_from_config_file(
+    config_path: str,
+) -> QAAgent:
+    """
+    Create a QAAgent from a config file.
+
+    Args:
+        config_path: Path to the config file
+
+    Returns:
+        QAAgent: A QAAgent instance
+    """
+
+    config = OmegaConf.load(config_path)
+
+    agent_config = instantiate(config.agent_config)
+    qa_agent: QAAgent = instantiate(config.qa_agent, agent_config=agent_config)
+
+    return qa_agent
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--config", type=str, default="agent_config.yml")
+    parser.add_argument("--transcript", type=str, default="transcript.txt")
+    args = parser.parse_args()
+
+    qa_agent = get_qa_agent_from_config_file(args.config)
+
+    outliner = Outliner(args.transcript)
+    blueprint = outliner.full_transcript2blueprint()
+    if blueprint.startswith("```"):
+        # The first and last lines are code block delimiters; remove them
+        lines = blueprint.split("\n")[1:-1]
+        pure_json_str = "\n".join(lines)
+    else:
+        pure_json_str = blueprint
+
+    with open("blueprint.json", "w") as f:
+        f.write(pure_json_str)
+
+    parsed_json = json.loads(pure_json_str)
+
+    blog = ""
+    for key in parsed_json:
+        blog += "# " + key + "\n"
+        for question in parsed_json[key]:
+            # Add the question to the shared memory. By default, the agent will take the last
+            # message in the shared memory as the task. 
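+            # The agent will then research and answer it when run() is called,
+            # using its configured actions (document search over the transcript
+            # and Google search).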
+ qa_agent.shared_memory.add(EventType.task, "human", question) + result = qa_agent.run() + blog += result + "\n" + blog += "\n" + + with open("blog.md", "w") as f: + f.write(blog) + + print("\nBlog generated successfully!\n") + + # save_format = None + # while save_format is None: + # save_format = input( + # "Select format to save the blog in: 1. Markdown (Default) 2. ReStructured Text\n" + # ) + + # if save_format == "2": + # output = pypandoc.convert("blog.md", "rst") + # if os.path.exists("blog.md"): + # os.remove("blog.md") diff --git a/demo/blog_writer/outliner.py b/demo/blog_writer/outliner.py new file mode 100644 index 00000000..27c644ba --- /dev/null +++ b/demo/blog_writer/outliner.py @@ -0,0 +1,214 @@ +import os +import time + +import tiktoken +from langchain.chat_models import ChatOpenAI +from langchain.prompts.chat import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, + SystemMessagePromptTemplate, +) +from langchain.text_splitter import MarkdownTextSplitter + + +class Outliner: + def __init__(self, transcript_file) -> None: + with open(transcript_file, "r") as f: + self.raw_transcript = f.read() + # instantiate chat model + self.chat = ChatOpenAI( + openai_api_key=os.environ.get("OPENAI_API_KEY"), + temperature=0, + model="gpt-3.5-turbo", + ) + + def num_tokens_from_string( + self, string: str, encoding_name="cl100k_base" + ) -> int: + """Returns the number of tokens in a text string.""" + encoding = tiktoken.get_encoding(encoding_name) + num_tokens = len(encoding.encode(string)) + return num_tokens + + def transcript_splitter(self, chunk_size=3000, chunk_overlap=200): + markdown_splitter = MarkdownTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + transcript_chunks = markdown_splitter.create_documents( + [self.raw_transcript] + ) + return transcript_chunks + + def transcript2question(self, transcript): + system_template = "You are a helpful assistant that reflects on a transcript of podcasts or lectures." + system_prompt = SystemMessagePromptTemplate.from_template( + system_template + ) + human_template = """From the contents and information of the presentation, extract a single, \ + succinct and clear question which the presentation attempts to answer. Write the question as though \ + the speaker wrote it himself when outlining the presentation prior to creating it. Only output the question itself and nothing else. 
\
+        The transcript of this presentation is delimited in triple backticks:
+        ```{transcript}```"""
+        human_prompt = HumanMessagePromptTemplate.from_template(human_template)
+        chat_prompt = ChatPromptTemplate.from_messages(
+            [system_prompt, human_prompt]
+        )
+
+        result = self.chat(
+            chat_prompt.format_prompt(transcript=transcript).to_messages()
+        )
+        return result.content
+
+    def create_essay_questions(self, transcript_chunks):
+        essay_response = ""
+        for i, text in enumerate(transcript_chunks):
+            essay = self.transcript2question(text.page_content)
+            essay_response = f"\n\n**Question {i+1}**: ".join(
+                [essay_response, essay]
+            )
+        return essay_response
+
+    def organize_questions(self, questions):
+        # system_template = """You are a helpful assistant that summarizes and \
+        # processes large text information."""
+        system_template = """You are a helpful technical writer that processes and \
+        organizes large text information."""
+        system_prompt = SystemMessagePromptTemplate.from_template(
+            system_template
+        )
+
+        # human_template = """Given the list of questions below (enclosed in triple backticks), come up \
+        # with a list of topics covered by them, and then output a valid JSON string where each key is a \
+        # topic and its value is a list of questions from the list below which are relevant to that \
+        # topic. Sort the topics in a logical order and number the topics (the JSON keys) accordingly. \
+        # Only output the final JSON object and nothing else.\n
+        human_template = """Given the list of questions below (enclosed in triple backticks), come up with the list of \
+        topics covered by them, sort the topics in a logical order, and then output a valid JSON string where each \
+        key is an enumerated topic (e.g. "1. Topic Name") and the corresponding value is a list of questions from \
+        the list below which are relevant to that topic. Only output the final JSON object and nothing else.\n
+        ```{questions}```"""
+        human_prompt = HumanMessagePromptTemplate.from_template(human_template)
+        chat_prompt = ChatPromptTemplate.from_messages(
+            [system_prompt, human_prompt]
+        )
+
+        outline = self.chat(
+            chat_prompt.format_prompt(questions=questions).to_messages()
+        )
+
+        return outline.content
+
+    # @timer_decorator
+    def full_transcript2blueprint(self, verbose=True):
+        print("Chunking transcript...")
+        transcript_docs = self.transcript_splitter()
+        t1 = time.time()
+        print("Creating essay parts...")
+        chunk_questions = self.create_essay_questions(transcript_docs)
+        t2 = time.time() - t1
+        print("Merging essay parts...")
+        t1 = time.time()
+        blueprint = self.organize_questions(chunk_questions)
+        t3 = time.time() - t1
+        if verbose:
+            print(f"Created essay parts in {t2:.2f} seconds")
+            print(f"Merged essay parts in {t3:.2f} seconds")
+        return blueprint
+
+
+# # """# Part 2: Extracting from essay"""
+
+
+# def extract_metadata_as_json(essay, chat_model=chat):
+
+#     system_template = """ Given the essay delimited in triple backticks, generate and extract important \
+#     information such as the title, speaker, summary, a list of key topics, \
+#     and a list of important takeaways for each topic. \
+#     Format the response as a JSON object, with the keys 'Title', 'Topics', 'Speaker', \
+#     'Summary', and 'Topics' as the keys and each topic will be keys for list of takeaways. 
\ +# Example of JSON output: \n \ +# {{\ +# 'Title': 'Title of the presentation',\ +# 'Speaker': 'John Smith',\ +# 'Summary': 'summary of the presentation',\ +# 'Topics': [\ +# {{\ +# 'Topic': 'topic 1',\ +# 'Takeaways': [\ +# 'takeaway 1',\ +# 'takeaway 2',\ +# 'takeaway 3'\ +# ]\ +# }},\ +# {{\ +# 'Topic': 'topic 2',\ +# 'Takeaways': [\ +# 'takeaway 1',\ +# 'takeaway 2',\ +# 'takeaway 3'\ +# ]\ +# }},\ +# {{\ +# 'Topic': 'topic 3',\ +# 'Takeaways': [\ +# 'takeaway 1',\ +# 'takeaway 2',\ +# 'takeaway 3'\ +# ]\ +# }},\ +# {{\ +# 'Topic': 'topic 4',\ +# 'Takeaways': [\ +# 'takeaway 1',\ +# 'takeaway 2',\ +# 'takeaway 3'\ +# ]\ +# }}\ +# ]\ +# }}""" + +# system_prompt = SystemMessagePromptTemplate.from_template(system_template) + +# human_template = """Essay: ```{text}```""" + +# human_prompt = HumanMessagePromptTemplate.from_template(human_template) +# chat_prompt = ChatPromptTemplate.from_messages( +# [system_prompt, human_prompt] +# ) + +# result = chat_model(chat_prompt.format_prompt(text=essay).to_messages()) +# try: +# metadata_json = json.loads(result.content) +# except Exception as e: +# print(e) +# metadata_json = result.content +# return metadata_json + + +# def json2rst(metadata, rst_filepath): +# if not isinstance(metadata, dict): +# metadata = json.loads(metadata) + +# # rst_filepath = './essays/test.rst' +# with open(rst_filepath, "a") as the_file: +# the_file.write("\n\n") +# for key, value in metadata.items(): +# if key == "Title": +# title_mark = "=" * len(f"{value}") +# the_file.write(title_mark + "\n") +# the_file.write(f"{value} \n") +# the_file.write(title_mark + "\n") +# elif key == "Speaker": +# the_file.write("*" + f"{value}" + "* \n\n") +# elif key == "Summary": +# title_mark = "-" * len(f"{key}") +# the_file.write("Summary \n") +# the_file.write(title_mark + "\n") +# the_file.write(f"{value} \n\n") +# elif key == "Topics": +# the_file.write("Topics: \n") +# the_file.write(title_mark + "\n") +# for topic in value: +# the_file.write("\t" + f"{topic['Topic']} \n") +# for takeaway in topic["Takeaways"]: +# the_file.write("\t\t" + f"* {takeaway} \n") diff --git a/demo/blog_writer/requirements.txt b/demo/blog_writer/requirements.txt new file mode 100644 index 00000000..0b3387b9 --- /dev/null +++ b/demo/blog_writer/requirements.txt @@ -0,0 +1,4 @@ +langchain==0.0.332 +python-dotenv>=1.0.0 +openai>=0.28.0 +tiktoken>=0.4.0 From 178f40fbdbd9ac6b9230f191ef67c099f640946a Mon Sep 17 00:00:00 2001 From: Farid Taba Date: Tue, 30 Apr 2024 20:27:19 -0700 Subject: [PATCH 2/9] Blogger v0.2 (Work in Progress) --- demo/blog_writer/agent_config.yml | 3 +- demo/blog_writer/main.py | 2 - demo/blog_writer/outliner.py | 98 +++++++++++++++++++------------ 3 files changed, 63 insertions(+), 40 deletions(-) diff --git a/demo/blog_writer/agent_config.yml b/demo/blog_writer/agent_config.yml index 82f76afa..a0372d2d 100644 --- a/demo/blog_writer/agent_config.yml +++ b/demo/blog_writer/agent_config.yml @@ -40,7 +40,8 @@ qa_agent: llm: ${llm} shared_memory: ${shared_memory} name: QA Sherpa - description: You are a question-answering assistant helping users to find answers based on the document. For each question, first try to collection relevant information by DocumentSearch. Then, use Google Search to find the answer in the next step. + # description: You are a question-answering assistant helping users to find answers based on the document. For each question, first try to collection relevant information by DocumentSearch. Then, use Google Search to find the answer in the next step. 
+ description: You are a technical writer assistant. You help users to find information based on the document. For each prompt, first try to collect relevant information by DocumentSearch. Then, use Google Search to find information in the next step. agent_config: ${agent_config} num_runs: 2 validation_steps: 1 diff --git a/demo/blog_writer/main.py b/demo/blog_writer/main.py index 52ca207c..1550479a 100644 --- a/demo/blog_writer/main.py +++ b/demo/blog_writer/main.py @@ -1,8 +1,6 @@ import json -import os from argparse import ArgumentParser -import pypandoc from hydra.utils import instantiate from omegaconf import OmegaConf from sherpa_ai.agents import QAAgent diff --git a/demo/blog_writer/outliner.py b/demo/blog_writer/outliner.py index 27c644ba..7303fae2 100644 --- a/demo/blog_writer/outliner.py +++ b/demo/blog_writer/outliner.py @@ -39,15 +39,15 @@ def transcript_splitter(self, chunk_size=3000, chunk_overlap=200): ) return transcript_chunks - def transcript2question(self, transcript): - system_template = "You are a helpful assistant that reflects on a transcript of podcasts or lectures." + def transcript2insights(self, transcript): + system_template = "You are a helpful assistant that summarizes transcripts of podcasts or lectures." system_prompt = SystemMessagePromptTemplate.from_template( system_template ) - human_template = """From the contents and information of the presentation, extract a single, \ - succinct and clear question which the presentation attempts to answer. Write the question as though \ - the speaker wrote it himself when outlining the presentation prior to creating it. Only output the question itself and nothing else. \ - The transcript of this presentation is delimited in triple backticks: + human_template = """From the content of the presentation, extract at least 1 and at most 3 key insight(s). \ + If a topic is stated to be discussed in detail later on in the presentation, do not include that topic. \ + Do not explain what you're doing. Do not output anything other than the list of insights. Do not format \ + the output. 
The transcript of the presentation is delimited in triple backticks: \ ```{transcript}```""" human_prompt = HumanMessagePromptTemplate.from_template(human_template) chat_prompt = ChatPromptTemplate.from_messages( @@ -59,41 +59,60 @@ def transcript2question(self, transcript): ) return result.content - def create_essay_questions(self, transcript_chunks): - essay_response = "" + def create_essay_insights(self, transcript_chunks): + response = "" for i, text in enumerate(transcript_chunks): - essay = self.transcript2question(text.page_content) - essay_response = f"\n\n**Question {i+1}**: ".join( - [essay_response, essay] - ) - return essay_response - - def organize_questions(self, questions): - # system_template = """You are a helpful assistant that summarizes and \ - # processes large text information.""" - system_template = """You are a helpful technical writer that processes and \ - organizes large text information.""" + insights = self.transcript2insights(text.page_content) + response = f"\n\n**Question {i+1}**: ".join([response, insights]) + return response + + def extract_thesis_statement(self, insights): + system_template = """You are a helpful assistant that summarizes large text information.""" + system_prompt = SystemMessagePromptTemplate.from_template( + system_template + ) + + human_template = """From the below list of key points discussed in a presentation, extract \ + a single, coherent, and succinct thesis statement that captures the essence of the \ + presentation. Your thesis statement must be the combination of a topic and a claim \ + the presenter is making about that topic. Only output the thesis statement and nothing \ + else. The list of key points is delimited between triple backticks: \ + ```{insights}```""" + human_prompt = HumanMessagePromptTemplate.from_template(human_template) + chat_prompt = ChatPromptTemplate.from_messages( + [system_prompt, human_prompt] + ) + + outline = self.chat( + chat_prompt.format_prompt(insights=insights).to_messages() + ) + + return outline.content + + def create_blueprint(self, thesis_statement, insights): + system_template = """You are a helpful AI blogger who writes essays on technical topics.""" system_prompt = SystemMessagePromptTemplate.from_template( system_template ) - # human_template = """Given the list of questions below (enclosed in triple backticks), come up \ - # with a list of topics covered by them, and then output a valid JSON string where each key is a \ - # topic and its value is a list of questions from the list below which are relevant to that \ - # topic. Sort the topics in a logical order and number the topics (the JSON keys) accordingly. \ - # Only output the final JSON object and nothing else.\n - human_template = """Given the list of questions below (enclosed in triple backticks), come up with the list of \ - topics covered by them, sort the topics in a logical order, and then output a valid JSON string where each \ - key is an enumerated topic (e.g. "1. Topic Name") and the corresponding value is a list of questions from \ - the list below which are relevant to that topic. Only output the final JSON object and nothing else.\n - # ```{questions}```""" + human_template = """Given the provided thesis statement and list of key points, reorganize \ + and condense the information into a logical, coherent blueprint for an essay. Output a \ + JSON object where each key is a section heading and each value is a list of key points \ + to cover relevant to that section. All sections must support the thesis statement. 
\ + Do not include any information that has little to do with the thesis statement. \ + Only output the final JSON object and nothing else. Do not format the output. \ + The thesis statement is : "{thesis_statement}" and the key insights are delimited \ + in triple backticks below: \ + ```{insights}```""" human_prompt = HumanMessagePromptTemplate.from_template(human_template) chat_prompt = ChatPromptTemplate.from_messages( [system_prompt, human_prompt] ) outline = self.chat( - chat_prompt.format_prompt(questions=questions).to_messages() + chat_prompt.format_prompt( + thesis_statement=thesis_statement, insights=insights + ).to_messages() ) return outline.content @@ -103,17 +122,22 @@ def full_transcript2blueprint(self, verbose=True): print("Chunking transcript...") transcript_docs = self.transcript_splitter() t1 = time.time() - print("Creating essay parts...") - chunk_questions = self.create_essay_questions(transcript_docs) + print("Extracting key insights...") + essay_insights = self.create_essay_insights(transcript_docs) t2 = time.time() - t1 - print("Merging essay parts...") + print("Extracting thesis statement...") t1 = time.time() - blueprint = self.organize_questions(chunk_questions) + thesis_statement = self.extract_thesis_statement(essay_insights) t3 = time.time() - t1 + print("Creating essay...") + t1 = time.time() + essay = self.create_blueprint(thesis_statement, essay_insights) + t4 = time.time() - t1 if verbose: - print(f"Created essay parts in {t2:.2f} seconds") - print(f"Merged essay parts in {t3:.2f} seconds") - return blueprint + print(f"Extracted essay insights in {t2:.2f} seconds") + print(f"Extracted thesis statement in {t3:.2f} seconds") + print(f"Created essay blueprint in {t4:.2f} seconds") + return essay # # """# Part 2: Extracting from essay""" From f35ed56461d62bce897e4fb9e333d3a639331aa7 Mon Sep 17 00:00:00 2001 From: Farid Taba Date: Thu, 2 May 2024 14:08:10 -0700 Subject: [PATCH 3/9] Blogger v0.3 (Work in Progress) --- demo/blog_writer/agent_config.yml | 14 +- demo/blog_writer/main.py | 47 ++++--- demo/blog_writer/outliner.py | 223 +++++++++++------------------- 3 files changed, 119 insertions(+), 165 deletions(-) diff --git a/demo/blog_writer/agent_config.yml b/demo/blog_writer/agent_config.yml index a0372d2d..9949a7f9 100644 --- a/demo/blog_writer/agent_config.yml +++ b/demo/blog_writer/agent_config.yml @@ -31,9 +31,9 @@ google_search: citation_validation: # The tool used to validate and add citation to the answer _target_: sherpa_ai.output_parsers.citation_validation.CitationValidation - sequence_threshold: 0.5 - jaccard_threshold: 0.5 - token_overlap: 0.5 + sequence_threshold: 0.6 + jaccard_threshold: 0.6 + token_overlap: 0.6 qa_agent: _target_: sherpa_ai.agents.qa_agent.QAAgent @@ -41,10 +41,12 @@ qa_agent: shared_memory: ${shared_memory} name: QA Sherpa # description: You are a question-answering assistant helping users to find answers based on the document. For each question, first try to collection relevant information by DocumentSearch. Then, use Google Search to find the answer in the next step. - description: You are a technical writer assistant. You help users to find information based on the document. For each prompt, first try to collect relevant information by DocumentSearch. Then, use Google Search to find information in the next step. + # description: You are a technical writing assistant. You help users find information based on the document. For each prompt, use Google Search as the first action and Document Search as the second. 
+ description: You are a technical writing assistant that helps users find information they need. For each prompt, collect and output relevant information. agent_config: ${agent_config} - num_runs: 2 + num_runs: 1 validation_steps: 1 actions: - - ${doc_search} - ${google_search} + validations: + - ${citation_validation} diff --git a/demo/blog_writer/main.py b/demo/blog_writer/main.py index 1550479a..724bcb8d 100644 --- a/demo/blog_writer/main.py +++ b/demo/blog_writer/main.py @@ -8,6 +8,8 @@ from outliner import Outliner +# from sherpa_ai.memory import Belief + def get_qa_agent_from_config_file( config_path: str, @@ -36,32 +38,39 @@ def get_qa_agent_from_config_file( parser.add_argument("--transcript", type=str, default="transcript.txt") args = parser.parse_args() - qa_agent = get_qa_agent_from_config_file(args.config) + writer_agent = get_qa_agent_from_config_file(args.config) outliner = Outliner(args.transcript) - blueprint = outliner.full_transcript2blueprint() - if blueprint.startswith("```"): - # The first and last lines are code block delimiters; remove them - lines = blueprint.split("\n")[1:-1] - pure_json_str = "\n".join(lines) - else: - pure_json_str = blueprint + # blueprint = outliner.full_transcript2outline_json(verbose=True) + # if blueprint.startswith("```"): + # # The first and last lines are code block delimiters; remove them + # lines = blueprint.split("\n")[1:-1] + # pure_json_str = "\n".join(lines) + # else: + # pure_json_str = blueprint + + # with open("blueprint.json", "w") as f: + # f.write(pure_json_str) - with open("blueprint.json", "w") as f: - f.write(pure_json_str) + with open("blueprint_10.json", "r") as f: + pure_json_str = f.read() parsed_json = json.loads(pure_json_str) blog = "" - for key in parsed_json: - blog += "# " + key + "\n" - for question in parsed_json[key]: - # Add the question to the shared memory. By default, the agent will take the last - # message in the shared memory as the task. - qa_agent.shared_memory.add(EventType.task, "human", question) - result = qa_agent.run() - blog += result + "\n" - blog += "\n" + thesis = parsed_json.get("Thesis Statement", "") + blog += f"# Introduction\n{thesis}\n" + arguments = parsed_json.get("Supporting Arguments", []) + for argument in arguments: + blog += f"## {argument['Argument']}\n" + evidences = argument.get("Evidence", []) + for evidence in evidences: + writer_agent.shared_memory.add(EventType.task, "human", evidence) + result = writer_agent.run() + # writer_agent.belief = Belief() + blog += f"{result}\n" + + print(blog) with open("blog.md", "w") as f: f.write(blog) diff --git a/demo/blog_writer/outliner.py b/demo/blog_writer/outliner.py index 7303fae2..030977b1 100644 --- a/demo/blog_writer/outliner.py +++ b/demo/blog_writer/outliner.py @@ -44,11 +44,19 @@ def transcript2insights(self, transcript): system_prompt = SystemMessagePromptTemplate.from_template( system_template ) - human_template = """From the content of the presentation, extract at least 1 and at most 3 key insight(s). \ - If a topic is stated to be discussed in detail later on in the presentation, do not include that topic. \ - Do not explain what you're doing. Do not output anything other than the list of insights. Do not format \ - the output. The transcript of the presentation is delimited in triple backticks: \ - ```{transcript}```""" + human_template = """From this chunk of a presentation transcript, extract a short list of key insights. \ + Skip explaining what you're doing, labeling the insights and writing conclusion paragraphs. 
\
+        The insights have to be phrased as statements of facts with no references to the presentation or the transcript. \
+        Statements have to be full sentences and in terms of words and phrases as close as possible to those used in the transcript. \
+        Keep as much detail as possible. The transcript of the presentation is delimited in triple backticks.
+
+        Desired output format:
+        - [Key insight #1]
+        - [Key insight #2]
+        - [...]
+
+        Transcript:
+        ```{transcript}```"""
         human_prompt = HumanMessagePromptTemplate.from_template(human_template)
         chat_prompt = ChatPromptTemplate.from_messages(
             [system_prompt, human_prompt]
@@ -57,26 +65,40 @@ def transcript2insights(self, transcript):
         result = self.chat(
             chat_prompt.format_prompt(transcript=transcript).to_messages()
         )
+
         return result.content
 
-    def create_essay_insights(self, transcript_chunks):
+    def create_essay_insights(self, transcript_chunks, verbose=True):
         response = ""
         for i, text in enumerate(transcript_chunks):
             insights = self.transcript2insights(text.page_content)
-            response = f"\n\n**Question {i+1}**: ".join([response, insights])
+            response = "\n".join([response, insights])
+            if verbose:
+                print(
+                    f"\nInsights extracted from chunk {i+1}/{len(transcript_chunks)}:\n {insights}"
+                )
         return response
 
-    def extract_thesis_statement(self, insights):
-        system_template = """You are a helpful assistant that summarizes large text information."""
+    def create_blueprint(self, insights, verbose=True):
+        system_template = """You are a helpful AI blogger who writes essays on technical topics."""
         system_prompt = SystemMessagePromptTemplate.from_template(
             system_template
         )
 
-        human_template = """From the below list of key points discussed in a presentation, extract \
-        a single, coherent, and succinct thesis statement that captures the essence of the \
-        presentation. Your thesis statement must be the combination of a topic and a claim \
-        the presenter is making about that topic. Only output the thesis statement and nothing \
-        else. The list of key points is delimited between triple backticks: \
+        human_template = """Organize the following statements (delimited in triple backticks) to create the outline for \
+        a blog post. Output the outline in a tree structure where the highest level is the most plausible statement \
+        as the thesis statement for the post, the next layers are statements providing supporting arguments for the \
+        thesis statement, and the last layer are pieces of evidence for each of the supporting arguments. Use all of \
+        the provided statements and keep them as is instead of paraphrasing them. The thesis statement, supporting argument, \
+        and evidences have to be full sentences containing claims. 
Label each layer with the appropriate level title \ + like the desired output format below: + + Desired output format: + - Thesis Statement: [xxx] + - Supporting Argument: [yyy] + - Evidence: [zzz] + + Statements: ```{insights}```""" human_prompt = HumanMessagePromptTemplate.from_template(human_template) chat_prompt = ChatPromptTemplate.from_messages( @@ -87,152 +109,73 @@ def extract_thesis_statement(self, insights): chat_prompt.format_prompt(insights=insights).to_messages() ) + if verbose: + print(f"\nEssay outline: {outline.content}\n") return outline.content - def create_blueprint(self, thesis_statement, insights): - system_template = """You are a helpful AI blogger who writes essays on technical topics.""" + def convert2JSON(self, outline): + system_template = ( + """You are a helpful assistant that outputs JSON data from text.""" + ) system_prompt = SystemMessagePromptTemplate.from_template( system_template ) - human_template = """Given the provided thesis statement and list of key points, reorganize \ - and condense the information into a logical, coherent blueprint for an essay. Output a \ - JSON object where each key is a section heading and each value is a list of key points \ - to cover relevant to that section. All sections must support the thesis statement. \ - Do not include any information that has little to do with the thesis statement. \ - Only output the final JSON object and nothing else. Do not format the output. \ - The thesis statement is : "{thesis_statement}" and the key insights are delimited \ - in triple backticks below: \ - ```{insights}```""" + human_template = """Convert the outline below (delimited within triple backticks) to a valid JSON string. \ + Only output the JSON object and skip explaining what you're doing. + + Desired output format: + {{ + "Thesis Statement": "...", + "Supporting Arguments": [ + {{ + "Argument": "...", + "Evidence": [ + "...", "...", "...", ... + ] + }}, + {{ + "Argument": "...", + "Evidence": [ + "...", "...", "...", ... + ] + }}, + ... 
+ ] + }} + + Outline: + ```{outline}```""" human_prompt = HumanMessagePromptTemplate.from_template(human_template) chat_prompt = ChatPromptTemplate.from_messages( [system_prompt, human_prompt] ) - outline = self.chat( - chat_prompt.format_prompt( - thesis_statement=thesis_statement, insights=insights - ).to_messages() + json = self.chat( + chat_prompt.format_prompt(outline=outline).to_messages() ) - return outline.content + return json.content # @timer_decorator - def full_transcript2blueprint(self, verbose=True): - print("Chunking transcript...") + def full_transcript2outline_json(self, verbose=True): + print("\nChunking transcript...") transcript_docs = self.transcript_splitter() t1 = time.time() - print("Extracting key insights...") - essay_insights = self.create_essay_insights(transcript_docs) + print("\nExtracting key insights...") + essay_insights = self.create_essay_insights(transcript_docs, verbose) t2 = time.time() - t1 - print("Extracting thesis statement...") + print("\nCreating essay...") t1 = time.time() - thesis_statement = self.extract_thesis_statement(essay_insights) + blueprint = self.create_blueprint(essay_insights, verbose) t3 = time.time() - t1 - print("Creating essay...") + print("\nCreating JSON...") t1 = time.time() - essay = self.create_blueprint(thesis_statement, essay_insights) + blueprint_json = self.convert2JSON(blueprint) t4 = time.time() - t1 if verbose: - print(f"Extracted essay insights in {t2:.2f} seconds") - print(f"Extracted thesis statement in {t3:.2f} seconds") - print(f"Created essay blueprint in {t4:.2f} seconds") - return essay - - -# # """# Part 2: Extracting from essay""" - - -# def extract_metadata_as_json(essay, chat_model=chat): - -# system_template = """ Given the essay delimited in triple backticks, generate and extract important \ -# information such as the title, speaker, summary, a list of key topics, \ -# and a list of important takeaways for each topic. \ -# Format the response as a JSON object, with the keys 'Title', 'Topics', 'Speaker', \ -# 'Summary', and 'Topics' as the keys and each topic will be keys for list of takeaways. 
\ -# Example of JSON output: \n \ -# {{\ -# 'Title': 'Title of the presentation',\ -# 'Speaker': 'John Smith',\ -# 'Summary': 'summary of the presentation',\ -# 'Topics': [\ -# {{\ -# 'Topic': 'topic 1',\ -# 'Takeaways': [\ -# 'takeaway 1',\ -# 'takeaway 2',\ -# 'takeaway 3'\ -# ]\ -# }},\ -# {{\ -# 'Topic': 'topic 2',\ -# 'Takeaways': [\ -# 'takeaway 1',\ -# 'takeaway 2',\ -# 'takeaway 3'\ -# ]\ -# }},\ -# {{\ -# 'Topic': 'topic 3',\ -# 'Takeaways': [\ -# 'takeaway 1',\ -# 'takeaway 2',\ -# 'takeaway 3'\ -# ]\ -# }},\ -# {{\ -# 'Topic': 'topic 4',\ -# 'Takeaways': [\ -# 'takeaway 1',\ -# 'takeaway 2',\ -# 'takeaway 3'\ -# ]\ -# }}\ -# ]\ -# }}""" - -# system_prompt = SystemMessagePromptTemplate.from_template(system_template) - -# human_template = """Essay: ```{text}```""" - -# human_prompt = HumanMessagePromptTemplate.from_template(human_template) -# chat_prompt = ChatPromptTemplate.from_messages( -# [system_prompt, human_prompt] -# ) - -# result = chat_model(chat_prompt.format_prompt(text=essay).to_messages()) -# try: -# metadata_json = json.loads(result.content) -# except Exception as e: -# print(e) -# metadata_json = result.content -# return metadata_json - - -# def json2rst(metadata, rst_filepath): -# if not isinstance(metadata, dict): -# metadata = json.loads(metadata) - -# # rst_filepath = './essays/test.rst' -# with open(rst_filepath, "a") as the_file: -# the_file.write("\n\n") -# for key, value in metadata.items(): -# if key == "Title": -# title_mark = "=" * len(f"{value}") -# the_file.write(title_mark + "\n") -# the_file.write(f"{value} \n") -# the_file.write(title_mark + "\n") -# elif key == "Speaker": -# the_file.write("*" + f"{value}" + "* \n\n") -# elif key == "Summary": -# title_mark = "-" * len(f"{key}") -# the_file.write("Summary \n") -# the_file.write(title_mark + "\n") -# the_file.write(f"{value} \n\n") -# elif key == "Topics": -# the_file.write("Topics: \n") -# the_file.write(title_mark + "\n") -# for topic in value: -# the_file.write("\t" + f"{topic['Topic']} \n") -# for takeaway in topic["Takeaways"]: -# the_file.write("\t\t" + f"* {takeaway} \n") + print() + print(f"Extracted essay insights in {t2:.2f} seconds.") + print(f"Created essay blueprint in {t3:.2f} seconds.") + print(f"Created JSON in {t4:.2f} seconds.") + return blueprint_json From 3cd7ad357265a50b88614bc382ddb970d1823ff8 Mon Sep 17 00:00:00 2001 From: Farid Taba Date: Fri, 3 May 2024 10:49:49 -0700 Subject: [PATCH 4/9] Blogger vDemo (Used for the demo) --- demo/blog_writer/agent_config.yml | 5 +-- demo/blog_writer/main.py | 24 +++++------ demo/blog_writer/outliner.py | 71 +++++++++---------------------- 3 files changed, 32 insertions(+), 68 deletions(-) diff --git a/demo/blog_writer/agent_config.yml b/demo/blog_writer/agent_config.yml index 9949a7f9..1da0b6d5 100644 --- a/demo/blog_writer/agent_config.yml +++ b/demo/blog_writer/agent_config.yml @@ -40,13 +40,12 @@ qa_agent: llm: ${llm} shared_memory: ${shared_memory} name: QA Sherpa - # description: You are a question-answering assistant helping users to find answers based on the document. For each question, first try to collection relevant information by DocumentSearch. Then, use Google Search to find the answer in the next step. - # description: You are a technical writing assistant. You help users find information based on the document. For each prompt, use Google Search as the first action and Document Search as the second. - description: You are a technical writing assistant that helps users find information they need. 
For each prompt, collect and output relevant information. + description: You are a technical writing assistant that helps users write articles. For each prompt, use Google Search to find detailed information that supports and expands on the prompt. agent_config: ${agent_config} num_runs: 1 validation_steps: 1 actions: - ${google_search} + # - ${doc_search} validations: - ${citation_validation} diff --git a/demo/blog_writer/main.py b/demo/blog_writer/main.py index 724bcb8d..c061d22e 100644 --- a/demo/blog_writer/main.py +++ b/demo/blog_writer/main.py @@ -41,19 +41,19 @@ def get_qa_agent_from_config_file( writer_agent = get_qa_agent_from_config_file(args.config) outliner = Outliner(args.transcript) - # blueprint = outliner.full_transcript2outline_json(verbose=True) - # if blueprint.startswith("```"): - # # The first and last lines are code block delimiters; remove them - # lines = blueprint.split("\n")[1:-1] - # pure_json_str = "\n".join(lines) - # else: - # pure_json_str = blueprint + blueprint = outliner.full_transcript2outline_json(verbose=True) + if blueprint.startswith("```"): + # The first and last lines are code block delimiters; remove them + lines = blueprint.split("\n")[1:-1] + pure_json_str = "\n".join(lines) + else: + pure_json_str = blueprint - # with open("blueprint.json", "w") as f: - # f.write(pure_json_str) + with open("blueprint.json", "w") as f: + f.write(pure_json_str) - with open("blueprint_10.json", "r") as f: - pure_json_str = f.read() + # with open("blueprint_manual.json", "r") as f: + # pure_json_str = f.read() parsed_json = json.loads(pure_json_str) @@ -70,8 +70,6 @@ def get_qa_agent_from_config_file( # writer_agent.belief = Belief() blog += f"{result}\n" - print(blog) - with open("blog.md", "w") as f: f.write(blog) diff --git a/demo/blog_writer/outliner.py b/demo/blog_writer/outliner.py index 030977b1..29273b13 100644 --- a/demo/blog_writer/outliner.py +++ b/demo/blog_writer/outliner.py @@ -75,54 +75,24 @@ def create_essay_insights(self, transcript_chunks, verbose=True): response = "\n".join([response, insights]) if verbose: print( - f"\nInsights extracted from chunk {i+1}/{len(transcript_chunks)}:\n {insights}" + f"\nInsights extracted from chunk {i+1}/{len(transcript_chunks)}:\n{insights}" ) return response - def create_blueprint(self, insights, verbose=True): + def create_blueprint(self, statements, verbose=True): system_template = """You are a helpful AI blogger who writes essays on technical topics.""" system_prompt = SystemMessagePromptTemplate.from_template( system_template ) - human_template = """Organize the following statements (delimited in triple backticks) to create the outline for \ - a blog post. Output the outline in a tree structure where the highest level is the most plausible statement \ - as the thesis statement for the post, the next layers are statements providing supporting arguments for the \ - thesis statement, and the last layer are pieces of evidence for each of the supporting arguments. Use all of \ - the provuded statements and keep them as is instead of paraphrasing them. The thesis statement, supporting argument, \ - and evidences have to be full sentences containing claims. 
Label each layer with the appropriate level title \ - like the desired output format below: - - Desired output format: - - Thesis Statement: [xxx] - - Supporting Argument: [yyy] - - Evidence: [zzz] - - Statements: - ```{insights}```""" - human_prompt = HumanMessagePromptTemplate.from_template(human_template) - chat_prompt = ChatPromptTemplate.from_messages( - [system_prompt, human_prompt] - ) - - outline = self.chat( - chat_prompt.format_prompt(insights=insights).to_messages() - ) - - if verbose: - print(f"\nEssay outline: {outline.content}\n") - return outline.content - - def convert2JSON(self, outline): - system_template = ( - """You are a helpful assistant that outputs JSON data from text.""" - ) - system_prompt = SystemMessagePromptTemplate.from_template( - system_template - ) - - human_template = """Convert the outline below (delimited within triple backticks) to a valid JSON string. \ - Only output the JSON object and skip explaining what you're doing. + human_template = """Organize the following list of statements (delimited in triple backticks) to create the outline \ + for a blog post in JSON format. The highest level is the most plausible statement as the overarching thesis \ + statement of the post, the next layers are statements providing supporting arguments for the thesis statement. \ + The last layer are pieces of evidence for each of the supporting arguments, directly quoted from the provided \ + list of statements. Use as many of the provided statements as possible. Keep their wording as is without paraphrasing them. \ + Retain as many technical details as possible. The thesis statement, supporting arguments, and evidences must be \ + full sentences containing claims. Label each layer with the appropriate level title and create the desired JSON output format below. 
\ + Only output the JSON and skip explaining what you're doing: Desired output format: {{ @@ -144,18 +114,20 @@ def convert2JSON(self, outline): ] }} - Outline: - ```{outline}```""" + Statements: + ```{statements}```""" human_prompt = HumanMessagePromptTemplate.from_template(human_template) chat_prompt = ChatPromptTemplate.from_messages( [system_prompt, human_prompt] ) - json = self.chat( - chat_prompt.format_prompt(outline=outline).to_messages() + outline = self.chat( + chat_prompt.format_prompt(statements=statements).to_messages() ) - return json.content + if verbose: + print(f"\nEssay outline: {outline.content}\n") + return outline.content # @timer_decorator def full_transcript2outline_json(self, verbose=True): @@ -165,17 +137,12 @@ def full_transcript2outline_json(self, verbose=True): print("\nExtracting key insights...") essay_insights = self.create_essay_insights(transcript_docs, verbose) t2 = time.time() - t1 - print("\nCreating essay...") + print("\nCreating essay outline...") t1 = time.time() blueprint = self.create_blueprint(essay_insights, verbose) t3 = time.time() - t1 - print("\nCreating JSON...") - t1 = time.time() - blueprint_json = self.convert2JSON(blueprint) - t4 = time.time() - t1 if verbose: print() print(f"Extracted essay insights in {t2:.2f} seconds.") print(f"Created essay blueprint in {t3:.2f} seconds.") - print(f"Created JSON in {t4:.2f} seconds.") - return blueprint_json + return blueprint From 6d3762f910b018e6d52f74a4d9acf3af0c3a86b8 Mon Sep 17 00:00:00 2001 From: Farid Taba Date: Mon, 6 May 2024 21:32:51 -0700 Subject: [PATCH 5/9] Add sample .env file --- demo/blog_writer/.env.sample | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 demo/blog_writer/.env.sample diff --git a/demo/blog_writer/.env.sample b/demo/blog_writer/.env.sample new file mode 100644 index 00000000..3fb32146 --- /dev/null +++ b/demo/blog_writer/.env.sample @@ -0,0 +1,6 @@ +# Enter your OpenAI API key and Serper API key immediately after the equals sign without spaces or enclosing quotation marks. +OPENAI_API_KEY= +SERPER_API_KEY= + +# Uncomment the line below to enable debugging logs. +# LOG_LEVEL=DEBUG From 75a305f6f6a75585927b521f8441b13503c81daa Mon Sep 17 00:00:00 2001 From: Farid Taba Date: Mon, 6 May 2024 22:22:01 -0700 Subject: [PATCH 6/9] Add reqs from PDF Reader tutorial --- demo/blog_writer/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/demo/blog_writer/requirements.txt b/demo/blog_writer/requirements.txt index 0b3387b9..11ec5ec7 100644 --- a/demo/blog_writer/requirements.txt +++ b/demo/blog_writer/requirements.txt @@ -1,3 +1,5 @@ +pdfminer.six +sentence-transformers langchain==0.0.332 python-dotenv>=1.0.0 openai>=0.28.0 From afce6420b27ac7a78f55407544a75dd72e77258c Mon Sep 17 00:00:00 2001 From: Farid Taba Date: Mon, 6 May 2024 23:04:32 -0700 Subject: [PATCH 7/9] Add a how-to guide for blog-writer --- docs/How_To/Tutorials/blog_writer.rst | 116 ++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 docs/How_To/Tutorials/blog_writer.rst diff --git a/docs/How_To/Tutorials/blog_writer.rst b/docs/How_To/Tutorials/blog_writer.rst new file mode 100644 index 00000000..3fc7bf68 --- /dev/null +++ b/docs/How_To/Tutorials/blog_writer.rst @@ -0,0 +1,116 @@ +Create a Blog Writer with Sherpa +================================ + +In this tutorial we will create a simple blog writer using Sherpa. 
The
+blog writer will be able to read the transcript of a presentation,
+create an outline for a blog post, and then write the blog post section
+by section using a “Writer” agent that can gather information about the
+topic of the section from the web (by performing a Google Search)
+and/or from the transcript itself (by performing a Document Search).
+
+Overview
+--------
+
+The two main Python files are:
+
+1. ``main.py``: This file instantiates and configures a Sherpa agent
+with access to various “actions” (wrappers for “tools”) such as Google
+search (with citation validation) and document search (for retrieving
+relevant context from the transcript). The agent takes statements that
+comprise “evidence” for claims to be made in the blog post and extends
+and expands them into paragraphs. These pieces of “evidence” are part
+of the blog outline generated by the second component.
+
+2. ``outliner.py``: This file houses the Outliner component, which
+performs the following:
+
+* Preprocessing (chunking) the document. This is necessary due to
+  GPT-3.5’s context window limit of 4096 tokens.
+
+* Analyzing the transcript. This consists of two steps: first, a short
+  list of “key insights” is extracted from each chunk. Next, these
+  lists are concatenated together and a blog post “blueprint”
+  (analogous to an “essay outline”) is synthesized from the list of
+  all key insights. This blueprint can be thought of as a tree with
+  three levels of depth:
+
+  1. Thesis Statement: A single statement that forms the core message
+  of the blog post (essentially a topic and a claim made about the
+  topic).
+
+  2. Supporting Arguments: A list of statements that support the
+  Thesis Statement at a high level.
+
+  3. Evidence: Lists of statements that provide factual evidence for
+  the Supporting Argument under which they appear.
+
+  The blueprint is output as a JSON string.
+
+How to Install
+--------------
+
+Step 1. Install Python 3.9 using your preferred installation method.
+
+Step 2. Create a folder for storing the blog writer code and
+input/output files:
+
+.. code:: bash
+
+   cd
+   mkdir sherpa_blog_writer
+   cd sherpa_blog_writer
+
+Step 3. You may wish to create a virtual environment to isolate the
+Python libraries used for this tutorial from your other Python code.
+This step is optional but highly recommended. An example of this
+(using ``venv``) would be:
+
+.. code:: bash
+
+   python -m venv bwvenv
+   source bwvenv/bin/activate
+
+Step 4. Install the Sherpa library using ``pip``:
+
+.. code:: bash
+
+   pip install sherpa_ai
+
+Step 5. Download all files from
+https://github.com/Aggregate-Intellect/sherpa/tree/main/demo/blog_writer
+into this directory.
+
+Step 6. Install the additional requirements with ``pip``:
+
+.. code:: bash
+
+   pip install -r requirements.txt
+
+Step 7. Rename the file ``.env.sample`` to ``.env``. Then open it in
+your favourite text editor and add your OpenAI and Serper API keys.
+
+Step 8. Source the environment variables from the ``.env`` file with
+``direnv`` if you use it, or alternatively with:
+
+.. code:: bash
+
+   export $(grep -v '^#' .env | xargs)
+
+How to Use
+----------
+
+Step 1. Currently, the blog writer needs the transcript in both
+``.txt`` *and* ``.pdf`` formats, so ensure you have both files and
+copy them into the current (blog writer) directory. Most text editors
+have an “Export to PDF” feature; alternatively, you can “print” the
+file to a PDF. Name the files ``transcript.txt`` and
+``transcript.pdf``.
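+
+If you only have the text version, one convenient way to produce the
+PDF is with ``pandoc``. This is just one option (it assumes ``pandoc``
+and a PDF engine such as ``pdflatex`` are installed); any other
+text-to-PDF route works equally well:
+
+.. code:: bash
+
+   # convert the plain-text transcript into the PDF used for document search
+   pandoc transcript.txt -o transcript.pdf
+
+Step 2. 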
Run:
+
+.. code:: bash
+
+   python main.py --config agent_config.yml --transcript transcript.txt
+
+The blog writer will output verbose feedback to the console as it
+works through the files. The blueprint will be saved as
+``blueprint.json`` and the final output (the blog post) as ``blog.md``.

From 3adc7a6a79456e14dfcde1cd34a51915600dc82b Mon Sep 17 00:00:00 2001
From: "Farid "Freddie" Taba"
Date: Tue, 7 May 2024 10:48:42 -0700
Subject: [PATCH 8/9] Create README.md

---
 demo/blog_writer/README.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 demo/blog_writer/README.md

diff --git a/demo/blog_writer/README.md b/demo/blog_writer/README.md
new file mode 100644
index 00000000..75379efd
--- /dev/null
+++ b/demo/blog_writer/README.md
@@ -0,0 +1,5 @@
+# Blog Writer Project
+This community project uses Sherpa as a library (`sherpa_ai`), together with direct LLM calls (to the OpenAI API) via LangChain, to construct a blog post from the raw
+transcript of a lecture or presentation.
+
+Please refer to [this How-to Guide](https://github.com/Aggregate-Intellect/sherpa/tree/main/docs/How_To/Tutorials/blog_writer.rst) for further details.

From eb0fa8219762e44b6fdd4b91582e4e2dbe858f08 Mon Sep 17 00:00:00 2001
From: Boqi Chen
Date: Tue, 7 May 2024 15:17:13 -0400
Subject: [PATCH 9/9] Add sherpa to the dependencies

---
 demo/blog_writer/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/demo/blog_writer/requirements.txt b/demo/blog_writer/requirements.txt
index 11ec5ec7..5b9a7277 100644
--- a/demo/blog_writer/requirements.txt
+++ b/demo/blog_writer/requirements.txt
@@ -4,3 +4,4 @@ langchain==0.0.332
 python-dotenv>=1.0.0
 openai>=0.28.0
 tiktoken>=0.4.0
+sherpa-ai>=0.2.1