diff --git a/demo/blog_writer/.env.sample b/demo/blog_writer/.env.sample
new file mode 100644
index 00000000..3fb32146
--- /dev/null
+++ b/demo/blog_writer/.env.sample
@@ -0,0 +1,6 @@
+# Enter your OpenAI API key and Serper API key immediately after the equals sign, without spaces or enclosing quotation marks.
+OPENAI_API_KEY=
+SERPER_API_KEY=
+
+# Uncomment the line below to enable debugging logs.
+# LOG_LEVEL=DEBUG
diff --git a/demo/blog_writer/README.md b/demo/blog_writer/README.md
new file mode 100644
index 00000000..75379efd
--- /dev/null
+++ b/demo/blog_writer/README.md
@@ -0,0 +1,5 @@
+# Blog Writer Project
+This community project uses Sherpa as a library (`sherpa_ai`), together with direct LLM calls (to the OpenAI API) via LangChain, to construct a blog post from the raw
+transcript of a lecture or presentation.
+
+Please refer to [this How-to Guide](https://github.com/Aggregate-Intellect/sherpa/tree/main/docs/How_To/Tutorials/blog_writer.rst) for further details.
diff --git a/demo/blog_writer/actions.py b/demo/blog_writer/actions.py
new file mode 100644
index 00000000..c3f64383
--- /dev/null
+++ b/demo/blog_writer/actions.py
@@ -0,0 +1,55 @@
+from langchain.document_loaders import PDFMinerLoader
+from langchain.text_splitter import SentenceTransformersTokenTextSplitter
+from langchain.vectorstores.chroma import Chroma
+from loguru import logger
+
+from sherpa_ai.actions.base import BaseAction
+
+
+class DocumentSearch(BaseAction):
+    def __init__(self, filename, embedding_function, k=4):
+        # file name of the pdf
+        self.filename = filename
+        # the embedding function to use
+        self.embedding_function = embedding_function
+        # number of results to return in search
+        self.k = k
+
+        # load the pdf and create the vector store
+        self.chroma = Chroma(embedding_function=embedding_function)
+        documents = PDFMinerLoader(self.filename).load()
+        documents = SentenceTransformersTokenTextSplitter(chunk_overlap=0).split_documents(documents)
+
+        logger.info(f"Adding {len(documents)} documents to the vector store")
+        self.chroma.add_documents(documents)
+        logger.info("Finished adding documents to the vector store")
+
+    def execute(self, query):
+        """
+        Execute the action by searching the document store for the query
+
+        Args:
+            query (str): The query to search for
+
+        Returns:
+            str: The search results combined into a single string
+        """
+
+        results = self.chroma.search(query, search_type="mmr", k=self.k)
+        return "\n\n".join([result.page_content for result in results])
+
+    @property
+    def name(self) -> str:
+        """
+        The name of the action, used to describe the action to the agent.
+        """
+        return "DocumentSearch"
+
+    @property
+    def args(self) -> dict:
+        """
+        The arguments that the action takes, used to describe the action to the agent.
+        """
+        return {
+            "query": "string"
+        }
diff --git a/demo/blog_writer/agent_config.yml b/demo/blog_writer/agent_config.yml
new file mode 100644
index 00000000..1da0b6d5
--- /dev/null
+++ b/demo/blog_writer/agent_config.yml
@@ -0,0 +1,51 @@
+shared_memory:
+  _target_: sherpa_ai.memory.shared_memory.SharedMemory  # The absolute path to the shared memory class in the library
+  objective: Answer the question  # Objective for the agent; since this is a question answering agent, the objective is to answer questions
+
+agent_config:  # For the demo, the default configuration is used. You can change the configuration to suit your requirements
+  _target_: sherpa_ai.config.task_config.AgentConfig
+
+
+llm:  # Configuration for the LLM; here we use the OpenAI GPT-3.5-turbo model
+  _target_: langchain.chat_models.ChatOpenAI
+  model_name: gpt-3.5-turbo
+  temperature: 0
+
+embedding_func:
+  _target_: langchain.embeddings.SentenceTransformerEmbeddings
+  model_name: sentence-transformers/all-mpnet-base-v2
+
+doc_search:
+  _target_: actions.DocumentSearch
+  filename: transcript.pdf
+  embedding_function: ${embedding_func}
+  k: 4
+
+google_search:
+  _target_: sherpa_ai.actions.GoogleSearch
+  role_description: Act as a question answering agent
+  task: Question answering
+  llm: ${llm}
+  include_metadata: true
+  config: ${agent_config}
+
+citation_validation:  # The tool used to validate the answer and add citations
+  _target_: sherpa_ai.output_parsers.citation_validation.CitationValidation
+  sequence_threshold: 0.6
+  jaccard_threshold: 0.6
+  token_overlap: 0.6
+
+qa_agent:
+  _target_: sherpa_ai.agents.qa_agent.QAAgent
+  llm: ${llm}
+  shared_memory: ${shared_memory}
+  name: QA Sherpa
+  description: You are a technical writing assistant that helps users write articles. For each prompt, use Google Search to find detailed information that supports and expands on the prompt.
+  agent_config: ${agent_config}
+  num_runs: 1
+  validation_steps: 1
+  actions:
+    - ${google_search}
+    # - ${doc_search}
+  validations:
+    - ${citation_validation}
diff --git a/demo/blog_writer/main.py b/demo/blog_writer/main.py
new file mode 100644
index 00000000..c061d22e
--- /dev/null
+++ b/demo/blog_writer/main.py
@@ -0,0 +1,87 @@
+import json
+from argparse import ArgumentParser
+
+from hydra.utils import instantiate
+from omegaconf import OmegaConf
+from sherpa_ai.agents import QAAgent
+from sherpa_ai.events import EventType
+
+from outliner import Outliner
+
+# from sherpa_ai.memory import Belief
+
+
+def get_qa_agent_from_config_file(
+    config_path: str,
+) -> QAAgent:
+    """
+    Create a QAAgent from a config file.
+
+    Args:
+        config_path: Path to the config file
+
+    Returns:
+        QAAgent: A QAAgent instance
+    """
+
+    config = OmegaConf.load(config_path)
+
+    agent_config = instantiate(config.agent_config)
+    qa_agent: QAAgent = instantiate(config.qa_agent, agent_config=agent_config)
+
+    return qa_agent
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--config", type=str, default="agent_config.yml")
+    parser.add_argument("--transcript", type=str, default="transcript.txt")
+    args = parser.parse_args()
+
+    writer_agent = get_qa_agent_from_config_file(args.config)
+
+    outliner = Outliner(args.transcript)
+    blueprint = outliner.full_transcript2outline_json(verbose=True)
+    if blueprint.startswith("```"):
+        # The first and last lines are code block delimiters; remove them
+        lines = blueprint.split("\n")[1:-1]
+        pure_json_str = "\n".join(lines)
+    else:
+        pure_json_str = blueprint
+
+    with open("blueprint.json", "w") as f:
+        f.write(pure_json_str)
+
+    # with open("blueprint_manual.json", "r") as f:
+    #     pure_json_str = f.read()
+
+    parsed_json = json.loads(pure_json_str)
+
+    blog = ""
+    thesis = parsed_json.get("Thesis Statement", "")
+    blog += f"# Introduction\n{thesis}\n"
+    arguments = parsed_json.get("Supporting Arguments", [])
+    for argument in arguments:
+        blog += f"## {argument['Argument']}\n"
+        evidences = argument.get("Evidence", [])
+        for evidence in evidences:
+            writer_agent.shared_memory.add(EventType.task, "human", evidence)
+            result = writer_agent.run()
+            # writer_agent.belief = Belief()
+            blog += f"{result}\n"
+
+    with open("blog.md", "w") as f:
+        f.write(blog)
+
+    print("\nBlog generated successfully!\n")
+
+    # save_format = None
+    # while save_format is None:
+    #     save_format = input(
+    #         "Select format to save the blog in: 1. Markdown (Default) 2. ReStructured Text\n"
+    #     )
+
+    # if save_format == "2":
+    #     output = pypandoc.convert("blog.md", "rst")
+    #     if os.path.exists("blog.md"):
+    #         os.remove("blog.md")
diff --git a/demo/blog_writer/outliner.py b/demo/blog_writer/outliner.py
new file mode 100644
index 00000000..29273b13
--- /dev/null
+++ b/demo/blog_writer/outliner.py
@@ -0,0 +1,148 @@
+import os
+import time
+
+import tiktoken
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.text_splitter import MarkdownTextSplitter
+
+
+class Outliner:
+    def __init__(self, transcript_file) -> None:
+        with open(transcript_file, "r") as f:
+            self.raw_transcript = f.read()
+        # instantiate chat model
+        self.chat = ChatOpenAI(
+            openai_api_key=os.environ.get("OPENAI_API_KEY"),
+            temperature=0,
+            model="gpt-3.5-turbo",
+        )
+
+    def num_tokens_from_string(
+        self, string: str, encoding_name="cl100k_base"
+    ) -> int:
+        """Returns the number of tokens in a text string."""
+        encoding = tiktoken.get_encoding(encoding_name)
+        num_tokens = len(encoding.encode(string))
+        return num_tokens
+
+    def transcript_splitter(self, chunk_size=3000, chunk_overlap=200):
+        markdown_splitter = MarkdownTextSplitter(
+            chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
+        transcript_chunks = markdown_splitter.create_documents(
+            [self.raw_transcript]
+        )
+        return transcript_chunks
+
+    def transcript2insights(self, transcript):
+        system_template = "You are a helpful assistant that summarizes transcripts of podcasts or lectures."
+        system_prompt = SystemMessagePromptTemplate.from_template(
+            system_template
+        )
+        human_template = """From this chunk of a presentation transcript, extract a short list of key insights. \
+            Skip explaining what you're doing, labeling the insights, and writing conclusion paragraphs. \
+            The insights have to be phrased as statements of facts with no references to the presentation or the transcript. \
+            Statements have to be full sentences, using words and phrases as close as possible to those used in the transcript. \
+            Keep as much detail as possible. The transcript of the presentation is delimited in triple backticks.
+
+            Desired output format:
+            - [Key insight #1]
+            - [Key insight #2]
+            - [...]
+
+            Transcript:
+            ```{transcript}```"""
+        human_prompt = HumanMessagePromptTemplate.from_template(human_template)
+        chat_prompt = ChatPromptTemplate.from_messages(
+            [system_prompt, human_prompt]
+        )
+
+        result = self.chat(
+            chat_prompt.format_prompt(transcript=transcript).to_messages()
+        )
+
+        return result.content
+
+    def create_essay_insights(self, transcript_chunks, verbose=True):
+        response = ""
+        for i, text in enumerate(transcript_chunks):
+            insights = self.transcript2insights(text.page_content)
+            response = "\n".join([response, insights])
+            if verbose:
+                print(
+                    f"\nInsights extracted from chunk {i+1}/{len(transcript_chunks)}:\n{insights}"
+                )
+        return response
+
+    def create_blueprint(self, statements, verbose=True):
+        system_template = """You are a helpful AI blogger who writes essays on technical topics."""
+        system_prompt = SystemMessagePromptTemplate.from_template(
+            system_template
+        )
+
+        human_template = """Organize the following list of statements (delimited in triple backticks) to create the outline \
+            for a blog post in JSON format. The highest level is the most plausible statement as the overarching thesis \
+            statement of the post, the next layer contains statements providing supporting arguments for the thesis statement. \
+            The last layer contains pieces of evidence for each of the supporting arguments, directly quoted from the provided \
+            list of statements. Use as many of the provided statements as possible. Keep their wording as is without paraphrasing them. \
+            Retain as many technical details as possible. The thesis statement, supporting arguments, and pieces of evidence must be \
+            full sentences containing claims. Label each layer with the appropriate level title and create the desired JSON output format below. \
+            Only output the JSON and skip explaining what you're doing:
+
+            Desired output format:
+            {{
+            "Thesis Statement": "...",
+            "Supporting Arguments": [
+                {{
+                    "Argument": "...",
+                    "Evidence": [
+                        "...", "...", "...", ...
+                    ]
+                }},
+                {{
+                    "Argument": "...",
+                    "Evidence": [
+                        "...", "...", "...", ...
+                    ]
+                }},
+                ...
+ ] + }} + + Statements: + ```{statements}```""" + human_prompt = HumanMessagePromptTemplate.from_template(human_template) + chat_prompt = ChatPromptTemplate.from_messages( + [system_prompt, human_prompt] + ) + + outline = self.chat( + chat_prompt.format_prompt(statements=statements).to_messages() + ) + + if verbose: + print(f"\nEssay outline: {outline.content}\n") + return outline.content + + # @timer_decorator + def full_transcript2outline_json(self, verbose=True): + print("\nChunking transcript...") + transcript_docs = self.transcript_splitter() + t1 = time.time() + print("\nExtracting key insights...") + essay_insights = self.create_essay_insights(transcript_docs, verbose) + t2 = time.time() - t1 + print("\nCreating essay outline...") + t1 = time.time() + blueprint = self.create_blueprint(essay_insights, verbose) + t3 = time.time() - t1 + if verbose: + print() + print(f"Extracted essay insights in {t2:.2f} seconds.") + print(f"Created essay blueprint in {t3:.2f} seconds.") + return blueprint diff --git a/demo/blog_writer/requirements.txt b/demo/blog_writer/requirements.txt new file mode 100644 index 00000000..5b9a7277 --- /dev/null +++ b/demo/blog_writer/requirements.txt @@ -0,0 +1,7 @@ +pdfminer.six +sentence-transformers +langchain==0.0.332 +python-dotenv>=1.0.0 +openai>=0.28.0 +tiktoken>=0.4.0 +sherpa-ai >= 0.2.1 diff --git a/docs/How_To/Tutorials/blog_writer.rst b/docs/How_To/Tutorials/blog_writer.rst new file mode 100644 index 00000000..3fc7bf68 --- /dev/null +++ b/docs/How_To/Tutorials/blog_writer.rst @@ -0,0 +1,116 @@ +Create a Blog Writer with Sherpa +================================ + +In this tutorial we will create a simple blog writer using Sherpa. The +blog writer will be able to read the transcript of a presentation, +create an outline for a blog post, and then write the blog post section +by section using a “Writer” agent that can gather information about the +topic of the section from the web by performing a Google Search and/or +from the transcript itself by performing a Document Search. + +Overview +-------- + +The two main Python files are: + +1. ``main.py``: This file instantiates +and configures a Sherpa agent with access to various “actions” (wrappers +for “tools”) such as Google search (with citation validation) and +document search (for retrieving relevant context from the transcript). +The agent takes statements that comprise “evidence” for claims to be +made in the blog post and extends and expands them into paragraphs. +These pieces of “evidence” are part of the blog outline generated by the +second component. + +2. ``ouliner.py``: This file houses the Outliner +component which performs the following: + +* Preprocessing (chunking) the document. This is necessary due to GPT-3.5’s + context window limit of 4096 tokens. + +* Analyzing the transcript. This consists of two steps: First, a short list of + “key insights” is extracted from each chunk. In the next step, these lists are + concatenated together and a blog post “blueprint” (analogous to an “essay + outline”) is synthesized from the list of all key insights. This blueprint can + be thought of as a tree with three levels of depth: + + 1. Thesis Statement: A single statement that forms the core message of the + blog post (essentially a topic and a claim made about the topic). + + 2. Supporting Arguments: A list of statements + that support the Thesis Statement at a high level. + + 3. Evidence: Lists of + statements that provide factual evidence for the Supporting Argument + under which they appear. 
+
+How to Install
+--------------
+
+Step 1. Install Python 3.9 using your preferred installation method.
+
+Step 2. Create a folder for storing the blog writer code and
+input/output files:
+
+.. code:: bash
+
+   cd
+   mkdir sherpa_blog_writer
+   cd sherpa_blog_writer
+
+Step 3. You may wish to create a virtual environment to isolate the
+Python libraries used for this tutorial from your other Python code.
+This step is optional but highly recommended. An example of this (using
+``venv``) would be:
+
+.. code:: bash
+
+   python -m venv bwvenv
+   source bwvenv/bin/activate
+
+Step 4. Install the Sherpa library using ``pip``.
+
+.. code:: bash
+
+   pip install sherpa_ai
+
+Step 5. Download all files from
+https://github.com/Aggregate-Intellect/sherpa/tree/main/demo/blog_writer
+into this directory.
+
+Step 6. Install additional requirements with
+``pip``.
+
+.. code:: bash
+
+   pip install -r requirements.txt
+
+Step 7. Rename the file ``.env.sample`` to ``.env``. Then open it in
+your favourite text editor and add your OpenAI and Serper API keys.
+
+Step 8. Source the environment variables from the ``.env`` file with
+``direnv`` if you use it, or alternatively using:
+
+.. code:: bash
+
+   export $(grep -v '^#' .env | xargs)
+
+How to Use
+----------
+
+Step 1. Currently the blog writer needs the transcript in both ``.txt``
+*and* ``.pdf`` formats. So the first step is to ensure you have both
+files and copy them into the current (blog writer) directory. Most text
+editors will have an “Export to PDF” feature. Alternatively, you can
+“print” the file as a PDF. Name the files ``transcript.txt`` and
+``transcript.pdf``.
+
+Step 2. Run:
+
+.. code:: bash
+
+   python main.py --config agent_config.yml --transcript transcript.txt
+
+The blog writer will output verbose feedback to the console as it works
+through the files. The blueprint will be saved as ``blueprint.json`` and
+the final output (blog post) as ``blog.md``.
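+
+By default the agent gathers supporting information with Google Search
+only. To let it also retrieve context from the transcript PDF, you can
+enable the document search action by uncommenting the ``doc_search``
+entry under ``actions`` in ``agent_config.yml``, so that the section
+reads roughly:
+
+.. code:: yaml
+
+   actions:
+     - ${google_search}
+     - ${doc_search}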