Merge pull request #68 from Eyobyb/feature/link-scraper-reconstructor
Link scraper for question
saminegash authored Aug 11, 2023
2 parents 02bff2a + 2776c0a commit b94ea09
Showing 4 changed files with 187 additions and 8 deletions.
10 changes: 9 additions & 1 deletion apps/slackbot/bolt_app.py
@@ -6,6 +6,8 @@
import os
from dotenv import load_dotenv
from flask import Flask, request

from scrape.prompt_reconstructor import PromptReconstructor
load_dotenv()
from langchain.chat_models import ChatOpenAI
from os import environ
@@ -66,12 +68,18 @@ def event_test(client, say, event):
thread_ts = event.get("thread_ts", None) or event["ts"]
replies = client.conversations_replies(channel=event['channel'], ts=thread_ts)
previous_messages = replies['messages'][:-1]

    # Reconstruct the question: if it contains links, rewrite them so they
    # carry scraped and summarized content from each link.
reconstructor = PromptReconstructor(question=question,
slack_message=[replies['messages'][-1]])
question = reconstructor.reconstruct_prompt()

results, verbose_message = get_response(question, previous_messages)
say(results, thread_ts=thread_ts)

if contains_verbose(question):
say(f"#verbose message: \n```{verbose_message}```", thread_ts=thread_ts)
say(f"#verbose message: \n```{verbose_message}```", thread_ts=thread_ts)

@app.event("app_home_opened")
def update_home_tab(client, event, logger):
9 changes: 4 additions & 5 deletions apps/slackbot/scrape/extract_github_readme.py
@@ -2,10 +2,12 @@
import base64
import re
from dotenv import dotenv_values

import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

from vectorstores import ConversationStore

env_vars = dotenv_values(".env")

# Access the variables
@@ -62,7 +64,4 @@ def save_to_pine_cone(content,metadatas):
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

vectorstore = ConversationStore("Github_data", index, embeddings, 'text')
    vectorstore.add_texts(content,metadatas)
62 changes: 62 additions & 0 deletions apps/slackbot/scrape/prompt_reconstructor.py
@@ -0,0 +1,62 @@

from scrape.extract_github_readme import extract_github_readme
from utils import chunk_and_summerize, count_string_tokens, get_link_from_slack_client_conversation, question_reconstructor, scarape_with_url
from os import environ

OPENAI_KEY = environ.get("OPENAI_KEY")


class PromptReconstructor:
def __init__(self, question, slack_message):
self.question = question
self.slack_message = slack_message

def reconstruct_prompt(self):
question = self.question
last_message = self.slack_message
last_message_links = get_link_from_slack_client_conversation(
last_message)

        # If the question contains links, scrape each one, summarize the
        # content with respect to the question, and append the summaries
        # to the question.

if len(last_message_links) > 0:
available_token = 3000 - \
count_string_tokens(question, "gpt-3.5-turbo")
per_scrape_token_size = available_token/len(last_message_links)
final_summary = []
for last_message_link in last_message_links:
link = last_message_link["url"]
scraped_data = ""
if 'github' in last_message_links[-1]['base_url']:
git_scraper = extract_github_readme(link)
if git_scraper:
scraped_data = {
"data": extract_github_readme(link), "status": 200}
else:
scraped_data = {"data": "", "status": 404}
else:
scraped_data = scarape_with_url(link)
                if scraped_data['status'] == 200:

chunk_summary = chunk_and_summerize(
link=link,
open_ai_key=OPENAI_KEY,
question=question,
text_data=scraped_data["data"]
)

while count_string_tokens(chunk_summary, "gpt-3.5-turbo") > per_scrape_token_size:

chunk_summary = chunk_and_summerize(
link=link,
open_ai_key=OPENAI_KEY,
question=question,
text_data=chunk_summary
)

final_summary.append({"data": chunk_summary, "link": link})

question = question_reconstructor(
question=question, data=final_summary)
return question
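
# End-to-end sketch with a hypothetical Slack payload (mirrors the bolt_app.py
# wiring above; the URL and block structure are illustrative, not from the commit):
#   reconstructor = PromptReconstructor(
#       question="what does <https://github.com/owner/repo> do?",
#       slack_message=[{"blocks": [{"elements": [{"elements": [
#           {"type": "link", "url": "https://github.com/owner/repo"}]}]}]}])
#   question = reconstructor.reconstruct_prompt()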
114 changes: 112 additions & 2 deletions apps/slackbot/utils.py
@@ -1,6 +1,14 @@
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredPDFLoader, UnstructuredMarkdownLoader
from langchain.llms import OpenAI
from langchain.text_splitter import TokenTextSplitter

from bs4 import BeautifulSoup
import tiktoken
import requests
import re
from urllib.parse import urlparse


def load_files(files: List[str]) -> List[Document]:
@@ -14,6 +22,108 @@ def load_files(files: List[str]) -> List[Document]:
else:
raise NotImplementedError(f"File type {f} not supported")
documents.extend(loader.load())

    print(documents)
    return documents


def get_links_from_string(text):
# Define the regular expression pattern to find links inside angle brackets
pattern = r'<([^>]*)>'

# Use re.findall to extract all matches of the pattern in the input string
matches = re.findall(pattern, text)

    # Keep only the matches that start with "http://" or "https://"
    links = []

for match in matches:
if match.startswith("http://") or match.startswith("https://"):
links.append({"url": match, "base_url": get_base_url(match)})
return links
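
# Slack wraps bare URLs in angle brackets, so for a hypothetical input:
#   get_links_from_string("see <https://example.com/docs> and <#C123|general>")
# the result is [{"url": "https://example.com/docs", "base_url": "https://example.com"}];
# the channel reference is skipped because it lacks an http(s) scheme.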


def get_base_url(link):
parsed_url = urlparse(link)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
return base_url


def get_link_from_slack_client_conversation(data):
links = []
for item in data:
if 'blocks' in item:
for block in item['blocks']:
if 'elements' in block:
for element in block['elements']:
for newElement in element['elements']:
if (newElement.get('type') == 'link'):
newUrl = newElement['url']
links.append(
{"url": newUrl,
"base_url": get_base_url(newUrl)
})
return links
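
# A sketch of the nested Block Kit payload shape the loops above assume
# (hypothetical data): each message carries "blocks", each block "elements",
# and each element a further "elements" list whose "link" entries hold URLs:
#   message = {"blocks": [{"elements": [{"elements": [
#       {"type": "link", "url": "https://github.com/owner/repo"}]}]}]}
#   get_link_from_slack_client_conversation([message])
#   # -> [{"url": "https://github.com/owner/repo", "base_url": "https://github.com"}]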


def scarape_with_url(url: str):
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
data = soup.get_text(strip=True)
status = response.status_code
if response.status_code == 200:
return {"data": data, "status": status}
else:
return {"data": "", "status": status}


def question_reconstructor(data: List[dict], question: str):
    result = question + ".\n Reference:"
    count = 1
    for chunk in data:
        # Slack renders links as <url>, so each one is swapped for a numbered
        # reference marker and its summarized data is appended below.
        chunk_link = f"<{chunk['link']}>"
        result = result.replace(chunk_link, f"[{count}]")
        result = result + \
            f""" [{count}] link: "{chunk['link']}" , link_data: {chunk['data']}"""
        count += 1

    return result
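
# Illustration with hypothetical values: for
#   question = 'how do I run <https://github.com/owner/repo>?'
#   data = [{"link": "https://github.com/owner/repo", "data": "a README summary"}]
# the link collapses to the marker [1] and its summary is appended:
#   'how do I run [1]?.\n Reference: [1] link: "https://github.com/owner/repo" , link_data: a README summary'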


def count_string_tokens(string: str, model_name: str) -> int:
"""
Returns the number of tokens in a text string.
Args:
string (str): The text string.
model_name (str): The name of the encoding to use. (e.g., "gpt-3.5-turbo")
Returns:
int: The number of tokens in the text string.
"""
encoding = tiktoken.encoding_for_model(model_name)
return len(encoding.encode(string))
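
# Quick check (the exact count depends on the encoding tiktoken maps to the
# model; "gpt-3.5-turbo" resolves to cl100k_base):
#   count_string_tokens("hello world", "gpt-3.5-turbo")  # -> 2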


def chunk_and_summerize(text_data: str, question: str, open_ai_key: str, link: str):

llm = OpenAI(temperature=0.9, openai_api_key=open_ai_key)
instruction = f"include any information that can be used to answer the question '{question}' the given literal text is a data from the link {link}. Do not directly answer the question itself"

text_splitter = TokenTextSplitter(chunk_size=3000, chunk_overlap=0)
chunked_text = text_splitter.split_text(text_data)
chunk_summary = []
for text in chunked_text:

        summarized = llm.predict(
            f"""Write a concise summary of the following text.
            {instruction}
            \n\n\n
            LITERAL TEXT: {text}
            \n\n\n
            CONCISE SUMMARY: The text is best summarized as""")
        chunk_summary.append(summarized)

return " ".join(chunk_summary)
