Merge pull request #68 from Eyobyb/feature/link-scraper-reconstructor
Link scraper for question
saminegash authored Aug 11, 2023
2 parents 02bff2a + 2776c0a commit b94ea09
Showing 4 changed files with 187 additions and 8 deletions.
10 changes: 9 additions & 1 deletion apps/slackbot/bolt_app.py
@@ -6,6 +6,8 @@
import os
from dotenv import load_dotenv
from flask import Flask, request

from scrape.prompt_reconstructor import PromptReconstructor
load_dotenv()
from langchain.chat_models import ChatOpenAI
from os import environ
@@ -66,12 +68,18 @@ def event_test(client, say, event):
thread_ts = event.get("thread_ts", None) or event["ts"]
replies = client.conversations_replies(channel=event['channel'], ts=thread_ts)
previous_messages = replies['messages'][:-1]

    # Reconstruct the question: if it contains links, rewrite them so they
    # carry scraped and summarized content from each link.
reconstructor = PromptReconstructor(question=question,
slack_message=[replies['messages'][-1]])
question = reconstructor.reconstruct_prompt()

results, verbose_message = get_response(question, previous_messages)
say(results, thread_ts=thread_ts)

if contains_verbose(question):
say(f"#verbose message: \n```{verbose_message}```", thread_ts=thread_ts)
say(f"#verbose message: \n```{verbose_message}```", thread_ts=thread_ts)

@app.event("app_home_opened")
def update_home_tab(client, event, logger):
9 changes: 4 additions & 5 deletions apps/slackbot/scrape/extract_github_readme.py
@@ -2,10 +2,12 @@
import base64
import re
from dotenv import dotenv_values

import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

from vectorstores import ConversationStore

env_vars = dotenv_values(".env")

# Access the variables
@@ -62,7 +64,4 @@ def save_to_pine_cone(content,metadatas):
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

vectorstore = ConversationStore("Github_data", index, embeddings, 'text')
    vectorstore.add_texts(content,metadatas)
62 changes: 62 additions & 0 deletions apps/slackbot/scrape/prompt_reconstructor.py
@@ -0,0 +1,62 @@

from scrape.extract_github_readme import extract_github_readme
from utils import chunk_and_summerize, count_string_tokens, get_link_from_slack_client_conversation, question_reconstructor, scarape_with_url
from os import environ

OPENAI_KEY = environ.get("OPENAI_KEY")


class PromptReconstructor:
def __init__(self, question, slack_message):
self.question = question
self.slack_message = slack_message

def reconstruct_prompt(self):
question = self.question
last_message = self.slack_message
last_message_links = get_link_from_slack_client_conversation(
last_message)

        # If the question contains links, scrape each one, summarize the
        # content with respect to the question, and append the summaries
        # to the question.

if len(last_message_links) > 0:
available_token = 3000 - \
count_string_tokens(question, "gpt-3.5-turbo")
per_scrape_token_size = available_token/len(last_message_links)
final_summary = []
for last_message_link in last_message_links:
link = last_message_link["url"]
scraped_data = ""
if 'github' in last_message_links[-1]['base_url']:
git_scraper = extract_github_readme(link)
if git_scraper:
scraped_data = {
"data": extract_github_readme(link), "status": 200}
else:
scraped_data = {"data": "", "status": 404}
else:
scraped_data = scarape_with_url(link)
                if scraped_data['status'] == 200:

chunk_summary = chunk_and_summerize(
link=link,
open_ai_key=OPENAI_KEY,
question=question,
text_data=scraped_data["data"]
)

while count_string_tokens(chunk_summary, "gpt-3.5-turbo") > per_scrape_token_size:

chunk_summary = chunk_and_summerize(
link=link,
open_ai_key=OPENAI_KEY,
question=question,
text_data=chunk_summary
)

final_summary.append({"data": chunk_summary, "link": link})

question = question_reconstructor(
question=question, data=final_summary)
return question
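
# End-to-end sketch with a hypothetical Slack payload (mirrors the bolt_app.py
# wiring above; the URL and block structure are illustrative, not from the commit):
#   reconstructor = PromptReconstructor(
#       question="what does <https://github.com/owner/repo> do?",
#       slack_message=[{"blocks": [{"elements": [{"elements": [
#           {"type": "link", "url": "https://github.com/owner/repo"}]}]}]}])
#   question = reconstructor.reconstruct_prompt()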
114 changes: 112 additions & 2 deletions apps/slackbot/utils.py
@@ -1,6 +1,14 @@
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredPDFLoader, UnstructuredMarkdownLoader
from langchain.llms import OpenAI
from langchain.text_splitter import TokenTextSplitter

from bs4 import BeautifulSoup
import tiktoken
import requests
import re
from urllib.parse import urlparse


def load_files(files: List[str]) -> List[Document]:
@@ -14,6 +22,108 @@ def load_files(files: List[str]) -> List[Document]:
else:
raise NotImplementedError(f"File type {f} not supported")
documents.extend(loader.load())

    print(documents)
    return documents


def get_links_from_string(text):
# Define the regular expression pattern to find links inside angle brackets
pattern = r'<([^>]*)>'

# Use re.findall to extract all matches of the pattern in the input string
matches = re.findall(pattern, text)

    # Keep only the matches that start with "http://" or "https://"
    links = []

for match in matches:
if match.startswith("http://") or match.startswith("https://"):
links.append({"url": match, "base_url": get_base_url(match)})
return links
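
# Slack wraps bare URLs in angle brackets, so for a hypothetical input:
#   get_links_from_string("see <https://example.com/docs> and <#C123|general>")
# the result is [{"url": "https://example.com/docs", "base_url": "https://example.com"}];
# the channel reference is skipped because it lacks an http(s) scheme.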


def get_base_url(link):
parsed_url = urlparse(link)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
return base_url


def get_link_from_slack_client_conversation(data):
links = []
for item in data:
if 'blocks' in item:
for block in item['blocks']:
if 'elements' in block:
for element in block['elements']:
for newElement in element['elements']:
if (newElement.get('type') == 'link'):
newUrl = newElement['url']
links.append(
{"url": newUrl,
"base_url": get_base_url(newUrl)
})
return links
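
# A sketch of the nested Block Kit payload shape the loops above assume
# (hypothetical data): each message carries "blocks", each block "elements",
# and each element a further "elements" list whose "link" entries hold URLs:
#   message = {"blocks": [{"elements": [{"elements": [
#       {"type": "link", "url": "https://github.com/owner/repo"}]}]}]}
#   get_link_from_slack_client_conversation([message])
#   # -> [{"url": "https://github.com/owner/repo", "base_url": "https://github.com"}]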


def scarape_with_url(url: str):
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
data = soup.get_text(strip=True)
status = response.status_code
if response.status_code == 200:
return {"data": data, "status": status}
else:
return {"data": "", "status": status}


def question_reconstructor(data: List[dict], question: str):
    result = question + ".\n Reference:"
    count = 1
    for chunk in data:
        # Slack renders links as <url>, so each one is swapped for a numbered
        # reference marker and its summarized data is appended below.
        chunk_link = f"<{chunk['link']}>"
        result = result.replace(chunk_link, f"[{count}]")
        result = result + \
            f""" [{count}] link: "{chunk['link']}" , link_data: {chunk['data']}"""
        count += 1

    return result
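
# Illustration with hypothetical values: for
#   question = 'how do I run <https://github.com/owner/repo>?'
#   data = [{"link": "https://github.com/owner/repo", "data": "a README summary"}]
# the link collapses to the marker [1] and its summary is appended:
#   'how do I run [1]?.\n Reference: [1] link: "https://github.com/owner/repo" , link_data: a README summary'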


def count_string_tokens(string: str, model_name: str) -> int:
"""
Returns the number of tokens in a text string.
Args:
string (str): The text string.
model_name (str): The name of the encoding to use. (e.g., "gpt-3.5-turbo")
Returns:
int: The number of tokens in the text string.
"""
encoding = tiktoken.encoding_for_model(model_name)
return len(encoding.encode(string))
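
# Quick check (the exact count depends on the encoding tiktoken maps to the
# model; "gpt-3.5-turbo" resolves to cl100k_base):
#   count_string_tokens("hello world", "gpt-3.5-turbo")  # -> 2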


def chunk_and_summerize(text_data: str, question: str, open_ai_key: str, link: str):

llm = OpenAI(temperature=0.9, openai_api_key=open_ai_key)
instruction = f"include any information that can be used to answer the question '{question}' the given literal text is a data from the link {link}. Do not directly answer the question itself"

text_splitter = TokenTextSplitter(chunk_size=3000, chunk_overlap=0)
chunked_text = text_splitter.split_text(text_data)
chunk_summary = []
for text in chunked_text:

        summarized = llm.predict(
            f"""Write a concise summary of the following text.
            {instruction}
            \n\n\n
            LITERAL TEXT: {text}
            \n\n\n
            CONCISE SUMMARY: The text is best summarized as""")
        chunk_summary.append(summarized)

return " ".join(chunk_summary)
