diff --git a/airflow/dags/ingestion/ask-astro-load-github.py b/airflow/dags/ingestion/ask-astro-load-github.py index e5c00776..4e025055 100644 --- a/airflow/dags/ingestion/ask-astro-load-github.py +++ b/airflow/dags/ingestion/ask-astro-load-github.py @@ -52,13 +52,7 @@ def ask_astro_load_github(): .expand(source=markdown_docs_sources) ) - issues_docs = ( - task(github.extract_github_issues) - .partial(github_conn_id=_GITHUB_CONN_ID, cutoff_date=_GITHUB_ISSUE_CUTOFF_DATE) - .expand(repo_base=issues_docs_sources) - ) - - split_md_docs = task(chunking_utils.split_markdown).expand(dfs=[md_docs, issues_docs]) + split_md_docs = task(chunking_utils.split_markdown).expand(dfs=[md_docs]) _import_data = WeaviateDocumentIngestOperator.partial( class_name=WEAVIATE_CLASS, diff --git a/airflow/dags/ingestion/ask-astro-load.py b/airflow/dags/ingestion/ask-astro-load.py index e8e909ae..d3323d3c 100644 --- a/airflow/dags/ingestion/ask-astro-load.py +++ b/airflow/dags/ingestion/ask-astro-load.py @@ -21,16 +21,13 @@ _WEAVIATE_CONN_ID = f"weaviate_{ask_astro_env}" _GITHUB_CONN_ID = "github_ro" WEAVIATE_CLASS = os.environ.get("WEAVIATE_CLASS", "DocsDev") -_GITHUB_ISSUE_CUTOFF_DATE = os.environ.get("GITHUB_ISSUE_CUTOFF_DATE", "2022-1-1") markdown_docs_sources = [ {"doc_dir": "", "repo_base": "OpenLineage/docs"}, {"doc_dir": "", "repo_base": "OpenLineage/OpenLineage"}, ] -issues_docs_sources = [ - "apache/airflow", -] + slack_channel_sources = [ { "channel_name": "troubleshooting", @@ -150,7 +147,6 @@ def check_seed_baseline(seed_baseline_url: str = None) -> str | set: "extract_airflow_docs", "extract_stack_overflow", "extract_astro_registry_cell_types", - "extract_github_issues", "extract_astro_blogs", "extract_astro_registry_dags", "extract_astro_cli_docs", @@ -262,23 +258,6 @@ def extract_astro_forum_doc(): return [df] - @task(trigger_rule=TriggerRule.NONE_FAILED) - def extract_github_issues(repo_base: str): - from include.tasks.extract import github - - parquet_file = f"include/data/{repo_base}/issues.parquet" - - if os.path.isfile(parquet_file): - if os.access(parquet_file, os.R_OK): - df = pd.read_parquet(parquet_file) - else: - raise Exception("Parquet file exists locally but is not readable.") - else: - df = github.extract_github_issues(repo_base, _GITHUB_CONN_ID, _GITHUB_ISSUE_CUTOFF_DATE) - df.to_parquet(parquet_file) - - return df - @task(trigger_rule=TriggerRule.NONE_FAILED) def extract_astro_registry_cell_types(): from include.tasks.extract import registry @@ -396,7 +375,6 @@ def import_baseline( ) md_docs = extract_github_markdown.expand(source=markdown_docs_sources) - issues_docs = extract_github_issues.expand(repo_base=issues_docs_sources) stackoverflow_docs = extract_stack_overflow.expand(tag=stackoverflow_tags) registry_cells_docs = extract_astro_registry_cell_types() blogs_docs = extract_astro_blogs() @@ -415,7 +393,6 @@ def import_baseline( markdown_tasks = [ md_docs, - issues_docs, stackoverflow_docs, blogs_docs, registry_cells_docs, diff --git a/airflow/include/tasks/extract/astro_cli_docs.py b/airflow/include/tasks/extract/astro_cli_docs.py index b8ddd749..f2807889 100644 --- a/airflow/include/tasks/extract/astro_cli_docs.py +++ b/airflow/include/tasks/extract/astro_cli_docs.py @@ -19,15 +19,15 @@ def extract_astro_cli_docs() -> list[pd.DataFrame]: 'content': HTML content of the page 'sha': A UUID from the other fields """ - astronomer_base_url = "https://docs.astronomer.io" + astronomer_base_url = "https://www.astronomer.io/docs" astro_cli_overview_endpoint = "/astro/cli/overview" - 
response = requests.get(f"{astronomer_base_url}/{astro_cli_overview_endpoint}") + response = requests.get(f"{astronomer_base_url}{astro_cli_overview_endpoint}") soup = BeautifulSoup(response.text, "lxml") astro_cli_links = { - f"{astronomer_base_url}{link.get('href')}" + f"https://www.astronomer.io{link.get('href')}" for link in soup.find_all("a") - if link.get("href").startswith("/astro/cli") + if link.get("href").startswith("/docs/astro/cli") } df = pd.DataFrame(astro_cli_links, columns=["docLink"]) diff --git a/airflow/include/tasks/extract/astro_docs.py b/airflow/include/tasks/extract/astro_docs.py index e984d753..6d034e63 100644 --- a/airflow/include/tasks/extract/astro_docs.py +++ b/airflow/include/tasks/extract/astro_docs.py @@ -8,7 +8,7 @@ from include.tasks.extract.utils.html_utils import fetch_page_content, get_internal_links -base_url = "https://docs.astronomer.io/" +base_url = "https://www.astronomer.io/docs" def process_astro_doc_page_content(page_content: str) -> str: @@ -57,17 +57,18 @@ def process_astro_doc_page_content(page_content: str) -> str: def extract_astro_docs(base_url: str = base_url) -> list[pd.DataFrame]: """ - Extract documentation pages from docs.astronomer.io and its subdomains. + Extract documentation pages from www.astronomer.io/docs and its subpages. :return: A list of pandas dataframes with extracted data. """ - all_links = get_internal_links(base_url, exclude_literal=["learn/tags"]) + all_links = get_internal_links(base_url=base_url, exclude_literal=["learn/tags"], prefix_url=base_url) # for software references, we only want latest docs, ones with version number (old) is removed - old_version_doc_pattern = r"^https://docs\.astronomer\.io/software/\d+\.\d+/.+$" + old_version_doc_pattern = r"^https://www\.astronomer\.io/docs/software/\d+\.\d+/.+$" # remove duplicate xml files, we only want html pages non_doc_links = { - link if link.endswith("xml") or re.match(old_version_doc_pattern, link) else "" for link in all_links + link if link.endswith("xml") or re.match(old_version_doc_pattern, link) or not link.startswith(base_url) else "" + for link in all_links } docs_links = all_links - non_doc_links diff --git a/airflow/include/tasks/extract/blogs.py b/airflow/include/tasks/extract/blogs.py index 5dc0d90d..eb07db1d 100644 --- a/airflow/include/tasks/extract/blogs.py +++ b/airflow/include/tasks/extract/blogs.py @@ -52,9 +52,7 @@ def extract_astro_blogs(blog_cutoff_date: datetime) -> list[pd.DataFrame]: df = pd.DataFrame(links, columns=["docLink"]) df.drop_duplicates(inplace=True) df["content"] = df["docLink"].apply(lambda x: requests.get(x).content) - df["title"] = df["content"].apply( - lambda x: BeautifulSoup(x, "lxml").find(class_="post-card__meta").find(class_="title").get_text() - ) + df["title"] = df["content"].apply(lambda x: BeautifulSoup(x, "html").find(class_="hero__title").get_text()) df["content"] = df["content"].apply(lambda x: BeautifulSoup(x, "lxml").find(class_="prose").get_text()) df["content"] = df.apply(lambda x: blog_format.format(title=x.title, content=x.content), axis=1) diff --git a/airflow/include/tasks/extract/github.py b/airflow/include/tasks/extract/github.py index e7293fa4..037009b4 100644 --- a/airflow/include/tasks/extract/github.py +++ b/airflow/include/tasks/extract/github.py @@ -1,14 +1,11 @@ from __future__ import annotations import re -from datetime import datetime from pathlib import Path -from textwrap import dedent import pandas as pd import pypandoc from bs4 import BeautifulSoup -from weaviate.util import generate_uuid5 from airflow.providers.github.hooks.github import 
GithubHook @@ -179,110 +176,3 @@ def extract_github_python(source: dict, github_conn_id: str) -> pd.DataFrame: df = df[["docSource", "sha", "content", "docLink"]] return df - - -def extract_github_issues(repo_base: str, github_conn_id: str, cutoff_date: str = "2022-1-1") -> pd.DataFrame: - """ - This task downloads github issues as markdown documents in a pandas dataframe. Text from templated - auto responses for issues are removed while building a markdown document for each issue. - - param repo_base: The name of organization/repository (ie. "apache/airflow") from which to extract - issues. - type repo_base: str - - param github_conn_id: The connection ID to use with the GithubHook - param github_conn_id: str - param cutoff_date: The cutoff date (format: Y-m-d) to extract issues - - The returned data includes the following fields: - 'docSource': ie. 'astronomer/docs/astro', 'astronomer/docs/learn', etc. - 'sha': the github sha for the document - 'docLink': URL for the specific document in github. - 'content': Entire document content in markdown format. - - """ - - gh_hook = GithubHook(github_conn_id) - - repo = gh_hook.client.get_repo(repo_base) - issues = repo.get_issues(state="all", since=datetime.strptime(cutoff_date, "%Y-%m-%d")) - - issue_autoresponse_text = "Thanks for opening your first issue here!" - pr_autoresponse_text = "Congratulations on your first Pull Request and welcome to the Apache Airflow community!" - drop_content = [issue_autoresponse_text, pr_autoresponse_text] - - issues_drop_text = [ - dedent( - """ <\\!--\r - .*?Licensed to the Apache Software Foundation \\(ASF\\) under one.*?under the License\\.\r - -->""" - ), - "", - "", - r"\*\*\^ Add meaningful description above.*?newsfragments\)\.", - ] - - issue_markdown_template = dedent( - """ - ## ISSUE TITLE: {title} - DATE: {date} - BY: {user} - STATE: {state} - {body} - {comments}""" - ) - - comment_markdown_template = dedent( - """ - #### COMMENT: {user} on {date} - {body}\n""" - ) - - downloaded_docs = [] - page_num = 0 - - page = issues.get_page(page_num) - - while page: - for issue in page: - print(issue.number) - comments = [] - for comment in issue.get_comments(): - if not any(substring in comment.body for substring in drop_content): - comments.append( - comment_markdown_template.format( - user=comment.user.login, date=issue.created_at.strftime("%m-%d-%Y"), body=comment.body - ) - ) - downloaded_docs.append( - { - "docLink": issue.html_url, - "sha": "", - "content": issue_markdown_template.format( - title=issue.title, - date=issue.created_at.strftime("%m-%d-%Y"), - user=issue.user.login, - state=issue.state, - body=issue.body, - comments="\n".join(comments), - ), - "docSource": f"{repo_base}/issues", - } - ) - page_num = page_num + 1 - page = issues.get_page(page_num) - - df = pd.DataFrame(downloaded_docs) - - for _text in issues_drop_text: - df["content"] = df["content"].apply(lambda x: re.sub(_text, "", x, flags=re.DOTALL)) - - df["content"] = df["content"].apply(lambda x: re.sub(r"\r\n+", "\n\n", x).strip()) - df["content"] = df["content"].apply(lambda x: re.sub(r"\n+", "\n\n", x).strip()) - - df["sha"] = df.apply(generate_uuid5, axis=1) - - # column order matters for uuid generation - df = df[["docSource", "sha", "content", "docLink"]] - - return df diff --git a/airflow/include/tasks/extract/registry.py b/airflow/include/tasks/extract/registry.py index 5e209a1a..cf6a12c1 100644 --- a/airflow/include/tasks/extract/registry.py +++ b/airflow/include/tasks/extract/registry.py @@ -8,18 +8,59 @@ modules_url = 
"https://api.astronomer.io/registryV2/v1alpha1/organizations/public/modules?limit=1000" modules_link_template = "https://registry.astronomer.io/providers/{providerName}/versions/{version}/modules/{_name}" +module_info_url_template = "https://api.astronomer.io/registryV2/v1alpha1/organizations/public/providers/{provider_name}/versions/latest/modules/{module_name}" dags_url = "https://api.astronomer.io/registryV2/v1alpha1/organizations/public/dags?limit=1000" dags_link_template = "https://registry.astronomer.io/dags/{_name}/versions/{version}" -registry_cell_md_template = dedent( - """ - # Registry - ## Provider: {providerName} + +def get_individual_module_detail(provider_name, module_name): + data = requests.get(module_info_url_template.format(provider_name=provider_name, module_name=module_name)).json() + import_path = data["importPath"] + + module_name = data["name"] + version = data["version"] + provider_name = data["providerName"] + description = html2text(data["description"]).strip() if data["description"] else "No Description" + description = description.replace("\n", " ") + parameters = data["parameters"] + + param_details = [] + param_usage = [] + + for param in parameters: + param_name = param["name"] + param_type = param.get("type", "UNKNOWN") + if param_type == "UNKNOWN" and "typeDef" in param and "rawAnnotation" in param["typeDef"]: + param_type = param["typeDef"]["rawAnnotation"] + required = "(REQUIRED) " if param["required"] else "" + param_details.append( + f"{param_name} ({param_type}): {required}{param.get('description', 'No Param Description')}" + ) + param_usage.append(f"\t{param_name}=MY_{param_name.upper()}") + + param_details_str = "\n\t".join(param_details) + param_usage_str = ",\n\t".join(param_usage) + + # Format the final string + module_info = dedent( + f""" + Module Name: {module_name} Version: {version} - Module: {module} - Module Description: {description}""" -) + Provider Name: {provider_name} + Import Statement: `from {import_path} import {module_name}` + Module Description: {description} + + Parameters: + {param_details_str} + + Usage Example: + f = AsyncKubernetesHook( + {param_usage_str} + )""" + ) + + return module_info def extract_astro_registry_cell_types() -> list[pd.DataFrame]: @@ -51,12 +92,8 @@ def extract_astro_registry_cell_types() -> list[pd.DataFrame]: df["docSource"] = "astronomer registry modules" df["description"] = df["description"].apply(lambda x: html2text(x) if x else "No Description") - df["content"] = df.apply( - lambda x: registry_cell_md_template.format( - providerName=x.providerName, version=x.version, module=x["name"], description=x.description - ), - axis=1, - ) + + df["content"] = df.apply(lambda x: pd.Series(get_individual_module_detail(x.providerName, x["name"])), axis=1) # column order matters for uuid generation df = df[["docSource", "sha", "content", "docLink"]] diff --git a/airflow/include/tasks/extract/utils/html_utils.py b/airflow/include/tasks/extract/utils/html_utils.py index 01a00601..1ffc244a 100644 --- a/airflow/include/tasks/extract/utils/html_utils.py +++ b/airflow/include/tasks/extract/utils/html_utils.py @@ -110,7 +110,9 @@ def truncate_tokens(text: str, encoding_name: str = "gpt-3.5-turbo", max_length: logger.info(e) -def get_page_links(url: str, current_page_content: bytes, exclude_literal: list[str]) -> None: +def get_page_links( + url: str, current_page_content: bytes, exclude_literal: list[str], prefix_url: str | None = None +) -> None: """ Recursively extract all valid and internal links from the given URL. 
Deduplicates any links with the exact same page content in the process. @@ -118,6 +120,7 @@ def get_page_links(url: str, current_page_content: bytes, exclude_literal: list[ param url (str): The URL to extract links from. param current_page_content: Bytes of the content of the url passed in for hashing. param exclude_docs (list): List of strings to exclude from the URL path. + param prefix_url (str | None): Ensure all scraped URLs begin with this prefix URL. None skips this check. """ domain_name = urlparse(url).netloc page_content_hash = generate_uuid5(current_page_content) @@ -130,36 +133,43 @@ def get_page_links(url: str, current_page_content: bytes, exclude_literal: list[ href = urljoin(url, href) parsed_href = urlparse(href) href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path + if href in attempted_urls: + continue + attempted_urls.add(href) if ( not is_valid_url(href) or not href.startswith("https") or href in internal_urls - or href in attempted_urls or domain_name not in href or is_excluded_url(href, exclude_literal) + or (prefix_url and not href.startswith(prefix_url)) ): continue - attempted_urls.add(href) new_page_content = fetch_page_content(href) if (not new_page_content) or generate_uuid5(new_page_content) in internal_page_hashset: continue logger.info(href) internal_urls.add(href) - get_page_links(href, new_page_content, exclude_literal) + get_page_links(href, new_page_content, exclude_literal, prefix_url) -def get_internal_links(base_url: str, exclude_literal: list[str] | None = None) -> set[str]: +def get_internal_links( + base_url: str, exclude_literal: list[str] | None = None, prefix_url: str | None = None +) -> set[str]: """ Extract the internal links of website param base_url: The base URL of site param exclude_literal: Exclude URL that contain pattern from this list + param prefix_url (str | None): Ensure all scraped URLs begin with this prefix URL. None skips this check. 
""" if exclude_literal is None: exclude_literal = [] page_content = fetch_page_content(base_url) - get_page_links(base_url, page_content, exclude_literal) + get_page_links( + url=base_url, current_page_content=page_content, exclude_literal=exclude_literal, prefix_url=prefix_url + ) internal_urls.add(base_url) return internal_urls diff --git a/airflow/include/tasks/extract/utils/stack_overflow_helpers.py b/airflow/include/tasks/extract/utils/stack_overflow_helpers.py index 4e1b79e7..7aed8ac0 100644 --- a/airflow/include/tasks/extract/utils/stack_overflow_helpers.py +++ b/airflow/include/tasks/extract/utils/stack_overflow_helpers.py @@ -1,3 +1,4 @@ +import os from datetime import datetime from textwrap import dedent @@ -23,16 +24,17 @@ ) comment_template = "\n{user} on {date} [Score: {score}]: {body}\n" +STACK_APP_API_KEY = os.environ.get("STACK_APP_API_KEY") def fetch_questions_through_stack_api( - tag: str, stackoverflow_cutoff_date: str, *, page_size: int = 100, max_pages: int = 10000000 + tag: str, stackoverflow_cutoff_date: str, *, page_size: int = 100, max_pages: int = 1000 ) -> dict: """Fetch data from stackoverflow site through stack api""" fromdate = datetime.strptime(stackoverflow_cutoff_date, "%Y-%m-%d") first_question_id, first_question_creation_date = fetch_first_question_after_fromdate(tag=tag, fromdate=fromdate) - stack_api = StackAPI(name="stackoverflow", page_size=page_size, max_pages=max_pages) + stack_api = StackAPI(name="stackoverflow", page_size=page_size, max_pages=max_pages, key=STACK_APP_API_KEY) # https://api.stackexchange.com/docs/read-filter#filters=!-(5KXGCFLp3w9.-7QsAKFqaf5yFPl**9q*_hsHzYGjJGQ6BxnCMvDYijFE&filter=default&run=true filter_ = "!-(5KXGCFLp3w9.-7QsAKFqaf5yFPl**9q*_hsHzYGjJGQ6BxnCMvDYijFE" diff --git a/api/ask_astro/chains/answer_question.py b/api/ask_astro/chains/answer_question.py index b4221086..39217f4d 100644 --- a/api/ask_astro/chains/answer_question.py +++ b/api/ask_astro/chains/answer_question.py @@ -1,7 +1,10 @@ from __future__ import annotations +from typing import Any + from langchain import LLMChain from langchain.chains import ConversationalRetrievalChain +from langchain.chains.combine_documents.stuff import StuffDocumentsChain from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT from langchain.chains.question_answering import load_qa_chain from langchain.chat_models import AzureChatOpenAI @@ -15,6 +18,8 @@ from langchain.retrievers import ContextualCompressionRetriever, MultiQueryRetriever from langchain.retrievers.document_compressors import CohereRerank, LLMChainFilter from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever +from langchain_core.documents import Document +from langchain_core.prompts import format_document from ask_astro.chains.custom_llm_filter_prompt import custom_llm_chain_filter_prompt_template from ask_astro.chains.custom_llm_output_lines_parser import CustomLineListOutputParser @@ -81,7 +86,9 @@ multi_query_retriever.llm_chain.output_parser = CustomLineListOutputParser(max_lines=2) # Rerank -cohere_reranker_compressor = CohereRerank(user_agent="langchain", top_n=CohereConfig.rerank_top_n) +cohere_reranker_compressor = CohereRerank( + model="rerank-english-v3.0", user_agent="langchain", top_n=CohereConfig.rerank_top_n +) reranker_retriever = ContextualCompressionRetriever( base_compressor=cohere_reranker_compressor, base_retriever=multi_query_retriever ) @@ -99,6 +106,45 @@ base_compressor=llm_chain_filter, base_retriever=reranker_retriever ) + +# 
customize how the documents are combined for the final LLM call, overriding LangChain's default +def custom_combine_docs_override(self, docs: list[Document], **kwargs: Any) -> dict: + # same function as StuffDocumentsChain._get_inputs, changing only this line to prefix each document with its number + doc_strings = [f"Document {i+1}:\n" + format_document(doc, self.document_prompt) for i, doc in enumerate(docs)] + + inputs = {k: v for k, v in kwargs.items() if k in self.llm_chain.prompt.input_variables} + inputs[self.document_variable_name] = self.document_separator.join(doc_strings) + return inputs + + +custom_document_combine_prompt = PromptTemplate( + input_variables=["page_content", "docLink"], + template="Document Link: {docLink}\n{page_content}\n===End of Document===\n", +) +StuffDocumentsChain._get_inputs = custom_combine_docs_override + +custom_stuff_docs_chain_webapp: StuffDocumentsChain = load_qa_chain( + AzureChatOpenAI( + **AzureOpenAIParams.us_east2, + deployment_name=CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_DEPLOYMENT_NAME, + temperature=CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_TEMPERATURE, + ), + chain_type="stuff", + prompt=ChatPromptTemplate.from_messages(webapp_messages), +) +custom_stuff_docs_chain_webapp.document_prompt = custom_document_combine_prompt + +custom_stuff_docs_chain_slack: StuffDocumentsChain = load_qa_chain( + AzureChatOpenAI( + **AzureOpenAIParams.us_east2, + deployment_name=CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_DEPLOYMENT_NAME, + temperature=CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_TEMPERATURE, + ), + chain_type="stuff", + prompt=ChatPromptTemplate.from_messages(slack_messages), +) +custom_stuff_docs_chain_slack.document_prompt = custom_document_combine_prompt + # Set up a ConversationalRetrievalChain to generate answers using the retriever. webapp_answer_question_chain = ConversationalRetrievalChain( retriever=llm_chain_filter_compression_retriever, @@ -111,15 +157,7 @@ ), prompt=CONDENSE_QUESTION_PROMPT, ), - combine_docs_chain=load_qa_chain( - AzureChatOpenAI( - **AzureOpenAIParams.us_east2, - deployment_name=CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_DEPLOYMENT_NAME, - temperature=CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_TEMPERATURE, - ), - chain_type="stuff", - prompt=ChatPromptTemplate.from_messages(webapp_messages), - ), + combine_docs_chain=custom_stuff_docs_chain_webapp, ) slack_answer_question_chain = ConversationalRetrievalChain( @@ -133,13 +171,5 @@ ), prompt=CONDENSE_QUESTION_PROMPT, ), - combine_docs_chain=load_qa_chain( - AzureChatOpenAI( - **AzureOpenAIParams.us_east2, - deployment_name=CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_DEPLOYMENT_NAME, - temperature=CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_TEMPERATURE, - ), - chain_type="stuff", - prompt=ChatPromptTemplate.from_messages(slack_messages), - ), + combine_docs_chain=custom_stuff_docs_chain_slack, ) diff --git a/api/ask_astro/chains/custom_llm_filter_prompt.py b/api/ask_astro/chains/custom_llm_filter_prompt.py index 039d6467..c0359b56 100644 --- a/api/ask_astro/chains/custom_llm_filter_prompt.py +++ b/api/ask_astro/chains/custom_llm_filter_prompt.py @@ -1,9 +1,15 @@ -from langchain.retrievers.document_compressors.chain_filter_prompt import ( - prompt_template, -) from langchain_core.output_parsers import BaseOutputParser from langchain_core.prompts import PromptTemplate +prompt_template = """Given the following question starting with "Question:", and context starting with "Context:" surrounded by >>> symbols, return YES if the context can be used to answer the question and NO if it cannot. 
+ +> Question: {question} +> Context: +>>> +{context} +>>> +> Context surrounded by >>> is helpful and can be used to answer the question (YES / NO):""" + class CustomBooleanOutputParser(BaseOutputParser[bool]): """Parse the output of an LLM call to a boolean. Default to True if response not formatted correctly.""" diff --git a/api/ask_astro/config.py b/api/ask_astro/config.py index bd096575..72612671 100644 --- a/api/ask_astro/config.py +++ b/api/ask_astro/config.py @@ -66,7 +66,7 @@ class WeaviateConfig: class CohereConfig: """Contains the config variables for the Cohere API.""" - rerank_top_n = int(os.environ.get("COHERE_RERANK_TOP_N", 10)) + rerank_top_n = int(os.environ.get("COHERE_RERANK_TOP_N", 8)) class PromptPreprocessingConfig: diff --git a/api/ask_astro/settings.py b/api/ask_astro/settings.py index e4ff83b8..ee5b6467 100644 --- a/api/ask_astro/settings.py +++ b/api/ask_astro/settings.py @@ -18,7 +18,7 @@ os.environ.get("CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_TEMPERATURE", "0.3") ) CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_DEPLOYMENT_NAME = os.environ.get( - "CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_DEPLOYMENT_NAME", "gpt-4-128k" + "CONVERSATIONAL_RETRIEVAL_LOAD_QA_CHAIN_DEPLOYMENT_NAME", "gpt-4o" ) SHOW_SERVICE_MAINTENANCE_BANNER = os.environ.get("SHOW_SERVICE_MAINTENANCE_BANNER", "False").upper() == "TRUE" diff --git a/api/ask_astro/templates/combine_docs_sys_prompt_slack.txt b/api/ask_astro/templates/combine_docs_sys_prompt_slack.txt index abf6c86d..f2fb6526 100644 --- a/api/ask_astro/templates/combine_docs_sys_prompt_slack.txt +++ b/api/ask_astro/templates/combine_docs_sys_prompt_slack.txt @@ -1,8 +1,7 @@ You are Ask Astro, a friendy and helpful bot. -Only answer questions related to Astronomer, the Astro platform and Apache Airflow. +Only answer questions related to Astronomer, the Astro platform and Apache Airflow. If the question is not related to these topics, answer "Sorry I can only help with questions regarding Airflow, Astronomer and the Astro platform". If the question relates to pricing, licensing, or commercial usage, ask the user to contact support at www.astronomer.io/contact. -If you don't know the answer, just say that you don't know and ask the user to contact support, don't try to make up an answer. -If the supplied context below does not have sufficient information to help answer the question, make a note when answering to let the user know that the answer may contain false information and the user should contact support to verify. +Only include hyperlinks or URLs if they are from the supplied context. Be concise and precise in your answers and do not apologize. Format your response using Slack syntax. Surround text with SINGLE * to format it in bold or provide emphasis. Examples: GOOD: *This is bold!*. BAD: **This is bold!**. @@ -11,7 +10,17 @@ Use the • character for unnumbered lists. Use the ` character to surround inline code. Example: This is a sentence with some `inline *code*` in it. Use ``` to surround multi-line code blocks. Do not specify a language in code blocks. Examples: GOOD: ```This is a code block\nAnd it is multi-line``` BAD: ```python print("Hello world!")```. Format links using this format: <URL|Text to display>. Examples: GOOD: <https://www.example.com|This message *is* a link>. BAD: [This message *is* a link](https://www.example.com). + +You must refer to the following context documents below "----------------" to answer the user's question. Each document starts with "Document X" where X is the document number and ends with "===End of Document===". 
+If the context documents are not helpful enough, if you came up with any part of your answer on your own, or if your answer contains information not directly in the context documents, prefix the answer with "I cannot find documents that are directly helpful with your question, but I provided my best guess below. Please use caution as the answer below is more likely to contain incorrect information and you should always verify the answers with Astronomer support at www.astronomer.io/contact". +For example, if the question is "Does astronomer have a CLI", with context documents all about airflow CLI instead, the correct response would be "I cannot find documents that are directly helpful with your question, but I provided my best guess below. Please use caution as the answer below is more likely to contain incorrect information and you should always verify the answers with Astronomer support at www.astronomer.io/contact.\n Yes, astro does have a CLI.". + +To ensure your answers are factual, for each statement in your response that comes from a document in the context provided below, cite the source by adding an embedded slack hyperlink of the document link, displayed as "[document number]", at the end of the statement. For slack syntax, this would look like <document link|[document number]>. +Do not add citations to the answer if your answer does not use the document's information. +For example, if the statement in the context is "The sky is blue.", the source document is located at https://www.example.com, and the document number is 1, the response would be "The sky is blue <https://www.example.com|[1]>.". +A bad example would be if the answer generated is "The sky is red" without any citations to the context documents. +Another bad example would be if the answer generated is "The sky is blue" with a citation to document 2, as the statement is not present in document 2. + 12 character words that start with "<@U" and end with ">" are usernames. Example: <@U024BE7LH>. -Use the following pieces of context to answer the users question. ---------------- {context} diff --git a/api/ask_astro/templates/combine_docs_sys_prompt_webapp.txt b/api/ask_astro/templates/combine_docs_sys_prompt_webapp.txt index 14a61502..27105540 100644 --- a/api/ask_astro/templates/combine_docs_sys_prompt_webapp.txt +++ b/api/ask_astro/templates/combine_docs_sys_prompt_webapp.txt @@ -1,17 +1,22 @@ You are Ask Astro, a friendy and helpful bot. -Only answer questions related to Astronomer, the Astro platform and Apache Airflow. +Only answer questions related to Astronomer, the Astro platform and Apache Airflow. If the question is not related to these topics, answer "Sorry I can only help with questions regarding Airflow, Astronomer and the Astro platform". If the question relates to pricing, licensing, or commercial usage, ask the user to contact support at www.astronomer.io/contact. -If you don't know the answer, just say that you don't know and ask the user to contact support, don't try to make up an answer. -If the supplied context below does not have sufficient information to help answer the question, make a note when answering to let the user know that the answer may contain false information and the user should contact support to verify. -Be concise and precise in your answers and do not apologize. +Only include hyperlinks or URLs if they are from the supplied context. +Be concise and precise in your answers, and do not apologize. Format your response using Markdown syntax. -Surround text with SINGLE * to format it in bold or provide emphasis. Examples: GOOD: *This is bold!*. 
BAD: **This is bold!**. -Support text with _ to format it in italic. Example: _This is italic._ -Use the • character for unnumbered lists. -Use the ` character to surround inline code. Example: This is a sentence with some `inline *code*` in it. -Use ``` to surround multi-line code blocks. Do not specify a language in code blocks. Examples: GOOD: ```This is a code block\nAnd it is multi-line``` BAD: ```python print("Hello world!")```. +Use the ` character to surround inline code. Use ``` to surround multi-line code blocks. Do not specify a language in code blocks. Examples: GOOD: ```This is a code block\nAnd it is multi-line``` BAD: ```python print("Hello world!")```. Format links using this format: [Text to display](URL). Examples: GOOD: [This message **is** a link](https://www.example.com). BAD: <https://www.example.com|This message **is** a link>. + +You must refer to the following context documents below "----------------" to answer the user's question. Each document starts with "Document X" where X is the document number and ends with "===End of Document===". +If the context documents are not helpful enough, if you came up with any part of your answer on your own, or if your answer contains information not directly in the context documents, prefix the answer with "I cannot find documents that are directly helpful with your question, but I provided my best guess below. Please use caution as the answer below is more likely to contain incorrect information and you should always verify the answers with Astronomer support at www.astronomer.io/contact". +For example, if the question is "Does astronomer have a CLI", with context documents all about airflow CLI instead, the correct response would be "I cannot find documents that are directly helpful with your question, but I provided my best guess below. Please use caution as the answer below is more likely to contain incorrect information and you should always verify the answers with Astronomer support at www.astronomer.io/contact.\n Yes, astro does have a CLI.". + +To ensure your answers are factual, for each statement in your response that comes from a document in the context provided below, cite the source by adding [document number](document link) at the end of the statement. The citation document number should be double wrapped in [] for markdown syntax, e.g. [[1]]. +Do not add citations to the answer if your answer does not use the document's information. +For example, if the statement in the context is "The sky is blue.", the source document is located at https://www.example.com, and the document number is 1, the response would be "The sky is blue [[1]](https://www.example.com).". +A bad example would be if the answer generated is "The sky is red" without any citations to the context documents. +Another bad example would be if the answer generated is "The sky is blue" with a citation to document 2, as the statement is not present in document 2. + 12 character words that start with "<@U" and end with ">" are usernames. Example: <@U024BE7LH>. -Use the following pieces of context to answer the users question. 
---------------- {context} diff --git a/docs/README.md b/docs/README.md index bdf51810..df8e38a6 100644 --- a/docs/README.md +++ b/docs/README.md @@ -29,7 +29,7 @@ Ask Astro uses a set of Airflow DAGs that: ingest data from a source via an API - [Astronomer docs](https://docs.astronomer.io) - [Astronomer blog](https://www.astronomer.io/blog/) - [Astronomer Registry](https://registry.astronomer.io) -- [Apache Airflow GitHub](https://github.com/apache/airflow) issues and pull requests +- [Apache Airflow GitHub](https://github.com/apache/airflow) pull requests - [OpenLineage GitHub](https://github.com/OpenLineage/OpenLineage) - [OpenLineage GitHub docs](https://github.com/OpenLineage/docs) - [StackOverflow's Stack Exchange Data Dump](https://archive.org/details/stackexchange) @@ -47,12 +47,14 @@ See the [Ingest README](https://github.com/astronomer/ask-astro/tree/main/airflo Ask Astro uses LangChain's `ConversationalRetrievalChain` to generate a response. This chain does the following: -1. Use an LLM to generate 3 variations of the original user prompt with different wording. This is to ensure we retrieve as much helpful context as possible from our vector database -2. Embed each of the prompts with OpenAI's embeddings model -3. Retrieve documents from Weaviate using the embedded vectors from each prompt -4. Combine the original user prompt with relevant sources found from the vector database, and make an LLM call to generate an answer +1. Use an LLM to generate 2 variations of the original user prompt with different wording. This is to ensure we retrieve as much helpful context as possible from our vector database +2. Embed each of the 2 reworded prompts and the original user prompt with OpenAI's embeddings model +3. Retrieve up to 100 documents for each prompt from Weaviate using the embedded vectors from each prompt +4. Use the Cohere Reranker to rerank the combined pool of up to 300 candidate documents down to the 8 most relevant documents +5. Use a fast and cheap LLM (`gpt-3.5-turbo`) to check the relevance of each of the 8 documents +6. Combine the original user prompt with the most relevant sources found, and make a final LLM call to generate an answer -This generally works well. For prompt rewording, we use `gpt-3.5-turbo`, which runs very quickly and inexpensively. For the actual user-facing answer generation, we use `gpt-4` to ensure high quality answers. +This generally works well. For prompt rewording, we use `gpt-3.5-turbo`, which runs very quickly and inexpensively. For the actual user-facing answer generation, we use `gpt-4o` to ensure high quality answers. ## Feedback Loops diff --git a/docs/_static/prompt-orchestration.png b/docs/_static/prompt-orchestration.png index e76bc68e..2e8cd975 100644 Binary files a/docs/_static/prompt-orchestration.png and b/docs/_static/prompt-orchestration.png differ
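
For reference, a minimal, dependency-free sketch of the document-numbering contract this patch introduces: the overridden `StuffDocumentsChain._get_inputs` prefixes each retrieved document with "Document N:" and wraps it with `custom_document_combine_prompt`, which is what the new "Document X" / "===End of Document===" citation instructions in both system prompts rely on. The `docs` payload and the `build_context` helper below are illustrative stand-ins, not code from the repository:

```python
# Illustrative stand-ins for documents retrieved from Weaviate.
docs = [
    {"page_content": "The Astro CLI is Astronomer's command line interface ...",
     "docLink": "https://www.astronomer.io/docs/astro/cli/overview"},
    {"page_content": "A DAG is a collection of tasks organized by dependencies ...",
     "docLink": "https://www.astronomer.io/docs/learn/dags"},
]

# Mirrors custom_document_combine_prompt's template string.
document_template = "Document Link: {docLink}\n{page_content}\n===End of Document===\n"


def build_context(docs: list[dict]) -> str:
    # Same numbering as custom_combine_docs_override: "Document 1:", "Document 2:", ...
    doc_strings = [f"Document {i + 1}:\n" + document_template.format(**doc) for i, doc in enumerate(docs)]
    # StuffDocumentsChain joins documents with "\n\n" by default.
    return "\n\n".join(doc_strings)


print(build_context(docs))
```

This is the block the `{context}` placeholder receives in the system prompts, so a "[document number]" citation in the answer can be resolved back to a `docLink`.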
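
Likewise, a small self-contained sketch of the link filter that the new `prefix_url` argument adds to `get_page_links` in `html_utils.py`. The `should_crawl` helper is hypothetical and only approximates the real checks (the actual function also validates URLs, hashes page content to deduplicate, and recurses):

```python
from __future__ import annotations

from urllib.parse import urljoin, urlparse


def should_crawl(url: str, href: str, prefix_url: str | None = None) -> bool:
    """Approximate the href checks in get_page_links."""
    domain_name = urlparse(url).netloc
    href = urljoin(url, href)  # resolve relative links against the current page
    parsed_href = urlparse(href)
    href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path  # drop query/fragment
    if not href.startswith("https") or domain_name not in href:
        return False
    # New check: optionally require crawled URLs to stay under prefix_url.
    if prefix_url and not href.startswith(prefix_url):
        return False
    return True


# With the docs now served under www.astronomer.io/docs, the prefix keeps the
# crawler from wandering into the rest of www.astronomer.io.
base = "https://www.astronomer.io/docs"
assert should_crawl(base, "/docs/astro/cli/overview", prefix_url=base)
assert not should_crawl(base, "/blog/some-post", prefix_url=base)
```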