From 71c38c072c7e0ae371a5cea7d9620f52c02880e3 Mon Sep 17 00:00:00 2001 From: David Xue Date: Wed, 24 Jan 2024 16:08:43 -0800 Subject: [PATCH 1/3] Fix airflow doc ingestion wrong typing error --- airflow/include/tasks/extract/airflow_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airflow/include/tasks/extract/airflow_docs.py b/airflow/include/tasks/extract/airflow_docs.py index c67cef5c..8a8faedc 100644 --- a/airflow/include/tasks/extract/airflow_docs.py +++ b/airflow/include/tasks/extract/airflow_docs.py @@ -25,6 +25,6 @@ def extract_airflow_docs(docs_base_url: str) -> list[pd.DataFrame]: "cli-and-env-variables-ref.html", ] - all_links = get_internal_links(docs_base_url, exclude_literal=exclude_docs) + all_links = list(get_internal_links(docs_base_url, exclude_literal=exclude_docs)) return all_links From df12a1ce1bc73c15b2f2c32240b11688a198c9ab Mon Sep 17 00:00:00 2001 From: David Xue Date: Wed, 24 Jan 2024 16:36:44 -0800 Subject: [PATCH 2/3] Recover missing/deleted lines from previous PR --- airflow/include/tasks/extract/airflow_docs.py | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/airflow/include/tasks/extract/airflow_docs.py b/airflow/include/tasks/extract/airflow_docs.py index 8a8faedc..667f1a08 100644 --- a/airflow/include/tasks/extract/airflow_docs.py +++ b/airflow/include/tasks/extract/airflow_docs.py @@ -1,6 +1,11 @@ from __future__ import annotations +from bs4 import BeautifulSoup import pandas as pd +import requests +import urllib.parse +from weaviate.util import generate_uuid5 +import re from include.tasks.extract.utils.html_utils import get_internal_links @@ -25,6 +30,28 @@ def extract_airflow_docs(docs_base_url: str) -> list[pd.DataFrame]: "cli-and-env-variables-ref.html", ] - all_links = list(get_internal_links(docs_base_url, exclude_literal=exclude_docs)) + all_links = get_internal_links(docs_base_url, exclude_literal=exclude_docs) + + docs_url_parts = 
urllib.parse.urlsplit(docs_base_url) + docs_url_base = f"{docs_url_parts.scheme}://{docs_url_parts.netloc}" + # make sure we didn't accidentally pickup any unrelated links in recursion + non_doc_links = {link if docs_url_base not in link else "" for link in all_links} + docs_links = all_links - non_doc_links - return all_links + df = pd.DataFrame(docs_links, columns=["docLink"]) + + df["html_content"] = df["docLink"].apply(lambda x: requests.get(x).content) + + df["content"] = df["html_content"].apply( + lambda x: str(BeautifulSoup(x, "html.parser").find(class_="body", role="main")) + ) + df["content"] = df["content"].apply(lambda x: re.sub("ΒΆ", "", x)) + + df["sha"] = df["content"].apply(generate_uuid5) + df["docSource"] = "apache/airflow/docs" + df.reset_index(drop=True, inplace=True) + + # column order matters for uuid generation + df = df[["docSource", "sha", "content", "docLink"]] + + return [df] From 7101772a3e548d57a06e7350cc36101ce8558004 Mon Sep 17 00:00:00 2001 From: David Xue Date: Wed, 24 Jan 2024 16:42:00 -0800 Subject: [PATCH 3/3] Fix formatting --- airflow/include/tasks/extract/airflow_docs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/airflow/include/tasks/extract/airflow_docs.py b/airflow/include/tasks/extract/airflow_docs.py index 667f1a08..fb9de592 100644 --- a/airflow/include/tasks/extract/airflow_docs.py +++ b/airflow/include/tasks/extract/airflow_docs.py @@ -1,11 +1,12 @@ from __future__ import annotations -from bs4 import BeautifulSoup + +import re +import urllib.parse import pandas as pd import requests -import urllib.parse +from bs4 import BeautifulSoup from weaviate.util import generate_uuid5 -import re from include.tasks.extract.utils.html_utils import get_internal_links @@ -31,7 +32,7 @@ def extract_airflow_docs(docs_base_url: str) -> list[pd.DataFrame]: ] all_links = get_internal_links(docs_base_url, exclude_literal=exclude_docs) - + docs_url_parts = urllib.parse.urlsplit(docs_base_url) docs_url_base = 
f"{docs_url_parts.scheme}://{docs_url_parts.netloc}" # make sure we didn't accidentally pickup any unrelated links in recursion