Skip to content

Commit

Permalink
Merge branch 'main' of github.com:trailofbits/ask-astro
Browse files Browse the repository at this point in the history
* 'main' of github.com:trailofbits/ask-astro:
  Fix Airflow Doc Ingestion Task Incorrect Typing Error (astronomer#283)
  • Loading branch information
bismuthsalamander committed Jan 29, 2024
2 parents ba48fcf + 7c79156 commit a18009d
Showing 1 changed file with 29 additions and 1 deletion.
30 changes: 29 additions & 1 deletion airflow/include/tasks/extract/airflow_docs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
from __future__ import annotations

import re
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from weaviate.util import generate_uuid5

from include.tasks.extract.utils.html_utils import get_internal_links

Expand All @@ -27,4 +33,26 @@ def extract_airflow_docs(docs_base_url: str) -> list[pd.DataFrame]:

all_links = get_internal_links(docs_base_url, exclude_literal=exclude_docs)

return all_links
docs_url_parts = urllib.parse.urlsplit(docs_base_url)
docs_url_base = f"{docs_url_parts.scheme}://{docs_url_parts.netloc}"
# make sure we didn't accidentally pick up any unrelated links during recursion
non_doc_links = {link if docs_url_base not in link else "" for link in all_links}
docs_links = all_links - non_doc_links

df = pd.DataFrame(docs_links, columns=["docLink"])

df["html_content"] = df["docLink"].apply(lambda x: requests.get(x).content)

df["content"] = df["html_content"].apply(
lambda x: str(BeautifulSoup(x, "html.parser").find(class_="body", role="main"))
)
df["content"] = df["content"].apply(lambda x: re.sub("¶", "", x))

df["sha"] = df["content"].apply(generate_uuid5)
df["docSource"] = "apache/airflow/docs"
df.reset_index(drop=True, inplace=True)

# column order matters for uuid generation
df = df[["docSource", "sha", "content", "docLink"]]

return [df]

0 comments on commit a18009d

Please sign in to comment.