From 273e6dcc54e692ca34293eb0c06c02771c6c40d1 Mon Sep 17 00:00:00 2001 From: Wei Lee Date: Mon, 27 Nov 2023 18:39:50 +0800 Subject: [PATCH] refactor(airflow): refactor extract_astro_blogs method --- airflow/include/tasks/extract/blogs.py | 29 +++++++++++++------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/airflow/include/tasks/extract/blogs.py b/airflow/include/tasks/extract/blogs.py index 1d8d3cd1..5dc0d90d 100644 --- a/airflow/include/tasks/extract/blogs.py +++ b/airflow/include/tasks/extract/blogs.py @@ -28,30 +28,29 @@ def extract_astro_blogs(blog_cutoff_date: datetime) -> list[pd.DataFrame]: 'sha': A UUID from the other fields """ - headers = {} - links = [] - dates = [] + links: list[str] = [] page = 1 - response = requests.get(page_url.format(page=page), headers=headers) + response = requests.get(page_url.format(page=page), headers={}) while response.ok: soup = BeautifulSoup(response.text, "lxml") - cards = soup.find_all(class_="post-card__cover") - card_links = [base_url + card.find("a", href=True)["href"] for card in cards] + + articles = soup.find_all("article") + + card_links = [ + f"{base_url}{article.find('a', href=True)['href']}" + for article in articles + if datetime.fromisoformat(article.find("time")["datetime"]).date() > blog_cutoff_date + ] links.extend(card_links) - meta = soup.find_all(class_="post-card__meta") - dates.extend([post.find("time")["datetime"] for post in meta]) + if len(articles) != len(card_links): + break page = page + 1 - response = requests.get(page_url.format(page=page), headers=headers) + response = requests.get(page_url.format(page=page), headers={}) - df = pd.DataFrame(zip(links, dates), columns=["docLink", "date"]) - - df["date"] = pd.to_datetime(df["date"]).dt.date - df = df[df["date"] > blog_cutoff_date] - df.drop("date", inplace=True, axis=1) + df = pd.DataFrame(links, columns=["docLink"]) df.drop_duplicates(inplace=True) - df["content"] = df["docLink"].apply(lambda x: requests.get(x).content) df["title"] = df["content"].apply( lambda x: BeautifulSoup(x, "lxml").find(class_="post-card__meta").find(class_="title").get_text()