Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NationalPost #584

Merged
merged 4 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,21 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>NationalPost</code>
</td>
<td>
<div>National Post</div>
</td>
<td>
<a href="https://nationalpost.com">
<span>nationalpost.com</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
</tbody>
</table>

Expand Down
12 changes: 12 additions & 0 deletions src/fundus/publishers/ca/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.ca.cbc_news import CBCNewsParser
from fundus.publishers.ca.national_post import NationalPostParser
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

# noinspection PyPep8Naming
Expand All @@ -16,3 +17,14 @@ class CA(metaclass=PublisherGroup):
RSSFeed("https://www.cbc.ca/webfeed/rss/rss-canada"),
],
)

# Publisher entry for the National Post (nationalpost.com). Articles are
# parsed with NationalPostParser and discovered through the three sources
# listed below: a NewsMap, a Sitemap and an RSSFeed.
NationalPost = Publisher(
name="National Post",
domain="https://nationalpost.com",
parser=NationalPostParser,
sources=[
NewsMap("https://nationalpost.com/sitemap-news.xml"),
Sitemap("https://nationalpost.com/sitemap-old.xml"),
RSSFeed("https://nationalpost.com/feed"),
],
)
59 changes: 59 additions & 0 deletions src/fundus/publishers/ca/national_post.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)
from fundus.scraping.filter import regex_filter


class NationalPostParser(ParserProxy):
    """Parser proxy for National Post articles; it currently holds one version, ``V1``."""

    class V1(BaseParser):
        """Extracts body, authors, publishing date, title and topics from an article."""

        # Summary: the subtitle paragraph inside the <article> element.
        _summary_selector = CSSSelector("article p.article-subtitle")
        # Subheadlines: either a <strong> nested directly in a story paragraph
        # or an <h3> that is a direct child of the story section. The section
        # is matched on its exact class-attribute value.
        _subheadline_selector = XPath(
            "//section[@class='article-content__content-group article-content__content-group--story']/p/strong | "
            "//section[@class='article-content__content-group article-content__content-group--story']/h3"
        )
        # Paragraphs: direct <p> children of the story section that own at
        # least one text node (``p[text()]``).
        _paragraph_selector = XPath(
            "//section[@class='article-content__content-group article-content__content-group--story']/p[text()]"
        )

        @attribute
        def body(self) -> ArticleBody:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors found under the linked-data key ``author``."""
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the date parsed from the linked-data key ``datePublished``."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``og:title`` meta value, or ``None`` when it is absent."""
            return self.precomputed.meta.get("og:title")

        @attribute
        def topics(self) -> List[str]:
            """Return the linked-data ``keywords`` with unwanted entries removed.

            An entry is dropped when the regex filter flags it (UUID-shaped hex
            identifiers and the ``NLP Entity Tokens`` / ``NLP Category`` /
            ``NP Comment`` markers) or when it equals one of the generic labels
            in ``filter_list``. The remaining entries are handed to
            ``generic_topic_parsing``.
            """
            # The lookup can come back empty when a page carries no keywords
            # (presumably ``None`` -- confirm against ``bf_search``). Fall back
            # to an empty list so the comprehension below never iterates a
            # non-iterable value; non-empty results are used unchanged.
            preliminary_topics = self.precomputed.ld.bf_search("keywords") or []
            filter_list = ["Curated", "News", "Newsroom daily", "story", "Canada", "World"]
            topic_filter = regex_filter(
                r"([0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}|NLP Entity Tokens|NLP Category|NP Comment)"
            )
            filtered_topics = [
                topic for topic in preliminary_topics if not topic_filter(topic) and topic not in filter_list
            ]
            return generic_topic_parsing(filtered_topics)
85 changes: 85 additions & 0 deletions tests/resources/parser/test_data/ca/NationalPost.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
{
"V1": {
"authors": [
"Joseph Brean"
],
"body": {
"summary": [
"The Harrises settled into the top floor of a Victorian home in Montreal's moneyed neighbourhood of Westmount"
],
"sections": [
{
"headline": [],
"paragraphs": [
"If Kamala Harris becomes the next American president in November, both Canada and the United States will at least briefly be led by people who spent formative teenage years in Montreal. But whereas one had already been in the public eye since being born to a sitting prime minister, the other was a reluctant transplant from sunny California, uncertain how she would fit into a francophone city. The National Post takes stock of Harris’ Montreal years."
]
},
{
"headline": [
"How did Kamala Harris end up living on the top floor of a Westmount Victorian in the 1970s and 80s?"
],
"paragraphs": [
"Kamala Harris was born in 1964, the older of two daughters to Donald Harris, a Jamaican-American economist, and Shyamala Gopalan Harris, an Indian-American breast cancer scientist who died in 2009. They had lived in a series of Midwest college towns before Donald and Shyamala divorced in 1971, and the girls lived with their mother in California, in the San Francisco area, staying with their father in Palo Alto on weekends. Kamala was 12 in 1976 when her mother took a job at the McGill University School of Medicine and the Jewish General Hospital. She wrote in her memoir, The Truths We Hold: An American Journey, “I was 12 years old, and the thought of moving away from sunny California in February, in the middle of the school year, to a French-speaking foreign city covered in 12 feet of snow was distressing. My mother tried to make it sound like an adventure, taking us to buy our first down jackets and mittens, as though we were going to be explorers of the great northern winter. But it was hard for me to see it that way.”"
]
},
{
"headline": [
"Where did Harris live?"
],
"paragraphs": [
"The Harrises settled into the top floor of a Victorian home in the moneyed neighbourhood of Westmount. It was a difficult transition. Kamala was a happy child. The New York Times has reported she was into Diana Ross and Michael Jackson. But she had no French to speak of. So when she enrolled in the French language primary school Notre-Dame-des-Neiges, she recalled saying “quoi? quoi?” all the time and self-consciously feeling she sounded like a duck."
]
},
{
"headline": [
"But that did not last."
],
"paragraphs": [
"This was a period of great change in Canadian education, and alternative schools were on the rise. So Kamala switched over for her grade eight year to FACE school, then called FACES, for Fine Arts Core Elementary School. Founded in 1975, it was and remains a free-wheeling place where students reportedly took instruction in copper enamelling, macramé, poetry, interior design, yoga and tie-dying."
]
},
{
"headline": [
"And then comes high school."
],
"paragraphs": [
"Westmount High School is a public anglophone school that is more diverse and less privileged than its name suggests, with its connotations of old Montreal money. It drew from Westmount, the largely Black Little Burgundy to the south and beyond. By the time she got there, and still today, its most prominent alumnus was Leonard Cohen, with Mila Mulroney and Stockwell Day trailing far behind. Her yearbook quote gives special thanks to her mother, offering words of encouragement to her younger sister Maya to “Be cool,” but the rest is typically cryptic. It suggests her cherished memories were “California, Angelo; summer ’80.” Happiness is “Making long distance phone calls to A.M.” Her favourite past-time is “Dancing with super six; Midnight Magic,” and as to her “Fav Ex,” she wrote: “Naw, I’m just playing!” Super Six and Midnight Magic were dance troupes."
]
},
{
"headline": [
"That sounds like a positive high school experience."
],
"paragraphs": [
"Although she has not strongly emphasized her Canadian youth, partly in defence against false claims of being ineligible to be president that recall the Barack Obama birther conspiracy theory, Harris has written and spoken about her Montreal home as a comfortable and safe place. One episode in particular illustrates that, when she was able to offer the same comfort and safety to her high school best friend Wanda Kagan at a difficult moment, when she told Harris about abuse she was suffering at home. They shared Harris’ room for a while. In her acceptance speech for the Democratic presidential nomination, Harris said this episode “is one of the reasons I became a prosecutor, to protect people like Wanda, because I believe everyone has a right to safety, to dignity and to justice.”"
]
},
{
"headline": [
"What happened after graduation?"
],
"paragraphs": [
"Harris graduated from Westmount in 1981 and attended Vanier College in Montreal for a year before pursuing further studies at Howard University, the historically Black university in Washington, D.C. She later studied law in California and became a prosecutor, and eventually Attorney General, later a Senator, and now vice president."
]
},
{
"headline": [
"How did Harris vote in the 1980 Quebec independence referendum?"
],
"paragraphs": [
"She was 15 and not eligible to vote. But she was politically minded even in childhood. A couple of years previously, according to a story her sister Maya has told, Kamala organized a successful protest in front of the building where she lived because the owner did not want children playing on the lawn."
]
}
]
},
"publishing_date": "2024-08-28 10:00:23+00:00",
"title": "Kamala Harris' Canadian connections: What we know about her childhood in Montreal",
"topics": [
"Explainer",
"General",
"Kamala Harris",
"Montreal",
"United States"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/ca/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,9 @@
"CBCNews_2024_08_08.html.gz": {
"url": "https://www.cbc.ca/news/world/gaza-israel-ceasefire-negotiations-sinwar-1.7287711?cmp=rss",
"crawl_date": "2024-08-08 23:53:17.604667"
},
"NationalPost_2024_08_28.html.gz": {
"url": "https://nationalpost.com/news/canada/kamala-harris-childhood-montreal-canada",
"crawl_date": "2024-08-28 13:13:43.905282"
}
}