diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index ca5de0e3..9526cd42 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -114,6 +114,21 @@     + + + NationalPost + + +
National Post
+ + + + nationalpost.com + + +   +   + diff --git a/src/fundus/publishers/ca/__init__.py b/src/fundus/publishers/ca/__init__.py index 5013deef..7f020472 100644 --- a/src/fundus/publishers/ca/__init__.py +++ b/src/fundus/publishers/ca/__init__.py @@ -1,5 +1,6 @@ from fundus.publishers.base_objects import Publisher, PublisherGroup from fundus.publishers.ca.cbc_news import CBCNewsParser +from fundus.publishers.ca.national_post import NationalPostParser from fundus.scraping.url import NewsMap, RSSFeed, Sitemap # noinspection PyPep8Naming @@ -16,3 +17,14 @@ class CA(metaclass=PublisherGroup): RSSFeed("https://www.cbc.ca/webfeed/rss/rss-canada"), ], ) + + NationalPost = Publisher( + name="National Post", + domain="https://nationalpost.com", + parser=NationalPostParser, + sources=[ + NewsMap("https://nationalpost.com/sitemap-news.xml"), + Sitemap("https://nationalpost.com/sitemap-old.xml"), + RSSFeed("https://nationalpost.com/feed"), + ], + )
diff --git a/src/fundus/publishers/ca/national_post.py b/src/fundus/publishers/ca/national_post.py
new file mode 100644
index 00000000..374c784d
--- /dev/null
+++ b/src/fundus/publishers/ca/national_post.py
@@ -0,0 +1,77 @@
+import datetime
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
+from fundus.parser.utility import (
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    generic_topic_parsing,
+)
+from fundus.scraping.filter import regex_filter
+
+
+class NationalPostParser(ParserProxy):
+    """Parser for National Post (nationalpost.com) article pages."""
+
+    class V1(BaseParser):
+        """Body comes from page selectors; other attributes from `precomputed.ld` / `precomputed.meta`."""
+
+        # Summary: the sub-title paragraph inside <article>.
+        _summary_selector = CSSSelector("article p.article-subtitle")
+        # Sub-headlines: <strong> children of <p>, or <h3>, directly inside the story section.
+        _subheadline_selector = XPath(
+            "//section[@class='article-content__content-group article-content__content-group--story']/p/strong | "
+            "//section[@class='article-content__content-group article-content__content-group--story']/h3"
+        )
+        # Paragraphs: <p> children of the story section that have a direct text node.
+        _paragraph_selector = XPath(
+            "//section[@class='article-content__content-group article-content__content-group--story']/p[text()]"
+        )
+
+        @attribute
+        def body(self) -> ArticleBody:
+            """Assemble the article body from the summary, sub-headline and paragraph selectors."""
+            return extract_article_body_with_selector(
+                self.precomputed.doc,
+                summary_selector=self._summary_selector,
+                subheadline_selector=self._subheadline_selector,
+                paragraph_selector=self._paragraph_selector,
+            )
+
+        @attribute
+        def authors(self) -> List[str]:
+            """Return the authors parsed from the `author` entry of `precomputed.ld`."""
+            return generic_author_parsing(self.precomputed.ld.bf_search("author"))
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            """Return the date parsed from the `datePublished` entry of `precomputed.ld`."""
+            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
+
+        @attribute
+        def title(self) -> Optional[str]:
+            """Return the `og:title` value of `precomputed.meta`, or None when it is absent."""
+            return self.precomputed.meta.get("og:title")
+
+        @attribute
+        def topics(self) -> List[str]:
+            """Return the `keywords` entries of `precomputed.ld` minus bookkeeping tags.
+
+            Drops exact matches from a fixed exclusion list and every entry matching
+            the UUID / "NLP ..." / "NP Comment" pattern below.
+            """
+            # A missing or empty `keywords` entry must yield no topics instead of failing
+            # on iteration (assumes bf_search returns a falsy value in that case -- TODO confirm).
+            preliminary_topics = self.precomputed.ld.bf_search("keywords") or []
+            excluded_topics = ["Curated", "News", "Newsroom daily", "story", "Canada", "World"]
+            topic_filter = regex_filter(
+                r"([0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}|NLP Entity Tokens|NLP Category|NP Comment)"
+            )
+            filtered_topics = [
+                topic for topic in preliminary_topics if not topic_filter(topic) and topic not in excluded_topics
+            ]
+            return generic_topic_parsing(filtered_topics)
diff --git a/tests/resources/parser/test_data/ca/NationalPost.json b/tests/resources/parser/test_data/ca/NationalPost.json new file mode 100644 index 00000000..a59719c3 --- /dev/null +++ b/tests/resources/parser/test_data/ca/NationalPost.json @@ -0,0 +1,85 @@ +{ + "V1": { + "authors": [ + "Joseph Brean" + ], + "body": { + "summary": [ + "The Harrises settled into the top floor of a Victorian home in Montreal's moneyed neighbourhood of Westmount" + ], + "sections": [ + { + "headline": [], + "paragraphs": [ + "If Kamala Harris becomes the next American president in November, both Canada and the United States will at least briefly be led by people who spent formative teenage years in Montreal. 
But whereas one had already been in the public eye since being born to a sitting prime minister, the other was a reluctant transplant from sunny California, uncertain how she would fit into a francophone city. The National Post takes stock of Harris’ Montreal years." + ] + }, + { + "headline": [ + "How did Kamala Harris end up living on the top floor of a Westmount Victorian in the 1970s and 80s?" + ], + "paragraphs": [ + "Kamala Harris was born in 1964, the older of two daughters to Donald Harris, a Jamaican-American economist, and Shyamala Gopalan Harris, an Indian-American breast cancer scientist who died in 2009. They had lived in a series of Midwest college towns before Donald and Shyamala divorced in 1971, and the girls lived with their mother in California, in the San Francisco area, staying with their father in Palo Alto on weekends. Kamala was 12 in 1976 when her mother took a job at the McGill University School of Medicine and the Jewish General Hospital. She wrote in her memoir, The Truths We Hold: An American Journey, “I was 12 years old, and the thought of moving away from sunny California in February, in the middle of the school year, to a French-speaking foreign city covered in 12 feet of snow was distressing. My mother tried to make it sound like an adventure, taking us to buy our first down jackets and mittens, as though we were going to be explorers of the great northern winter. But it was hard for me to see it that way.”" + ] + }, + { + "headline": [ + "Where did Harris live?" + ], + "paragraphs": [ + "The Harrises settled into the top floor of a Victorian home in the moneyed neighbourhood of Westmount. It was a difficult transition. Kamala was a happy child. The New York Times has reported she was into Diana Ross and Michael Jackson. But she had no French to speak of. So when she enrolled in the French language primary school Notre-Dame-des-Neiges, she recalled saying “quoi? 
quoi?” all the time and self-consciously feeling she sounded like a duck." + ] + }, + { + "headline": [ + "But that did not last." + ], + "paragraphs": [ + "This was a period of great change in Canadian education, and alternative schools were on the rise. So Kamala switched over for her grade eight year to FACE school, then called FACES, for Fine Arts Core Elementary School. Founded in 1975, it was and remains a free-wheeling place where students reportedly took instruction in copper enamelling, macramé, poetry, interior design, yoga and tie-dying." + ] + }, + { + "headline": [ + "And then comes high school." + ], + "paragraphs": [ + "Westmount High School is a public anglophone school that is more diverse and less privileged than its name suggests, with its connotations of old Montreal money. It drew from Westmount, the largely Black Little Burgundy to the south and beyond. By the time she got there, and still today, its most prominent alumnus was Leonard Cohen, with Mila Mulroney and Stockwell Day trailing far behind. Her yearbook quote gives special thanks to her mother, offering words of encouragement to her younger sister Maya to “Be cool,” but the rest is typically cryptic. It suggests her cherished memories were “California, Angelo; summer ’80.” Happiness is “Making long distance phone calls to A.M.” Her favourite past-time is “Dancing with super six; Midnight Magic,” and as to her “Fav Ex,” she wrote: “Naw, I’m just playing!” Super Six and Midnight Magic were dance troupes." + ] + }, + { + "headline": [ + "That sounds like a positive high school experience." + ], + "paragraphs": [ + "Although she has not strongly emphasized her Canadian youth, partly in defence against false claims of being ineligible to be president that recall the Barack Obama birther conspiracy theory, Harris has written and spoken about her Montreal home as a comfortable and safe place. 
One episode in particular illustrates that, when she was able to offer the same comfort and safety to her high school best friend Wanda Kagan at a difficult moment, when she told Harris about abuse she was suffering at home. They shared Harris’ room for a while. In her acceptance speech for the Democratic presidential nomination, Harris said this episode “is one of the reasons I became a prosecutor, to protect people like Wanda, because I believe everyone has a right to safety, to dignity and to justice.”" + ] + }, + { + "headline": [ + "What happened after graduation?" + ], + "paragraphs": [ + "Harris graduated from Westmount in 1981 and attended Vanier College in Montreal for a year before pursuing further studies at Howard University, the historically Black university in Washington, D.C. She later studied law in California and became a prosecutor, and eventually Attorney General, later a Senator, and now vice president." + ] + }, + { + "headline": [ + "How did Harris vote in the 1980 Quebec independence referendum?" + ], + "paragraphs": [ + "She was 15 and not eligible to vote. But she was politically minded even in childhood. A couple of years previously, according to a story her sister Maya has told, Kamala organized a successful protest in front of the building where she lived because the owner did not want children playing on the lawn." 
+ ] + } + ] + }, + "publishing_date": "2024-08-28 10:00:23+00:00", + "title": "Kamala Harris' Canadian connections: What we know about her childhood in Montreal", + "topics": [ + "Explainer", + "General", + "Kamala Harris", + "Montreal", + "United States" + ] + } +} diff --git a/tests/resources/parser/test_data/ca/NationalPost_2024_08_28.html.gz b/tests/resources/parser/test_data/ca/NationalPost_2024_08_28.html.gz new file mode 100644 index 00000000..caa542d0 Binary files /dev/null and b/tests/resources/parser/test_data/ca/NationalPost_2024_08_28.html.gz differ diff --git a/tests/resources/parser/test_data/ca/meta.info b/tests/resources/parser/test_data/ca/meta.info index 50821391..abc02f5a 100644 --- a/tests/resources/parser/test_data/ca/meta.info +++ b/tests/resources/parser/test_data/ca/meta.info @@ -2,5 +2,9 @@ "CBCNews_2024_08_08.html.gz": { "url": "https://www.cbc.ca/news/world/gaza-israel-ceasefire-negotiations-sinwar-1.7287711?cmp=rss", "crawl_date": "2024-08-08 23:53:17.604667" + }, + "NationalPost_2024_08_28.html.gz": { + "url": "https://nationalpost.com/news/canada/kamala-harris-childhood-montreal-canada", + "crawl_date": "2024-08-28 13:13:43.905282" } }