diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 8db9dab1..73889813 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -86,6 +86,38 @@ +## CA-Publishers + + + + + + + + + + + + + + + + + + + + +
Class                                Name                                                                        URL                                                    Missing Attributes    Additional Attributes    
+ CBCNews + +
CBC News
+
+ + www.cbc.ca + +   
+ + ## CH-Publishers diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index b9faea9d..6db1a341 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -53,8 +53,8 @@ def __init__(self, lds: Iterable[Dict[str, Any]] = ()): def serialize(self) -> Dict[str, Any]: return {attribute: value for attribute, value in self.__dict__.items() if "__" not in attribute} - def add_ld(self, ld: Dict[str, Any]) -> None: - if ld_type := ld.get("@type"): + def add_ld(self, ld: Dict[str, Any], name: Optional[str] = None) -> None: + if ld_type := ld.get("@type", name): if isinstance(ld_type, list): if len(ld_type) == 1: ld_type = ld_type[0] diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py index 67810e25..6eb7b664 100644 --- a/src/fundus/parser/utility.py +++ b/src/fundus/parser/utility.py @@ -7,6 +7,7 @@ from datetime import datetime from functools import total_ordering from typing import ( + Any, Callable, ClassVar, Dict, @@ -155,6 +156,25 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]: _json_pattern = re.compile(r"(?P<json>{[\s\S]*}|\[\s*{[\s\S]*}\s*](?!\s*}))") +def extract_json_from_dom(root: lxml.html.HtmlElement, selector: XPath) -> Iterable[Dict[str, Any]]: + def sanitize(text: str) -> Optional[str]: + # capture only content enclosed as follows: {...} or [{...}] + match = re.search(_json_pattern, text) + if match is not None and (sanitized := match.group("json")): + return sanitized + return None + + json_nodes = selector(root) + jsons = [] + for node in json_nodes: + json_content = sanitize(node.text_content()) or "" + try: + jsons.append(json.loads(json_content)) + except json.JSONDecodeError as error: + logger.debug(f"Encountered {error!r} during JSON parsing") + return more_itertools.collapse(jsons, base_type=dict) + + def get_ld_content(root: lxml.html.HtmlElement) -> LinkedDataMapping: """Parse JSON-LD from HTML. 
@@ -168,23 +188,7 @@ def get_ld_content(root: lxml.html.HtmlElement) -> LinkedDataMapping: The JSON-LD data as a LinkedDataMapping """ - def sanitize(text: str) -> Optional[str]: - # capture only content enclosed as follows: {...} or [{...}] - match = re.search(_json_pattern, text) - if match is not None and (sanitized := match.group("json")): - return sanitized - return None - - ld_nodes = _ld_node_selector(root) - lds = [] - for node in ld_nodes: - json_content = sanitize(node.text_content()) or "" - try: - lds.append(json.loads(json_content)) - except json.JSONDecodeError as error: - logger.debug(f"Encountered {error!r} during LD parsing") - collapsed_lds = more_itertools.collapse(lds, base_type=dict) - return LinkedDataMapping(collapsed_lds) + return LinkedDataMapping(extract_json_from_dom(root, _ld_node_selector)) _meta_node_selector = CSSSelector("head > meta, body > meta") diff --git a/src/fundus/publishers/__init__.py b/src/fundus/publishers/__init__.py index da42fd27..cb68634e 100644 --- a/src/fundus/publishers/__init__.py +++ b/src/fundus/publishers/__init__.py @@ -4,6 +4,7 @@ from fundus.publishers.at import AT from fundus.publishers.au import AU from fundus.publishers.base_objects import Publisher, PublisherGroup +from fundus.publishers.ca import CA from fundus.publishers.ch import CH from fundus.publishers.cn import CN from fundus.publishers.de import DE @@ -61,3 +62,4 @@ class PublisherCollection(metaclass=PublisherCollectionMeta): tr = TR my = MY no = NO + ca = CA diff --git a/src/fundus/publishers/ca/__init__.py b/src/fundus/publishers/ca/__init__.py new file mode 100644 index 00000000..5013deef --- /dev/null +++ b/src/fundus/publishers/ca/__init__.py @@ -0,0 +1,18 @@ +from fundus.publishers.base_objects import Publisher, PublisherGroup +from fundus.publishers.ca.cbc_news import CBCNewsParser +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap + +# noinspection PyPep8Naming + + +class CA(metaclass=PublisherGroup): + CBCNews = Publisher( + 
name="CBC News", + domain="https://www.cbc.ca/", + parser=CBCNewsParser, + sources=[ + RSSFeed("https://www.cbc.ca/webfeed/rss/rss-topstories"), + RSSFeed("https://www.cbc.ca/webfeed/rss/rss-world"), + RSSFeed("https://www.cbc.ca/webfeed/rss/rss-canada"), + ], + ) diff --git a/src/fundus/publishers/ca/cbc_news.py b/src/fundus/publishers/ca/cbc_news.py new file mode 100644 index 00000000..fba874ab --- /dev/null +++ b/src/fundus/publishers/ca/cbc_news.py @@ -0,0 +1,66 @@ +import datetime +import re +from typing import List, Optional + +from lxml.cssselect import CSSSelector +from lxml.etree import XPath + +from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.base_parser import function +from fundus.parser.utility import ( + extract_article_body_with_selector, + extract_json_from_dom, + generic_author_parsing, + generic_date_parsing, +) + + +class CBCNewsParser(ParserProxy): + class V1(BaseParser): + _summary_selector = CSSSelector("h2.deck") + _subheadline_selector = CSSSelector("div.story > h2") + _paragraph_selector = CSSSelector("div.story > p") + + _cbc_ld_selector: XPath = XPath("//script[@id='initialStateDom']") + + @function(priority=1) + def _parse_initial_state_dom(self): + state_dom_json = extract_json_from_dom(self.precomputed.doc, self._cbc_ld_selector) + for ld in state_dom_json: + self.precomputed.ld.add_ld(ld, "initialStateDom") + + @attribute + def body(self) -> ArticleBody: + return extract_article_body_with_selector( + self.precomputed.doc, + summary_selector=self._summary_selector, + subheadline_selector=self._subheadline_selector, + paragraph_selector=self._paragraph_selector, + ) + + @attribute + def authors(self) -> List[str]: + return generic_author_parsing(self.precomputed.ld.bf_search("author")) + + @attribute + def publishing_date(self) -> Optional[datetime.datetime]: + return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute + def title(self) -> Optional[str]: + 
return self.precomputed.ld.bf_search("headline") + + @attribute + def topics(self) -> List[str]: + if not (topic_dict := self.precomputed.ld.bf_search("keywords")): + return [] + + # add tag names; a missing or null "tags" entry contributes nothing + topic_list = [topic for location in (topic_dict.get("tags") or []) if (topic := location.get("name")) is not None] + + # add subjects, keeping only the last path segment; tolerate a missing or null "concepts" entry + for subject in topic_dict.get("concepts") or []: + if (path := subject.get("path")) is not None: + topic_list.append(re.sub(r".*/", "", path)) + + return topic_list diff --git a/tests/resources/parser/test_data/ca/CBCNews.json b/tests/resources/parser/test_data/ca/CBCNews.json new file mode 100644 index 00000000..42959f31 --- /dev/null +++ b/tests/resources/parser/test_data/ca/CBCNews.json @@ -0,0 +1,83 @@ +{ + "V1": { + "authors": [ + "Yasmine Hassan" + ], + "body": { + "summary": [ + "The appointment came days after Ismail Haniyeh was assassinated in Tehran" + ], + "sections": [ + { + "headline": [], + "paragraphs": [ + "Hours after Yahya Sinwar was named the new leader of Hamas's political bureau on Tuesday, many in Gaza wondered how the appointment would affect the war and ceasefire negotiations with Israel.", + "The announcement, posted on Hamas's Telegram channel soon after former leader Ismail Haniyeh was killed in Iran, was seen as a defiant move from the group. Israel has characterized Sinwar as the \"mastermind\" behind the Oct. 7 attacks on southern Israel, which Israeli figures say killed 1,200 and took over 250 hostages into Gaza.", + "Sinwar, 61, has led Hamas in the Gaza Strip since 2017. But his background leans more in military rather than in politics, and his methods are seen as more extreme than his predecessor's.", + "That has created questions over how Sinwar will manage negotiations, and how Israel will negotiate with the man who they say orchestrated the attacks — and whom they've vowed to kill.", + "Many Palestinians interviewed in Gaza expressed similar concern over the promotion, although some welcomed the move. 
The news comes during a time of tense negotiation to end a war that has devastated the region and killed more than 39,000, according to Palestinian tallies, over the past 10 months." + ] + }, + { + "headline": [ + "Palestinians react" + ], + "paragraphs": [ + "Jamil Al Saadouni, 58, told CBC freelance videographer Mohamed El Saife in Khan Younis that Sinwar's appointment was \"an internal decision.\"", + "He lamented the fact that Palestinian civilians, who are directly impacted by the war in Gaza, were not consulted on the best replacement for Haniyeh.", + "\"This has nothing to do with other factions or the Palestinian people.\"", + "Abu Hassan Amer, 44, agreed.", + "\"Choosing a military leadership during this period can harm the negotiations,\" he told El Saife. \"Because as they say, the non-political gun creates roadblocks.\"", + "Sinwar is seen as a \"hard-liner\" even within Hamas, said Matthew Levitt, senior fellow at the Washington Institute for Near East Policy, which was founded in 1985 with support from the American Israel Public Affairs Committee, a pro-Israel lobbying organization.", + "Sinwar served over 20 years in Israeli jails in connection with the killings of two Israeli soldiers and four fellow Palestinians, and was released early in 2011 as part of a prisoner swap. He has been known to hunt down people suspected of collaborating with Israel.", + "Levitt said that because of his time in jail, Sinwar \"understands Israelis.\"", + "\"He learned Hebrew, he spoke with his jailers, and that really showed on Oct. 7, when he understood the trauma that the kidnapping and killing of a large number of people would do for the Israelis,\" he said.", + "By comparison, Haniyeh, who ruled in exile from Qatar, often took a more moderate and pragmatic stance.", + "\"The killing of Haniyeh already brought negotiations back to the drawing board,\" Lina Khatib, an expert on the conflict at U.K. think-tank Chatham House, told the AP in an interview. 
\"The next chess move by Hamas makes negotiations even trickier.\"", + "Haniyeh was killed by an airstrike in Tehran, where he was attending the inauguration of Iran's new president. While Hamas and Iran have blamed Israel for the strike, Israel has not claimed responsibility for it." + ] + }, + { + "headline": [ + "A military man in politics" + ], + "paragraphs": [ + "Some in Gaza welcomed the news of Sinwar's promotion, saying they needed someone to defend them.", + "\"Choosing him from the stance of Palestine is a good choice,\" Abu Anas Al Saud told El Saife. \"We need someone to defend the land that was stolen.\"", + "But Al Saud is aware of the effect Sinwar may have on ceasefire talks.", + "\"He's the most wanted man to Israel,\" he said. \"It will not advance negotiations at all.\"", + "Sinwar only made rare appearances before the war. He hasn't been seen in public since Oct. 7, and is thought to be hiding deep in tunnels beneath the Gaza Strip. Mediators say it takes several days to exchange messages with him, raising questions on how he will now manage Hamas as its international face.", + "Sinwar \"is someone who grew up within the brigade and the militant terrorist wing of Hamas,\" said Levitt.", + "However, while Sinwar's promotion might seem like a direct \"challenge to Israel,\" a deal was still possible, Sadeq Abu Amer told the AP. 
He noted that Sinwar \"might take a step that will surprise everyone.\" Abu Amer is the head of the Palestinian Dialogue Group in Turkey, which says on its site that it aims to \"protect the historical rights of the Palestinian people.\"", + "And while the assassination of Haniyeh makes a difference \"in the immediate,\" Levitt said, in the long term, both sides are still looking for a deal.", + "\"The same factors that were driving Hamas towards the deal and separately driving the Israeli prime minister to a deal are still there.\"" + ] + }, + { + "headline": [ + "'There is only one place for Yahya Sinwar'" + ], + "paragraphs": [ + "On Tuesday, Israel's chief military spokesperson, Rear Admiral Daniel Hagari, said Sinwar's appointment would not stop Israel from pursuing him.", + "\"There is only one place for Yahya Sinwar, and it is beside Mohammed Deif and the rest of the Oct. 7 terrorists,\" he told the Saudi state-owned Al-Arabiya television. \"That is the only place we're preparing and intending for him.\"", + "Amer, in Gaza, stressed the importance of diplomacy before military strength, particularly as negotiations continue between both sides.", + "\"There are rules to resistance, rules to war and rules to peace,\" said Amer. 
\"[And] we need peace in this current moment.\"" + ] + } + ] + }, + "publishing_date": "2024-08-08 08:00:00+00:00", + "title": "What's next for Gaza, after Yahya Sinwar's appointment as Hamas political head?", + "topics": [ + "Israel-Hamas war", + "Iran", + "Israel", + "Tehran", + "Hamas", + "Yahya Sinwar", + "Assassinations", + "Kidnapping", + "War and unrest" + ] + } +} diff --git a/tests/resources/parser/test_data/ca/CBCNews_2024_08_08.html.gz b/tests/resources/parser/test_data/ca/CBCNews_2024_08_08.html.gz new file mode 100644 index 00000000..22aeece3 Binary files /dev/null and b/tests/resources/parser/test_data/ca/CBCNews_2024_08_08.html.gz differ diff --git a/tests/resources/parser/test_data/ca/meta.info b/tests/resources/parser/test_data/ca/meta.info new file mode 100644 index 00000000..50821391 --- /dev/null +++ b/tests/resources/parser/test_data/ca/meta.info @@ -0,0 +1,6 @@ +{ + "CBCNews_2024_08_08.html.gz": { + "url": "https://www.cbc.ca/news/world/gaza-israel-ceasefire-negotiations-sinwar-1.7287711?cmp=rss", + "crawl_date": "2024-08-08 23:53:17.604667" + } +}