Skip to content

Commit

Permalink
Merge pull request #628 from flairNLP/add-yumiuri
Browse files Browse the repository at this point in the history
Add `Yomiuri Shimbun`
  • Loading branch information
addie9800 authored Oct 16, 2024
2 parents 6174d66 + 186c6b4 commit 6ba4e1a
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 1 deletion.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1121,6 +1121,21 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>YomiuriShimbun</code>
</td>
<td>
<div>Yomiuri Shimbun</div>
</td>
<td>
<a href="https://www.yomiuri.co.jp/">
<span>www.yomiuri.co.jp</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
</tbody>
</table>

Expand Down
13 changes: 12 additions & 1 deletion src/fundus/publishers/jp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.jp.thejapannews import TheJapanNewsParser
from fundus.publishers.jp.the_japan_news import TheJapanNewsParser
from fundus.publishers.jp.yomiuri_shimbun import YomiuriShimbunParser
from fundus.scraping.filter import regex_filter
from fundus.scraping.url import NewsMap, Sitemap

Expand All @@ -17,3 +18,13 @@ class JP(metaclass=PublisherGroup):
NewsMap("https://japannews.yomiuri.co.jp/sitemap-news.xml"),
],
)

# Publisher entry for the Japanese-language Yomiuri Shimbun site (www.yomiuri.co.jp),
# parsed with YomiuriShimbunParser.
YomiuriShimbun = Publisher(
name="Yomiuri Shimbun",
domain="https://www.yomiuri.co.jp/",
parser=YomiuriShimbunParser,
sources=[
# NOTE(review): the regex filter presumably keeps the general sitemap crawl from
# descending into "sitemap-news-latest", which is listed as a NewsMap below —
# confirm against the semantics of `sitemap_filter` / `regex_filter`.
Sitemap("https://www.yomiuri.co.jp/sitemap.xml", sitemap_filter=regex_filter("sitemap-news-latest")),
NewsMap("https://www.yomiuri.co.jp/sitemap-news-latest.xml"),
],
)
46 changes: 46 additions & 0 deletions src/fundus/publishers/jp/the_japan_news.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import datetime
import re
from typing import List, Optional

from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


class TheJapanNewsParser(ParserProxy):
    """Parser proxy for The Japan News; defines a single parser version, ``V1``."""

    class V1(BaseParser):
        """Reads article attributes from the precomputed document, meta data and linked data."""

        # <h2> elements that are direct children of the article block.
        _subheadline_selector = XPath("//div[@id='p-article-block']/h2")
        # <p> elements anywhere inside the article block that carry no class attribute.
        _paragraph_selector = XPath("//div[@id='p-article-block']//p[not(@class)]")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the paragraph and sub-headline selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the value stored under ``og:title`` in the precomputed meta data."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``article:published_time`` meta value with ``generic_date_parsing``."""
            published_time = self.precomputed.meta.get("article:published_time")
            return generic_date_parsing(published_time)

        @attribute
        def authors(self) -> List[str]:
            """Parse the linked-data ``author`` entry with ``generic_author_parsing``."""
            author_entry = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(author_entry)

        @attribute
        def topics(self) -> List[str]:
            """Return the linked-data ``keywords`` as topics, cleaned of numeric suffixes.

            Every parenthesised run of digits (e.g. ``"(12)"``) is removed from a
            topic before surrounding whitespace is stripped.
            """
            keyword_entry = self.precomputed.ld.bf_search("keywords")
            cleaned_topics: List[str] = []
            for raw_topic in generic_topic_parsing(keyword_entry):
                without_counter = re.sub(r"\([0-9]+\)", "", raw_topic)
                cleaned_topics.append(without_counter.strip())
            return cleaned_topics
43 changes: 43 additions & 0 deletions src/fundus/publishers/jp/yomiuri_shimbun.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import datetime
import re
from typing import List, Optional

from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


class YomiuriShimbunParser(ParserProxy):
    """Parser proxy for Yomiuri Shimbun; defines a single parser version, ``V1``."""

    class V1(BaseParser):
        """Reads article attributes from the precomputed document, meta data and linked data."""

        # <p> elements that are direct children of the main-contents <div>.
        # NOTE(review): the class value ends with a space and `@class='...'` is an exact
        # string comparison, so this presumably mirrors the site's markup — confirm.
        _paragraph_selector = XPath("//div[@class='p-main-contents ']/p")

        # Anchors of the list items inside the related-tags <div>.
        _topic_selector = XPath("//div[@class='p-related-tags']/ul/li/a")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the paragraph selector only."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the value stored under ``og:title`` in the precomputed meta data."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``article:published_time`` meta value with ``generic_date_parsing``."""
            published_time = self.precomputed.meta.get("article:published_time")
            return generic_date_parsing(published_time)

        @attribute
        def authors(self) -> List[str]:
            """Parse the linked-data ``author`` entry with ``generic_author_parsing``."""
            author_entry = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(author_entry)

        @attribute
        def topics(self) -> List[str]:
            """Return the text content of every node matched by the topic selector."""
            tag_nodes = self._topic_selector(self.precomputed.doc)
            tag_texts: List[str] = []
            for tag_node in tag_nodes:
                tag_texts.append(tag_node.text_content())
            return tag_texts
26 changes: 26 additions & 0 deletions tests/resources/parser/test_data/jp/YomiuriShimbun.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"V1": {
"authors": [
"読売新聞オンライン"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"プロ野球・阪神タイガースの次期監督に球団OBの藤川球児氏(44)が就任することが分かった。藤川氏の内諾を得ているといい、球団幹部は「勝ち続けることと若手育成の観点から選んだ」と語った。近く発表される予定。阪神は13日、クライマックスシリーズで敗退。岡田 彰布 ( あきのぶ ) 監督(66)は今季で退任する。",
"藤川氏は1999年、高知商高(高知)からドラフト1位で阪神に入団。2005年、当時指揮を執った岡田監督のもと、救援投手として80試合に登板し、セ・リーグ優勝に貢献。ジェフ・ウィリアムス、久保田智之両投手と形成したリリーフ陣は、それぞれの頭文字から「JFK」と呼ばれた。",
"12年オフに米大リーグ挑戦を表明。独立リーグの四国アイランドリーグplus・高知ファイティングドッグスを経て、16年に阪神へ復帰した。20年の現役引退後、球団フロントに加わり、外国人選手の補強などに関わった。日本で最多セーブのタイトルを2度獲得し、日米通算成績は61勝39敗245セーブ。〈関連記事スポーツ面〉"
]
}
]
},
"publishing_date": "2024-10-13 22:43:00+09:00",
"title": "阪神新監督に藤川氏 ",
"topics": [
"#阪神",
"#日本"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/jp/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,9 @@
"TheJapanNews_2024_10_13.html.gz": {
"url": "https://japannews.yomiuri.co.jp/politics/politics-government/20241013-216478/",
"crawl_date": "2024-10-13 16:27:01.520980"
},
"YomiuriShimbun_2024_10_13.html.gz": {
"url": "https://www.yomiuri.co.jp/local/kansai/news/20241013-OYO1T50044/",
"crawl_date": "2024-10-13 16:52:57.081306"
}
}

0 comments on commit 6ba4e1a

Please sign in to comment.