-
Notifications
You must be signed in to change notification settings - Fork 75
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #628 from flairNLP/add-yumiuri
Add `Yomiuri Shimbun`
- Loading branch information
Showing
7 changed files
with
146 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import datetime | ||
import re | ||
from typing import List, Optional | ||
|
||
from lxml.etree import XPath | ||
|
||
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute | ||
from fundus.parser.utility import ( | ||
extract_article_body_with_selector, | ||
generic_author_parsing, | ||
generic_date_parsing, | ||
generic_topic_parsing, | ||
) | ||
|
||
|
||
class TheJapanNewsParser(ParserProxy):
    """Parser proxy for The Japan News; groups the versioned parser classes."""

    class V1(BaseParser):
        """Version 1 parser: body from the ``p-article-block`` div, metadata from meta/LD data."""

        # Paragraphs are the <p> descendants of the article block that carry no
        # class attribute; subheadlines are the block's direct <h2> children.
        _paragraph_selector = XPath("//div[@id='p-article-block']//p[not(@class)]")
        _subheadline_selector = XPath("//div[@id='p-article-block']/h2")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body from the precomputed document using both selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``og:title`` entry of the precomputed meta data."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``article:published_time`` meta entry with ``generic_date_parsing``."""
            published_time = self.precomputed.meta.get("article:published_time")
            return generic_date_parsing(published_time)

        @attribute
        def authors(self) -> List[str]:
            """Run ``generic_author_parsing`` on the ``author`` value found in the LD data."""
            author_entry = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(author_entry)

        @attribute
        def topics(self) -> List[str]:
            """Return the parsed ``keywords`` topics with ``(<digits>)`` groups removed."""
            keyword_entry = self.precomputed.ld.bf_search("keywords")
            cleaned_topics: List[str] = []
            for raw_topic in generic_topic_parsing(keyword_entry):
                # Remove parenthesised digit runs such as "(12)", then trim
                # the surrounding whitespace.
                without_counts = re.sub(r"\([0-9]+\)", "", raw_topic)
                cleaned_topics.append(without_counts.strip())
            return cleaned_topics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import datetime | ||
import re | ||
from typing import List, Optional | ||
|
||
from lxml.etree import XPath | ||
|
||
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute | ||
from fundus.parser.utility import ( | ||
extract_article_body_with_selector, | ||
generic_author_parsing, | ||
generic_date_parsing, | ||
generic_topic_parsing, | ||
) | ||
|
||
|
||
class YomiuriShimbunParser(ParserProxy):
    """Parser proxy for Yomiuri Shimbun; groups the versioned parser classes."""

    class V1(BaseParser):
        """Version 1 parser: body and topics from the page markup, the rest from meta/LD data."""

        # Topic links live in the list inside the ``p-related-tags`` div.
        _topic_selector = XPath("//div[@class='p-related-tags']/ul/li/a")

        # NOTE: the class value is matched exactly, including its trailing space.
        _paragraph_selector = XPath("//div[@class='p-main-contents ']/p")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body from the precomputed document via the paragraph selector."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``og:title`` entry of the precomputed meta data."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``article:published_time`` meta entry with ``generic_date_parsing``."""
            published_time = self.precomputed.meta.get("article:published_time")
            return generic_date_parsing(published_time)

        @attribute
        def authors(self) -> List[str]:
            """Run ``generic_author_parsing`` on the ``author`` value found in the LD data."""
            author_entry = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(author_entry)

        @attribute
        def topics(self) -> List[str]:
            """Return the text content of every node matched by the topic selector."""
            topic_texts: List[str] = []
            for tag_node in self._topic_selector(self.precomputed.doc):
                # The text is taken as-is; no stripping or prefix removal.
                topic_texts.append(tag_node.text_content())
            return topic_texts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
{ | ||
"V1": { | ||
"authors": [ | ||
"読売新聞オンライン" | ||
], | ||
"body": { | ||
"summary": [], | ||
"sections": [ | ||
{ | ||
"headline": [], | ||
"paragraphs": [ | ||
"プロ野球・阪神タイガースの次期監督に球団OBの藤川球児氏(44)が就任することが分かった。藤川氏の内諾を得ているといい、球団幹部は「勝ち続けることと若手育成の観点から選んだ」と語った。近く発表される予定。阪神は13日、クライマックスシリーズで敗退。岡田 彰布 ( あきのぶ ) 監督(66)は今季で退任する。", | ||
"藤川氏は1999年、高知商高(高知)からドラフト1位で阪神に入団。2005年、当時指揮を執った岡田監督のもと、救援投手として80試合に登板し、セ・リーグ優勝に貢献。ジェフ・ウィリアムス、久保田智之両投手と形成したリリーフ陣は、それぞれの頭文字から「JFK」と呼ばれた。", | ||
"12年オフに米大リーグ挑戦を表明。独立リーグの四国アイランドリーグplus・高知ファイティングドッグスを経て、16年に阪神へ復帰した。20年の現役引退後、球団フロントに加わり、外国人選手の補強などに関わった。日本で最多セーブのタイトルを2度獲得し、日米通算成績は61勝39敗245セーブ。〈関連記事スポーツ面〉" | ||
] | ||
} | ||
] | ||
}, | ||
"publishing_date": "2024-10-13 22:43:00+09:00", | ||
"title": "阪神新監督に藤川氏 ", | ||
"topics": [ | ||
"#阪神", | ||
"#日本" | ||
] | ||
} | ||
} |
Binary file added
BIN
+72.8 KB
tests/resources/parser/test_data/jp/YomiuriShimbun_2024_10_13.html.gz
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters