Skip to content

Commit

Permalink
Merge pull request #453 from brandjakHU/master
Browse files Browse the repository at this point in the history
Added US publisher Rolling Stone
  • Loading branch information
MaxDall authored May 6, 2024
2 parents f7ba274 + 0107355 commit 11a34c1
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 1 deletion.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,21 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>RollingStone</code>
</td>
<td>
<div>Rolling Stone</div>
</td>
<td>
<a href="https://www.rollingstone.com/">
<span>www.rollingstone.com</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>TheGatewayPundit</code>
Expand Down
16 changes: 15 additions & 1 deletion src/fundus/publishers/us/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.filter import inverse, lor, regex_filter
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

from .ap_news import APNewsParser
Expand All @@ -10,6 +10,7 @@
from .la_times import LATimesParser
from .occupy_democrats import OccupyDemocratsParser
from .reuters import ReutersParser
from .rolling_stone import RollingStoneParser
from .the_gateway_pundit import TheGatewayPunditParser
from .the_intercept import TheInterceptParser
from .the_nation_parser import TheNationParser
Expand Down Expand Up @@ -164,3 +165,16 @@ class US(PublisherEnum):
],
parser=BusinessInsiderParser,
)

# Publisher entry for Rolling Stone: articles are discovered through one news
# map and one sitemap index, and parsed with RollingStoneParser.
RollingStone = PublisherSpec(
name="Rolling Stone",
domain="https://www.rollingstone.com/",
sources=[
NewsMap("https://www.rollingstone.com/news-sitemap.xml"),
Sitemap(
"https://www.rollingstone.com/sitemap_index.xml",
# The filter is inverse(lor(a, b)) over two regex filters on
# "/pmc_list-sitemap" and "/post-sitemap".
# NOTE(review): presumably this restricts crawling to sub-sitemaps whose
# URL matches one of those two patterns -- confirm against the semantics
# of `sitemap_filter`, `inverse` and `lor` in fundus.scraping.filter.
sitemap_filter=inverse(lor(regex_filter("/pmc_list-sitemap"), regex_filter("/post-sitemap"))),
),
],
parser=RollingStoneParser,
)
44 changes: 44 additions & 0 deletions src/fundus/publishers/us/rolling_stone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


class RollingStoneParser(ParserProxy):
    """Parser proxy bundling the versioned parsers for Rolling Stone."""

    class V1(BaseParser):
        """First parser version for Rolling Stone article pages.

        The article body is located with CSS selectors; every other
        attribute is read from the precomputed meta mapping.
        """

        # <p class="paragraph"> elements nested inside <div class="a-content">.
        _paragraph_selector = CSSSelector("div.a-content p.paragraph")
        # <div class="article-excerpt"> elements, used as the summary.
        _summary_selector = CSSSelector("div.article-excerpt")
        # Sub-headlines: either <h2 class="heading"> inside the content div,
        # or any <h2> inside the element with id "pmc-gallery-vertical".
        _subheadline_selector = CSSSelector(
            "div.a-content h2.heading,div.a-content div#pmc-gallery-vertical h2"
        )

        @attribute
        def body(self) -> ArticleBody:
            """Build the article body from the parsed document using the class selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Parse the ``author`` meta entry into a list of author names."""
            raw_authors = self.precomputed.meta.get("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``published_at`` meta entry into a datetime, if possible."""
            raw_date = self.precomputed.meta.get("published_at")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``title`` meta entry, or ``None`` when it is absent."""
            meta = self.precomputed.meta
            return meta.get("title")

        @attribute
        def topics(self) -> List[str]:
            """Parse the ``topics`` meta entry into a list of topic strings."""
            raw_topics = self.precomputed.meta.get("topics")
            return generic_topic_parsing(raw_topics)
32 changes: 32 additions & 0 deletions tests/resources/parser/test_data/us/RollingStone.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"V1": {
"authors": [
"Emily Zemler"
],
"body": {
"summary": [
"\"She is a very talented woman,\" she said of her son Travis Kelce's girlfriend"
],
"sections": [
{
"headline": [],
"paragraphs": [
"Donna Kelce has praised Taylor Swift‘s new album The Tortured Poets Department. Speaking to People at the Age of Possibility summit in Las Vegas, Travis Kelce‘s mom confirmed, “I listened to the whole album, and I listened to it all morning long when it was released.”",
"She added, “I was just very impressed. She is a very talented woman, and I think it is probably her best work.”",
"However, when asked if she had any advice for Swift about growing older, Donna dismissed the idea, noting, “She doesn’t need my advice on anything. In fact, I hope she will give me advice.”",
"Since the double album’s release last week, fans have speculated that at least two tracks, “The Alchemy” and “So High School,” reference her current relationship with Travis. On “The Alchemy” Swift makes several references to football, singing, “These blokes warm the benches/ We’ve been on a winning streak/ He jokes that ‘It’s heroin, but this time with an E’/ ‘Cause the sign on your heart said it’s still reserved for me/ Honestly, who are we to fight the alchemy?”",
"Donna and Swift spent time together throughout the 2023-24 NFL season as they cheered for Travis and the Kansas City Chiefs, who ultimately claimed victory at the Super Bowl. Donna previously spoke about hanging out with Swift on the Got It From My Momma podcast, saying it was a truly surreal experience.",
"“I feel like I’m in an alternate universe, because it’s something I’ve never been involved with before,” Donna said. “Every week just seems to trump the week before. So it’s really kind of wild, a wild ride.” Trending Billie Eilish Would Like to Reintroduce Herself Team Trump Is Ready to Lose the Supreme Court Immunity Case. They’re Celebrating Taylor Swift and Jack Antonoff Have Reached Their Limit Kanye West Announces 'Yeezy Porn' Amid Reports of Adult Film Company",
"Swift and Travis recently attended Coachella together, where they were spotted at sets for Ice Spice, Dom Dolla, and more. The football star recapped the weekend during an episode of his New Heights podcast, which he co-hosts with his brother Jason.",
"“I like to see it from the fans’ perspective,” Kelce shared. “Because I am a fan of music, I’m a fan of live shows. I want to see it from the front of the stage. We probably could have finessed it that way, but I think it’s just that much more of an experience if you’re in the pit, man, if you’re in the madness with all the fans.”"
]
}
]
},
"publishing_date": "2024-04-25 04:21:02",
"title": "Donna Kelce Calls Taylor Swift's ‘The Tortured Poets Department’ Her 'Best Work'",
"topics": [
"Music News"
]
}
}
Binary file not shown.
8 changes: 8 additions & 0 deletions tests/resources/parser/test_data/us/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,14 @@
"url": "https://www.reuters.com/world/middle-east/israeli-tank-likely-scenario-fired-machine-gun-reporters-after-deadly-shelling-2024-03-07/",
"crawl_date": "2024-03-07 13:30:03.931664"
},
"RollingStone_2024_04_24.html.gz": {
"url": "https://www.rollingstone.com/tv-movies/tv-movie-features/shogun-finale-review-1235007316/",
"crawl_date": "2024-04-24 16:34:23.763178"
},
"RollingStone_2024_04_25.html.gz": {
"url": "https://www.rollingstone.com/music/music-news/donna-kelce-taylor-swift-the-tortured-poets-department-best-work-1235010328/",
"crawl_date": "2024-04-25 11:47:22.799761"
},
"TheGatewayPundit_2023_04_28.html.gz": {
"url": "https://www.thegatewaypundit.com/2023/04/new-plot-twist-in-idaho-quadruple-murders-case-surviving-roommate-agrees-to-interview-with-kohbergers-defense-lawyers/",
"crawl_date": "2023-04-28 20:33:02.500510"
Expand Down

0 comments on commit 11a34c1

Please sign in to comment.