Skip to content

Commit

Permalink
Merge pull request #473 from BanoMarvey/master
Browse files Browse the repository at this point in the history
Added Daily Mail as a publisher
  • Loading branch information
MaxDall authored May 6, 2024
2 parents 725e4b3 + 9ed83f4 commit 26ea2c4
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 2 deletions.
17 changes: 16 additions & 1 deletion docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
<table class="publishers au">
<thead>
<tr>
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Source&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Missing&#160;Attributes</th>
Expand Down Expand Up @@ -815,6 +815,21 @@
</tr>
</thead>
<tbody>
<tr>
<td>
<code>DailyMail</code>
</td>
<td>
<div>Daily Mail</div>
</td>
<td>
<a href="https://www.dailymail.co.uk/">
<span>www.dailymail.co.uk</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>DailyStar</code>
Expand Down
18 changes: 17 additions & 1 deletion src/fundus/publishers/uk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from datetime import date
from datetime import date, datetime

from dateutil.rrule import YEARLY, rrule

from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.url import NewsMap, Sitemap

from ..shared import EuronewsParser
from .daily_mail import DailyMailParser
from .daily_star import DailyStarParser
from .i_news import INewsParser
from .the_guardian import TheGuardianParser
Expand Down Expand Up @@ -97,3 +100,16 @@ class UK(PublisherEnum):
url_filter=regex_filter("sun-bingo|web-stories"),
parser=TheSunParser,
)

# Daily Mail: one Google News sitemap plus one article sitemap per calendar
# year, starting with 2021 and running up to the year in which this module
# is imported.
DailyMail = PublisherSpec(
    name="Daily Mail",
    domain="https://www.dailymail.co.uk/",
    sources=[
        NewsMap("https://www.dailymail.co.uk/google-news-sitemap.xml"),
        # rrule yields one datetime per year; only its `.year` ends up in the URL.
        *(
            Sitemap(f"https://www.dailymail.co.uk/sitemap-articles-year~{moment.year}.xml")
            for moment in rrule(YEARLY, dtstart=datetime(2021, 1, 1), until=datetime.today())
        ),
    ],
    parser=DailyMailParser,
)
45 changes: 45 additions & 0 deletions src/fundus/publishers/uk/daily_mail.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


class DailyMailParser(ParserProxy):
    # Parser proxy for Daily Mail articles; currently holds a single version, V1.

    class V1(BaseParser):
        # Paragraphs are the direct <p> children of the articleBody container.
        _paragraph_selector = CSSSelector("div[itemprop='articleBody'] > p")
        # NOTE(review): defined but not referenced by any attribute in this class.
        _summary_selector = CSSSelector("#js-article-text > h1")

        @attribute
        def body(self) -> ArticleBody:
            # Only a paragraph selector is handed to the extraction helper.
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            # Looks up `datePublished` in the precomputed `ld` data.
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def authors(self) -> List[str]:
            # Looks up `author` in the precomputed `ld` data.
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def title(self) -> Optional[str]:
            # Taken from the `og:title` meta entry; None when it is absent.
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def topics(self) -> List[str]:
            # Keep only keywords that differ from their case-folded form, i.e.
            # drop entries that are already entirely case-folded (all lowercase).
            keywords = generic_topic_parsing(self.precomputed.meta.get("keywords"))
            return [keyword for keyword in keywords if keyword != keyword.casefold()]
42 changes: 42 additions & 0 deletions tests/resources/parser/test_data/uk/DailyMail.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"V1": {
"authors": [
"Dolores Chang"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"Think twice before you spend any $1 bill in your wallet, as it could fetch thousands of dollars for you.",
"Currency collectors nationwide are on hunt for some rare dollar bills, willing to pay up to $150,000 for those with a specific printing error.",
"According to the personal finance blog Wealthynickel, two batches of $1 bills printed in 2014 and 2016 contain this particular error from the US Bureau of Engraving and Printing.",
"'It's very rare that the Federal Reserve would mess up an order, and then it reaches circulation,' Chad Hawk, vice president of PMG, a paper money grading company in Florida told Fox.",
"Scroll down to see how to identify the rare bucks worth thousands",
"Typically, every bill in circulation needs a unique serial number to identify it, but the US Bureau of Engraving and Printing had a miscommunication with federal banks.",
"This resulted in 6.4 million pairs of $1 bills with matching serial numbers being circulated before the mistake was noticed by the Federal Reserve.",
"While the first batch was issued in New York and the second was issued in Washington, D.C., these bills could now be anywhere in the world.",
"'In the last two or three years, people started to discover the error. The community, through social media, has been able to connect,' Hawk said.",
"'And people have been able to pair up their notes in a lot of ways. The last pairing I think I saw sold for about $6,000,' he added.",
"Only nine of these pairs have been matched, leaving millions of rare $1 bills out there.",
"According to Wealthynickel, currency collecting companies are willing to pay between $20,000 and $150,000 for a pair from the two batches.",
"Here's what to look for:",
"If you're fortunate enough to have one of these $1 bills, the next step is to find the other bill with a matching serial number.",
"According to Hawk, the best approach is to utilize social media.",
"'The best thing to do is look online, go on social media — and there are actually websites dedicated to this,' he said.",
"'You can find outlets where people are collecting the data, so you can see if notes are out there already.",
"'If someone's already reported this number, you might be able to pair up with someone looking for this number. They may be willing to pay a big premium for that,' he said."
]
}
]
},
"publishing_date": "2024-04-27 15:56:35+01:00",
"title": "Your $1 bill could be worth up THOUSANDS - here's how to check",
"topics": [
"Florida",
"Federal Reserve",
"New York"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/uk/meta.info
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
{
"DailyMail_2024_04_27.html.gz": {
"url": "https://www.dailymail.co.uk/yourmoney/article-13357113/dollar-bill-worth-THOUSANDS-check.html",
"crawl_date": "2024-04-27 17:00:10.853579"
},
"DailyStar_2024_04_23.html.gz": {
"url": "https://www.dailystar.co.uk/travel/travel-news/i-sell-walt-disney-world-32645619",
"crawl_date": "2024-04-23 13:00:40.272329"
Expand Down

0 comments on commit 26ea2c4

Please sign in to comment.