diff --git a/recipes/afr.recipe b/recipes/afr.recipe index e91d7cd420f4..e3ad295dc5d3 100644 --- a/recipes/afr.recipe +++ b/recipes/afr.recipe @@ -1,6 +1,10 @@ -from calibre.ptempfile import PersistentTemporaryFile +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe +from datetime import date +def absurl(url): + if url.startswith('/'): + return 'https://www.afr.com' + url class afr(BasicNewsRecipe): title = 'Australian Financial Review' @@ -40,34 +44,10 @@ class afr(BasicNewsRecipe): [data-testid="AuthorNames"], [data-testid="ArticleTimestamp"] {font-size:small;} ''' - ignore_duplicate_articles = {'title'} + ignore_duplicate_articles = {'title', 'url'} resolve_internal_links = True remove_empty_feeds = True - articles_are_obfuscated = True - - def get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/media/', 'podcast-' - ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article ', link['href']) - self.abort_article('skipping video links') - - self.log('Downloading ', link['href']) - html = br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - pt.close() - return pt.name - def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'data-src':True}): img['src'] = img['data-src'] @@ -75,15 +55,28 @@ class afr(BasicNewsRecipe): fig['id'] = 'img-cap' return soup - feeds = [] - - sections = [ - 'companies', 'market', 'politics', 'policy', 'world', 'wealth', 'street-talk', - 'chaticleer', 'rear-window', 'life-and-luxury', 'technology', 'property', - 'work-and-careers', - ] - - for sec in sections: - a = 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.afr.com{}&hl=en-AU&gl=AU&ceid=AU:en' - feeds.append((sec.capitalize(), 
a.format('%2F' + sec + '%2F'))) - feeds.append(('Others', a.format(''))) + def parse_index(self): + index = 'https://www.afr.com/' + sections = [ + 'companies', 'market', 'politics', 'policy', 'world', 'wealth', 'street-talk', + 'chaticleer', 'rear-window', 'life-and-luxury', 'technology', 'property', + 'work-and-careers', + ] + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith('/' + sec + '/')}): + url = absurl(a['href'].split('?')[0]) + if url in {index + sec + '/', index + sec}: + continue + if date.today().strftime('%Y') not in url: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds diff --git a/recipes/bar_and_bench.recipe b/recipes/bar_and_bench.recipe index d54fb41ede7b..9fe81716bb3a 100644 --- a/recipes/bar_and_bench.recipe +++ b/recipes/bar_and_bench.recipe @@ -1,4 +1,4 @@ -from calibre.ptempfile import PersistentTemporaryFile +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes @@ -38,37 +38,24 @@ class bar(BasicNewsRecipe): resolve_internal_links = True remove_empty_feeds = True - articles_are_obfuscated = True - - def get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/media/', 'podcast-' + def parse_index(self): + index = 'https://www.barandbench.com/' + sections = [ + 'news', 'columns', 'interviews', 'law-firms', 'apprentice-lawyer', 'legal-jobs' ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article ', link['href']) - self.abort_article('skipping video links') 
- - self.log('Downloading ', link['href']) - html = br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - pt.close() - return pt.name - - feeds = [] - - sections = [ - 'news', 'columns', 'interviews', 'law-firms', 'apprentice-lawyer', 'legal-jobs' - ] - - for sec in sections: - a = 'https://news.google.com/rss/search?q=when:27h+allinurl:barandbench.com{}&hl=en-IN&gl=IN&ceid=IN:en' - feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F'))) - feeds.append(('Others', a.format(''))) + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(index + sec + '/')}): + url = a['href'].split('?')[0] + if url in {index + sec + '/', index + sec}: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds diff --git a/recipes/deccan_herald.recipe b/recipes/deccan_herald.recipe index 6f1523a82011..bcf6bb12c225 100644 --- a/recipes/deccan_herald.recipe +++ b/recipes/deccan_herald.recipe @@ -1,7 +1,10 @@ -from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe, classes +def absurl(url): + if url.startswith('/'): + return 'https://www.deccanherald.com' + url + class herald(BasicNewsRecipe): title = 'Deccan Herald' __author__ = 'unkn0wn' @@ -11,38 +14,13 @@ class herald(BasicNewsRecipe): remove_attributes = ['height', 'width', 'style'] ignore_duplicate_articles = {'url', 'title'} encoding = 'utf-8' - - articles_are_obfuscated = True - def get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', 
'/bengaluru-crime/', '/metrolife/', - '/karnataka-districts/', '/brandspot/', '/entertainment/', - ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article ', link['href']) - self.abort_article('skipping section') - - self.log('Downloading ', link['href']) - html = br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - pt.close() - return pt.name - keep_only_tags = [ classes('article-title article-author__name'), dict(name='div', attrs={'id':'main-content'}) ] - + remove_tags = [ classes( 'storyShare social-media-icons in_article_video static_text' @@ -50,17 +28,26 @@ class herald(BasicNewsRecipe): ' field-name-field-tags section-full strip--business' ) ] - - feeds = [ - ('Nation', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fnational%2F&hl=en-IN&gl=IN&ceid=IN:en'), - ('Karnataka', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fstate%2F&hl=en-IN&gl=IN&ceid=IN:en'), - ('Opinion', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fopinion%2F&hl=en-IN&gl=IN&ceid=IN:en'), - ('City', - 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fcity%2F&hl=en-IN&gl=IN&ceid=IN:en'), - ('Business', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fbusiness%2F&hl=en-IN&gl=IN&ceid=IN:en'), - ('World', - 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Finternational%2F&hl=en-IN&gl=IN&ceid=IN:en'), - ('Sports', - 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fsports%2F&hl=en-IN&gl=IN&ceid=IN:en'), - ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com&hl=en-IN&gl=IN&ceid=IN:en'), - ] \ No newline at end of file + + def parse_index(self): + index = 'https://www.deccanherald.com/' + sections = [ + 'india', 'world', 'elections', 'opinion', 'specials', 'business', 'sports' + ] + feeds = [] + + for sec in sections: + 
soup = self.index_to_soup(index + sec) + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith('/' + sec + '/')}): + url = absurl(a['href'].split('?')[0]) + if url in {index + sec + '/', index + sec}: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds diff --git a/recipes/firstpost.recipe b/recipes/firstpost.recipe index 3bc56eb46fae..75e62501a4c0 100644 --- a/recipes/firstpost.recipe +++ b/recipes/firstpost.recipe @@ -1,9 +1,6 @@ -from calibre.ptempfile import PersistentTemporaryFile +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes -# Firstpost feeds mix sections into other feeds, like explainers end up in opinion feed and opinions end up in India feed. -# change google_feeds to True to fetch right sections. -google_feeds = False class firstpost(BasicNewsRecipe): title = 'Firstpost' @@ -43,35 +40,11 @@ class firstpost(BasicNewsRecipe): 'world', 'web-stories', 'tech', 'artandculture', 'health', 'health-supplement', # 'photos', 'entertainment', 'living', 'education', 'sports', 'firstcricket', ] - if not google_feeds: - oldest_article = 1.2 # days - for sec in sections: - a = 'https://www.firstpost.com/rss/{}.xml' - feeds.append((sec.capitalize(), a.format(sec))) - else: - articles_are_obfuscated = True - def get_obfuscated_article(self, url): - br = self.get_browser() - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/media/', '/vantage/' - ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article ', link['href']) - self.abort_article('skipping video links') - self.log('Downloading ', link['href']) - html = br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - 
pt.close() - return pt.name - - for sec in sections: - a = 'https://news.google.com/rss/search?q=when:27h+allinurl:firstpost.com{}&hl=en-IN&gl=IN&ceid=IN:en' - feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F'))) - # feeds.append(('Others', a.format(''))) + oldest_article = 1.2 # days + for sec in sections: + a = 'https://www.firstpost.com/rss/{}.xml' + feeds.append((sec.capitalize(), a.format(sec))) def preprocess_html(self, soup): if h2 := soup.find('h2', attrs={'class':'category-name'}): diff --git a/recipes/hamilton_spectator.recipe b/recipes/hamilton_spectator.recipe index f6cb13674064..92180b75ea54 100644 --- a/recipes/hamilton_spectator.recipe +++ b/recipes/hamilton_spectator.recipe @@ -1,10 +1,13 @@ -from calibre.ptempfile import PersistentTemporaryFile +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes ''' Hamilton Spectator Calibre Recipe ''' +def absurl(url): + if url.startswith('/'): + return 'https://www.thespec.com' + url class HamiltonSpectator(BasicNewsRecipe): title = u'Hamilton Spectator' @@ -21,8 +24,7 @@ class HamiltonSpectator(BasicNewsRecipe): remove_attributes = ['style', 'height', 'width'] masthead_url = 'https://bloximages.chicago2.vip.townnews.com/thespec.com/content/tncms/custom/image/c0094646-1108-11ee-8af0-b3954ce40e5e.png' - ignore_duplicate_articles = {'title'} - articles_are_obfuscated = True + ignore_duplicate_articles = {'title', 'url'} extra_css = ''' .caption { font-size:small; text-align:center; } @@ -52,35 +54,26 @@ class HamiltonSpectator(BasicNewsRecipe): img['src'] = x.split()[0] return soup - def get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/media/', 'podcast' + def parse_index(self): + index = 'https://www.thespec.com/' + sections = [ + 'news', 
'politics', 'opinion', 'business', 'sports', 'life', 'entertainment' ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article ', link['href']) - self.abort_article('skipping video links') - - self.log('Downloading ', link['href']) - html = br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - pt.close() - return pt.name - - feeds = [] - - sections = [ - 'news', 'politics', 'opinion', 'business', 'sports', 'life', 'entertainment' - ] - - for sec in sections: - a = 'https://news.google.com/rss/search?q=when:27h+allinurl:thespec.com{}&hl=en-CA&gl=IN&ceid=CA:en' - feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F'))) - feeds.append(('Others', a.format(''))) + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith('/' + sec + '/')}): + url = absurl(a['href'].split('#')[0]) + if url in {index + sec + '/', index + sec}: + continue + if not url.endswith('.html'): + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds diff --git a/recipes/hindutamil.recipe b/recipes/hindutamil.recipe index 0c58577d5787..9c1af06f90c1 100644 --- a/recipes/hindutamil.recipe +++ b/recipes/hindutamil.recipe @@ -1,4 +1,4 @@ -from calibre.ptempfile import PersistentTemporaryFile +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes @@ -22,53 +22,43 @@ class hindutamil(BasicNewsRecipe): keep_only_tags = [ classes('main-article') ] - + remove_tags = [ classes('newsbot-ads article-details-ads-inner art-follow-title1 dont-miss-it') ] - - ignore_duplicate_articles = {'title'} - remove_empty_feeds = True - articles_are_obfuscated = True + ignore_duplicate_articles = {'title', 'url'} + remove_empty_feeds = True - def 
get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/media/' + def parse_index(self): + index = 'https://www.hindutamil.in/' + sections = [ + ('தமிழகம்', 'tamilnadu'), + ('இந்தியா', 'india'), + ('கருத்துப் பேழை', 'opinion'), + ('உலகம்', 'world'), + ('வணிகம்', 'business'), + ('விளையாட்டு', 'sports'), + ('தமிழ் சினிமா', 'cinema'), + ('தொழில்நுட்பம்', 'technology'), + ('இணைப்பிதழ்கள்', 'supplements'), + ('Cartoon', 'cartoon'), + ('Life-style', 'life-style') ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article ', link['href']) - self.abort_article('skipping video links') - - self.log('Downloading ', link['href']) - html = br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - pt.close() - return pt.name - - feeds = [] - - sections = [ - ('தமிழகம்', 'tamilnadu'), - ('இந்தியா', 'india'), - ('கருத்துப் பேழை', 'opinion'), - ('உலகம்', 'world'), - ('வணிகம்', 'business'), - # ('விளையாட்டு', 'sports'), - # ('தமிழ் சினிமா', 'cinema'), - ('தொழில்நுட்பம்', 'technology'), - # ('இணைப்பிதழ்கள்', 'supplements'), - ] - - for sec in sections: - a = 'https://news.google.com/rss/search?q=when:27h+allinurl:hindutamil.in%2Fnews{}&hl=ta-IN&gl=IN&ceid=IN:ta' - feeds.append((sec[0], a.format('%2F' + sec[1] + '%2F'))) - # feeds.append(('Others', a.format(''))) + feeds = [] + soup = self.index_to_soup(index) + index = index + 'news/' + for sec in sections: + section = sec[0] + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(index + sec[1] + '/')}): + url = a['href'] + if url in {index + sec[1] + '/', index + sec[1]}: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + 
if articles: + feeds.append((section, articles)) + return feeds diff --git a/recipes/ifzm.recipe b/recipes/ifzm.recipe index d64fd3c05062..d1fc06f09493 100644 --- a/recipes/ifzm.recipe +++ b/recipes/ifzm.recipe @@ -1,7 +1,11 @@ +#!/usr/bin/env python import json from calibre.web.feeds.news import BasicNewsRecipe +def absurl(url): + if url.startswith('/'): + return 'https://www.infzm.com' + url def json_to_html(raw, link): data = json.loads(raw) @@ -21,24 +25,19 @@ class infzm(BasicNewsRecipe): encoding = 'utf-8' no_stylesheets = True remove_javascript = True - ignore_duplicate_articles = {'title'} + ignore_duplicate_articles = {'title', 'url'} remove_empty_feeds = True use_embedded_content = False remove_attributes = ['style', 'height', 'width'] masthead_url = 'http://ssimg.kkod.cn/web/02/14227.gif' - articles_are_obfuscated = True - remove_tags = [dict(name=['video', 'svg', 'button'])] + articles_are_obfuscated = True + def get_obfuscated_article(self, url): br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True)['href'].split('?')[0] + link = url res_link = link.replace('https://www.infzm.com', 'https://api.infzm.com/mobile') \ + '?platform=wap&version=1.89.0&machine_id=35458aa29603f2b246636e5492122b50&user_id=&token=&member_type=' # if article is paywalled, add code to figure out machine_id @@ -51,12 +50,29 @@ class infzm(BasicNewsRecipe): .cm_pic_caption, .cm_pic_author { font-size:small; text-align:center; } ''' - feeds = [ - ('南方周末', 'https://news.google.com/rss/search?q=when:170h+allinurl:https%3A%2F%2Fwww.infzm.com&hl=zh-HK&gl=HK&ceid=HK:zh') - ] + + def parse_index(self): + index = 'https://www.infzm.com/' + sections = [ + 'contents' + ] + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and 
x.startswith('/' + sec + '/')}): + url = absurl(a['href'].split('?')[0]) + if url in {index + sec + '/', index + sec}: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds def populate_article_metadata(self, article, soup, first): - article.title = article.title.replace(' - 南方周末', '') - article.url = soup.find('h1')['title'] - article.summary = self.tag_to_string(soup.find(attrs={'class':'intro'})) - article.text_summary = self.tag_to_string(soup.find(attrs={'class':'intro'})) + if soup.find(attrs={'class':'intro'}): + article.summary = article.text_summary = self.tag_to_string(soup.find(attrs={'class':'intro'})) diff --git a/recipes/inc42.recipe b/recipes/inc42.recipe index de1c921d6c89..254cd6b1ec76 100644 --- a/recipes/inc42.recipe +++ b/recipes/inc42.recipe @@ -1,4 +1,4 @@ -from calibre.ptempfile import PersistentTemporaryFile +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes @@ -21,45 +21,33 @@ class inc42(BasicNewsRecipe): dict(name='button'), classes('also-read slick-list slides-three common-card'), ] - - ignore_duplicate_articles = {'title'} - remove_empty_feeds = True - articles_are_obfuscated = True + ignore_duplicate_articles = {'title', 'url'} + remove_empty_feeds = True - def get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/media/' + def parse_index(self): + index = 'https://inc42.com/' + sections = [ + 'features', 'buzz', 'startups', 'resources' ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article ', link['href']) - self.abort_article('skipping video links') - - self.log('Downloading ', link['href']) - html = 
br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - pt.close() - return pt.name + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(index + sec + '/')}): + url = a['href'] + if url == index + sec + '/': + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds - feeds = [] - sections = [ - 'features', 'buzz', 'startups', 'resources' - ] - - for sec in sections: - a = 'https://news.google.com/rss/search?q=when:27h+allinurl:inc42.com{}&hl=en-IN&gl=IN&ceid=IN:en' - feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F'))) - feeds.append(('Others', a.format(''))) - def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'data-src':True}): img['src'] = img['data-src'] diff --git a/recipes/irish_times_free.recipe b/recipes/irish_times_free.recipe index dde5ee3bbf2f..3f75028374a4 100644 --- a/recipes/irish_times_free.recipe +++ b/recipes/irish_times_free.recipe @@ -1,7 +1,12 @@ -from calibre.ptempfile import PersistentTemporaryFile +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes +from datetime import date +def absurl(url): + if url.startswith('/'): + return 'https://www.irishtimes.com' + url + class IrishTimes(BasicNewsRecipe): title = 'The Irish Times (free)' __author__ = 'unkn0wn' @@ -24,9 +29,8 @@ class IrishTimes(BasicNewsRecipe): ] remove_attributes = ['width', 'height'] - ignore_duplicate_articles = {'title'} + ignore_duplicate_articles = {'title', 'url'} resolve_internal_links = True - articles_are_obfuscated = True def get_cover_url(self): from datetime import date @@ -45,28 +49,6 @@ class IrishTimes(BasicNewsRecipe): cover = None return cover - def get_obfuscated_article(self, url): - 
br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/media/', '/podcast' - ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article', link['href']) - self.abort_article('skipping video links') - - self.log('Found', link['href']) - html = br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - pt.close() - return pt.name - def __init__(self, *args, **kwargs): BasicNewsRecipe.__init__(self, *args, **kwargs) if self.output_profile.short_name.startswith('kindle'): @@ -78,13 +60,28 @@ class IrishTimes(BasicNewsRecipe): feeds = [] - sections = [ - 'ireland', 'world', 'opinion', 'politics', 'crime-law', 'culture', 'business', - 'life-style', 'health', 'sport', 'property', 'food', 'abroad', 'environment', - 'obituaries' - ] - - for sec in sections: - a = 'https://news.google.com/rss/search?q=when:27h+allinurl:irishtimes.com{}&hl=en-IE&gl=IE&ceid=IE:en' - feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F'))) - feeds.append(('Others', a.format(''))) + def parse_index(self): + index = 'https://www.irishtimes.com/' + sections = [ + 'ireland', 'world', 'opinion', 'politics', 'crime-law', 'culture', 'business', + 'life-style', 'health', 'sport', 'property', 'food', 'abroad', 'environment', + 'obituaries' + ] + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith('/' + sec + '/')}): + url = absurl(a['href'].split('?')[0]) + if url in {index + sec + '/', index + sec}: + continue + if date.today().strftime('%Y') not in url: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + 
feeds.append((section, articles)) + return feeds diff --git a/recipes/live_law.recipe b/recipes/live_law.recipe index 747a5a1759af..755bde33255a 100644 --- a/recipes/live_law.recipe +++ b/recipes/live_law.recipe @@ -1,7 +1,9 @@ -from urllib.parse import quote - +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes +def absurl(url): + if url.startswith('/'): + return 'https://www.livelaw.in' + url class livelaw(BasicNewsRecipe): title = 'Live Law' @@ -20,29 +22,12 @@ class livelaw(BasicNewsRecipe): remove_javascript = True ignore_duplicate_articles = {'title', 'url'} simultaneous_downloads = 1 - art_url = '' extra_css = ''' .news_detail_person_detail {font-size:small; color:#202020;} .news-description { color:#202020; font-style:italic; } ''' - articles_are_obfuscated = True - - def get_obfuscated_article(self, url): - br = self.get_browser() - soup = self.index_to_soup(url) - link = soup.a['href'] - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/multimedia/', - ] - if any(x in link for x in skip_sections): - self.abort_article('skipping video links ', link) - self.log('Found ', link) - self.art_url = link - html = br.open(link).read() - return ({ 'data': html, 'url': link }) - keep_only_tags = [ dict(name='div', attrs={'id':'page-content-wrapper'}) ] @@ -65,22 +50,27 @@ class livelaw(BasicNewsRecipe): h2.name = 'p' return soup - feeds = [] - - when = '27' # hours - index = 'https://www.livelaw.in/' - - sections = [ - 'top-stories', 'supreme-court', 'high-court', 'news-updates', 'consumer-cases', 'articles', - 'lawschool', 'law-firms', 'round-ups' - ] - - a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=en-IN&gl=IN&ceid=IN:en' - - for sec in sections: - feeds.append((sec.capitalize(), a.format(when, quote(index + sec, safe='')))) - feeds.append(('Others' , a.format(when, quote(index, safe='')))) - - def populate_article_metadata(self, article, soup, first): - article.url = self.art_url - 
article.title = article.title.replace(' - Live Law - Indian Legal News', '') + def parse_index(self): + index = 'https://www.livelaw.in/' + sections = [ + 'top-stories', 'supreme-court', 'high-court', 'news-updates', 'consumer-cases', 'articles', + 'lawschool', 'law-firms', 'round-ups' + ] + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith('/' + sec + '/')}): + url = absurl(a['href'].split('?')[0]) + if url in {index + sec + '/', index + sec}: + continue + if not url[-1].isdigit(): + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds diff --git a/recipes/mainichi_en.recipe b/recipes/mainichi_en.recipe index e22e310892ef..6be24893e4cc 100644 --- a/recipes/mainichi_en.recipe +++ b/recipes/mainichi_en.recipe @@ -1,8 +1,8 @@ +#!/usr/bin/env python """ www.mainichi.jp/english """ -from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe @@ -23,24 +23,26 @@ class MainichiEnglishNews(BasicNewsRecipe): remove_javascript = True auto_cleanup = True - ignore_duplicate_articles = {'title'} - - articles_are_obfuscated = True - - def get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - html = br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - pt.close() - return pt.name - - feeds = [ - ('Articles', 'https://news.google.com/rss/search?q=when:48h+allinurl:mainichi.jp%2Fenglish%2Farticles%2F&hl=en-US&gl=US&ceid=US:en') - ] + ignore_duplicate_articles = {'title', 'url'} + + def parse_index(self): + index = 'https://mainichi.jp/english/' + sections = [ 
+ 'articles' + ] + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(index + sec + '/')}): + url = a['href'] + if url in {index + sec + '/', index + sec}: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds diff --git a/recipes/moneycontrol.recipe b/recipes/moneycontrol.recipe index cf0b65e65ee9..fa5ca524e4f1 100644 --- a/recipes/moneycontrol.recipe +++ b/recipes/moneycontrol.recipe @@ -1,5 +1,4 @@ -from urllib.parse import quote - +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes @@ -26,21 +25,6 @@ class MoneyControlRecipe(BasicNewsRecipe): .article_desc { font-style:italic; color:#202020; } ''' - articles_are_obfuscated = True - - def get_obfuscated_article(self, url): - br = self.get_browser() - soup = self.index_to_soup(url) - link = soup.a['href'] - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/multimedia/', - ] - if any(x in link for x in skip_sections): - self.abort_article('skipping video links ', link) - self.log('Found ', link) - html = br.open(link).read() - return ({ 'data': html, 'url': link }) - keep_only_tags = [ dict(name='div', attrs={'id':lambda x: x and x.startswith('article-')}) ] @@ -65,41 +49,52 @@ class MoneyControlRecipe(BasicNewsRecipe): img['src'] = img['data-src'] return soup - feeds = [] - - when = oldest_article*24 - index = 'https://www.moneycontrol.com/' + def parse_index(self): + index = 'https://www.moneycontrol.com/' - business_sections = [ - 'markets', 'stocks', 'ipo', 'budget', 'banks', 'moneycontrol-research', 'economy', 'earnings', 'real-estate', - 'personal-finance', 'commodities', 'trade', 'companies' - ] - - a = 
'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=en-IN&gl=IN&ceid=IN:en' - - for sec in business_sections: - allinurl_a = index + 'news/business' - feeds.append((sec.capitalize(), a.format(when, quote(allinurl_a + sec, safe='')))) - feeds.append(('Business' , a.format(when, quote(allinurl_a, safe='')))) + business_sections = [ + 'markets', 'stocks', 'ipo', 'budget', 'banks', 'moneycontrol-research', 'economy', 'earnings', 'real-estate', + 'personal-finance', 'commodities', 'trade', 'companies' + ] - news_sections = [ - 'india', 'world', 'opinion', 'politics', 'technology', 'trends', 'lifestyle' - ] + news_sections = [ + 'india', 'world', 'opinion', 'politics', 'technology', 'trends', 'lifestyle' + ] - for sec in news_sections: - allinurl_b = index + 'news' - feeds.append((sec.capitalize(), a.format(when, quote(allinurl_b + sec, safe='')))) - feeds.append(('News', a.format(when, quote(allinurl_b, safe=''), ''))) - feeds.append( - ('Others', 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=en-IN&gl=IN&ceid=IN:en'.format(when, quote(index, safe=''))) - ) + feeds = [] + soup = self.index_to_soup(index) + for b_sec in business_sections: + burl = index + 'news/business/' + section = b_sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(burl + b_sec + '/')}): + url = a['href'] + if url in {burl + b_sec + '/', burl + b_sec}: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + for n_sec in news_sections: + nurl = index + 'news/' + nsection = n_sec.capitalize() + self.log(nsection) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(nurl + n_sec + '/')}): + url = a['href'] + if url in {nurl + n_sec + '/', nurl + n_sec}: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + 
articles.append({'title': title, 'url': url}) + if articles: + feeds.append((nsection, articles)) + return feeds def populate_article_metadata(self, article, soup, first): - div = soup.find('div', attrs={'data-io-article-url':True}) - if div: - article.url = div['data-io-article-url'] desc = soup.find(**classes('article_desc')) if desc: article.summary = self.tag_to_string(desc) article.text_summary = article.summary - article.title = article.title.replace(' - Moneycontrol', '') diff --git a/recipes/newsminute.recipe b/recipes/newsminute.recipe index a4dd420d698b..d3c1d39449cd 100644 --- a/recipes/newsminute.recipe +++ b/recipes/newsminute.recipe @@ -1,4 +1,4 @@ -from calibre.ptempfile import PersistentTemporaryFile +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes @@ -20,29 +20,6 @@ class newsminute(BasicNewsRecipe): resolve_internal_links = True remove_empty_feeds = True remove_attributes = ['style', 'height', 'width'] - articles_are_obfuscated = True - - def get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/media/', 'podcast-' - ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article ', link['href']) - self.abort_article('skipping video links') - - self.log('Downloading ', link['href']) - html = br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - pt.close() - return pt.name keep_only_tags = [ classes( @@ -50,21 +27,29 @@ class newsminute(BasicNewsRecipe): ), ] - feeds = [] - - sections = [ - 'tamil-nadu', 'telangana', 'andhra-pradesh', 'karnataka', 'kerala' - ] - - for sec in sections: - a = 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.thenewsminute.com{}&hl=en-IN&gl=IN&ceid=IN:en' - 
feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F'))) - feeds.append(('Others', a.format(''))) - - def populate_article_metadata(self, article, soup, first): - article.title = article.title.replace(' - The News Minute', '') - def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'data-src':True}): img['src'] = img['data-src'] return soup + + def parse_index(self): + index = 'https://www.thenewsminute.com/' + sections = [ + 'tamil-nadu', 'telangana', 'andhra-pradesh', 'karnataka', 'kerala' + ] + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(index + sec + '/')}): + url = a['href'] + if url in {index + sec + '/', index + sec}: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds diff --git a/recipes/scroll.recipe b/recipes/scroll.recipe index 511319a96beb..ee012d32f8b8 100644 --- a/recipes/scroll.recipe +++ b/recipes/scroll.recipe @@ -1,4 +1,4 @@ -from calibre.ptempfile import PersistentTemporaryFile +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes @@ -19,30 +19,6 @@ class scroll(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} remove_attributes = ['style', 'height', 'width'] - articles_are_obfuscated = True - - def get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/announcements/' - ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article ', link['href']) - self.abort_article('skipping video links') - - self.log('Downloading ', link['href']) 
- html = br.open(link['href']).read() - pt = PersistentTemporaryFile('.html') - pt.write(html) - pt.close() - return pt.name - extra_css = ''' .orange-tag, .article-meta-container { font-size:small; } .featured-image, .cms-block-image { text-align:center; font-size:small; } @@ -55,10 +31,28 @@ class scroll(BasicNewsRecipe): remove_tags = [classes('comments-entry-point-meta')] - feeds = [('Articles', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fscroll.in&hl=en-IN&gl=IN&ceid=IN:en')] + def parse_index(self): + index = 'https://scroll.in/' + sections = [ + 'article', 'magazine' + ] + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(index + sec + '/')}): + url = a['href'].split('?')[0] + if url in {index + sec + '/', index + sec}: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds def populate_article_metadata(self, article, soup, first): - # article.url = '' - article.summary = self.tag_to_string(soup.find('h2')) - article.text_summary = self.tag_to_string(soup.find('h2')) - article.title = article.title.replace(' - Scroll.in', '') + if soup.find('h2'): + article.summary = article.text_summary = self.tag_to_string(soup.find('h2')) diff --git a/recipes/singtaohk.recipe b/recipes/singtaohk.recipe index cd1d0df2f273..9372f3a9b1f6 100644 --- a/recipes/singtaohk.recipe +++ b/recipes/singtaohk.recipe @@ -1,3 +1,4 @@ +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes @@ -11,7 +12,7 @@ class STHKRecipe(BasicNewsRecipe): masthead_url = 'https://std.stheadline.com/dist/images/logo-v2@2x.png' no_stylesheets = True remove_javascript = True - ignore_duplicate_articles = {'title'} + ignore_duplicate_articles = {'title', 'url'} 
remove_empty_feeds = True use_embedded_content = False remove_attributes = ['style', 'height', 'width'] @@ -31,34 +32,26 @@ class STHKRecipe(BasicNewsRecipe): classes('in-article-banner stick-box-gray article-pagination comments') ] - articles_are_obfuscated = True - - def get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True)['href'] - skip_sections = [ # add sections you want to skip - '/video/', '/videos/', '/media/', 'podcast' + def parse_index(self): + index = 'https://std.stheadline.com/' + sections = [ + 'daily', 'realtime', 'supplement' ] - if any(x in link for x in skip_sections): - self.log('Aborting Article ', link) - self.abort_article('skipping video links') - html = br.open(link).read() - return ({ 'data': html, 'url': link }) - - feeds = [ - ('日報', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Fdaily%2F&hl=zh-HK&gl=HK&ceid=HK:zh'), - ('即時', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Frealtime%2F&hl=zh-HK&gl=HK&ceid=HK:zh'), - ('副刊', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com%2Fsupplement%2F&hl=zh-HK&gl=HK&ceid=HK:zh'), - ('其他的 新聞', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fstd.stheadline.com&hl=zh-HK&gl=HK&ceid=HK:zh') - ] - - def populate_article_metadata(self, article, soup, first): - article.title = article.title.replace(' - 星島頭條', '') - - def preprocess_raw_html(self, raw, *a): - return raw.replace('

', '') + feeds = [] + soup = self.index_to_soup(index) + for sec in sections: + section = sec.capitalize() + self.log(section) + articles = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(index + sec + '/')}): + url = a['href'] + if url in {index + sec + '/', index + sec}: + continue + if '/article/' not in url: + continue + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds diff --git a/recipes/usatoday.recipe b/recipes/usatoday.recipe index 3a1bb1efe5b8..ccea63c5b015 100644 --- a/recipes/usatoday.recipe +++ b/recipes/usatoday.recipe @@ -6,7 +6,6 @@ __copyright__ = '2008, Kovid Goyal ' usatoday.com ''' -from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe @@ -15,6 +14,9 @@ def classes(classes): return dict(attrs={ 'class': lambda x: x and frozenset(x.split()).intersection(q)}) +def absurl(url): + if url.startswith('/'): + return 'https://www.usatoday.com' + url class USAToday(BasicNewsRecipe): @@ -60,44 +62,32 @@ class USAToday(BasicNewsRecipe): } ''' - ignore_duplicate_articles = {'title'} + ignore_duplicate_articles = {'title', 'url'} resolve_internal_links = True remove_empty_feeds = True - articles_are_obfuscated = True - - def get_obfuscated_article(self, url): - br = self.get_browser() - try: - br.open(url) - except Exception as e: - url = e.hdrs.get('location') - soup = self.index_to_soup(url) - link = soup.find('a', href=True) - skip_sections =[ # add sections you want to skip - '/video/', '/videos/', '/media/', 'podcast-' + def parse_index(self): + index = 'https://www.usatoday.com/' + sections = [ + 'news', 'opinion', 'tech', 'entertainment', 'money', 'sports', 'travel', 'life', 'investigations', ] - if any(x in link['href'] for x in skip_sections): - self.log('Aborting Article ', link['href']) - self.abort_article('skipping video links') - - 
self.log('Downloading ', link['href'])
-        html = br.open(link['href']).read()
-        pt = PersistentTemporaryFile('.html')
-        pt.write(html)
-        pt.close()
-        return pt.name
-
-    feeds = []
-
-    sections = [
-        'news', 'nation', 'politics', 'opinion', 'tech', 'entertainment', 'money', 'sports', 'travel', 'life', 'investigations',
-    ]
-
-    for sec in sections:
-        a = 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.usatoday.com%2Fstory{}&hl=en-US&gl=US&ceid=US:en'
-        feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
-        # feeds.append(('Others', a.format('')))
+        feeds = []
+        soup = self.index_to_soup(index)
+        for sec in sections:
+            section = sec.capitalize()
+            self.log(section)
+            articles = []
+            for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith('/story/' + sec + '/')}):
+                url = absurl(a['href'].split('?')[0])
+                if url in {index + 'story/' + sec + '/', index + 'story/' + sec}:
+                    continue
+                title = self.tag_to_string(a)
+                self.log('\t', title, '\n\t\t', url)
+                articles.append({'title': title, 'url': url})
+            if articles:
+                feeds.append((section, articles))
+        return feeds
+
     def preprocess_html(self, soup):
         for img in soup.findAll('img', src=True):