Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
kovidgoyal committed Sep 25, 2024
2 parents 8c92e3e + c9b3317 commit 2443850
Show file tree
Hide file tree
Showing 16 changed files with 423 additions and 550 deletions.
69 changes: 31 additions & 38 deletions recipes/afr.recipe
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from calibre.ptempfile import PersistentTemporaryFile
#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date

def absurl(url):
    # Resolve a site-relative href against the AFR origin.
    # Previously a non-relative URL fell through and implicitly returned
    # None; now absolute URLs are passed through unchanged (backward
    # compatible — callers only ever passed '/'-prefixed paths).
    if url.startswith('/'):
        return 'https://www.afr.com' + url
    return url

class afr(BasicNewsRecipe):
title = 'Australian Financial Review'
Expand Down Expand Up @@ -40,50 +44,39 @@ class afr(BasicNewsRecipe):
[data-testid="AuthorNames"], [data-testid="ArticleTimestamp"] {font-size:small;}
'''

ignore_duplicate_articles = {'title'}
ignore_duplicate_articles = {'title', 'url'}
resolve_internal_links = True
remove_empty_feeds = True

articles_are_obfuscated = True

def get_obfuscated_article(self, url):
    # Resolve a Google News RSS item to the real AFR article and cache its
    # HTML in a temp file whose path calibre will fetch instead of `url`.
    br = self.get_browser()
    try:
        br.open(url)
    except Exception as e:
        # The open raises on the redirect; the true target is carried in the
        # Location header of the error response (mechanize-style error with
        # .hdrs — assumes that attribute exists; TODO confirm for other errors).
        url = e.hdrs.get('location')
    soup = self.index_to_soup(url)
    # The interstitial page's first anchor points at the actual article.
    link = soup.find('a', href=True)
    skip_sections =[ # add sections you want to skip
        '/video/', '/videos/', '/media/', 'podcast-'
    ]
    if any(x in link['href'] for x in skip_sections):
        self.log('Aborting Article ', link['href'])
        self.abort_article('skipping video links')

    self.log('Downloading ', link['href'])
    html = br.open(link['href']).read()
    # PersistentTemporaryFile survives close(); calibre cleans it up later.
    pt = PersistentTemporaryFile('.html')
    pt.write(html)
    pt.close()
    # Return the cached file's path for calibre to parse as the article.
    return pt.name

def preprocess_html(self, soup):
    """Promote lazy-loaded image URLs and tag captions for CSS styling."""
    # Copy each 'data-src' into 'src' so calibre actually downloads the image.
    lazy_images = soup.findAll('img', attrs={'data-src': True})
    for image in lazy_images:
        image['src'] = image['data-src']
    # Give every figure caption the id that the recipe's extra_css targets.
    for caption in soup.findAll('figcaption'):
        caption['id'] = 'img-cap'
    return soup

feeds = []

sections = [
'companies', 'market', 'politics', 'policy', 'world', 'wealth', 'street-talk',
'chaticleer', 'rear-window', 'life-and-luxury', 'technology', 'property',
'work-and-careers',
]

for sec in sections:
a = 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.afr.com{}&hl=en-AU&gl=AU&ceid=AU:en'
feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
feeds.append(('Others', a.format('')))
def parse_index(self):
    """Scrape the AFR front page and group this year's articles by section."""
    root = 'https://www.afr.com/'
    section_slugs = [
        'companies', 'market', 'politics', 'policy', 'world', 'wealth', 'street-talk',
        'chaticleer', 'rear-window', 'life-and-luxury', 'technology', 'property',
        'work-and-careers',
    ]
    current_year = date.today().strftime('%Y')
    home = self.index_to_soup(root)
    result = []
    for slug in section_slugs:
        heading = slug.capitalize()
        self.log(heading)
        entries = []
        prefix = '/' + slug + '/'
        for anchor in home.findAll('a', attrs={'href': lambda h, p=prefix: h and h.startswith(p)}):
            link = absurl(anchor['href'].split('?')[0])
            # Skip the section landing page itself.
            if link in {root + slug + '/', root + slug}:
                continue
            # Article URLs embed their date; keep only current-year stories.
            if current_year not in link:
                continue
            headline = self.tag_to_string(anchor)
            self.log('\t', headline, '\n\t\t', link)
            entries.append({'title': headline, 'url': link})
        if entries:
            result.append((heading, entries))
    return result
55 changes: 21 additions & 34 deletions recipes/bar_and_bench.recipe
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from calibre.ptempfile import PersistentTemporaryFile
#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes


Expand Down Expand Up @@ -38,37 +38,24 @@ class bar(BasicNewsRecipe):
resolve_internal_links = True
remove_empty_feeds = True

articles_are_obfuscated = True

def get_obfuscated_article(self, url):
br = self.get_browser()
try:
br.open(url)
except Exception as e:
url = e.hdrs.get('location')
soup = self.index_to_soup(url)
link = soup.find('a', href=True)
skip_sections =[ # add sections you want to skip
'/video/', '/videos/', '/media/', 'podcast-'
def parse_index(self):
index = 'https://www.barandbench.com/'
sections = [
'news', 'columns', 'interviews', 'law-firms', 'apprentice-lawyer', 'legal-jobs'
]
if any(x in link['href'] for x in skip_sections):
self.log('Aborting Article ', link['href'])
self.abort_article('skipping video links')

self.log('Downloading ', link['href'])
html = br.open(link['href']).read()
pt = PersistentTemporaryFile('.html')
pt.write(html)
pt.close()
return pt.name

feeds = []

sections = [
'news', 'columns', 'interviews', 'law-firms', 'apprentice-lawyer', 'legal-jobs'
]

for sec in sections:
a = 'https://news.google.com/rss/search?q=when:27h+allinurl:barandbench.com{}&hl=en-IN&gl=IN&ceid=IN:en'
feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
feeds.append(('Others', a.format('')))
feeds = []
soup = self.index_to_soup(index)
for sec in sections:
section = sec.capitalize()
self.log(section)
articles = []
for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(index + sec + '/')}):
url = a['href'].split('?')[0]
if url in {index + sec + '/', index + sec}:
continue
title = self.tag_to_string(a)
self.log('\t', title, '\n\t\t', url)
articles.append({'title': title, 'url': url})
if articles:
feeds.append((section, articles))
return feeds
69 changes: 28 additions & 41 deletions recipes/deccan_herald.recipe
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe, classes


def absurl(url):
    # Resolve a site-relative href against the Deccan Herald origin.
    # Previously a non-relative URL fell through and implicitly returned
    # None; now absolute URLs pass through unchanged (backward compatible —
    # callers only ever passed '/'-prefixed paths).
    if url.startswith('/'):
        return 'https://www.deccanherald.com' + url
    return url

class herald(BasicNewsRecipe):
title = 'Deccan Herald'
__author__ = 'unkn0wn'
Expand All @@ -11,56 +14,40 @@ class herald(BasicNewsRecipe):
remove_attributes = ['height', 'width', 'style']
ignore_duplicate_articles = {'url', 'title'}
encoding = 'utf-8'

articles_are_obfuscated = True

def get_obfuscated_article(self, url):
    # Resolve a Google News RSS item to the real Deccan Herald article and
    # cache its HTML in a temp file whose path calibre fetches instead of `url`.
    br = self.get_browser()
    try:
        br.open(url)
    except Exception as e:
        # The open raises on the redirect; the true target is carried in the
        # Location header of the error response (mechanize-style error with
        # .hdrs — assumes that attribute exists; TODO confirm for other errors).
        url = e.hdrs.get('location')
    soup = self.index_to_soup(url)
    # The interstitial page's first anchor points at the actual article.
    link = soup.find('a', href=True)
    skip_sections =[ # add sections you want to skip
        '/video/', '/bengaluru-crime/', '/metrolife/',
        '/karnataka-districts/', '/brandspot/', '/entertainment/',
    ]
    if any(x in link['href'] for x in skip_sections):
        self.log('Aborting Article ', link['href'])
        self.abort_article('skipping section')

    self.log('Downloading ', link['href'])
    html = br.open(link['href']).read()
    # PersistentTemporaryFile survives close(); calibre cleans it up later.
    pt = PersistentTemporaryFile('.html')
    pt.write(html)
    pt.close()
    # Return the cached file's path for calibre to parse as the article.
    return pt.name

keep_only_tags = [
classes('article-title article-author__name'),
dict(name='div', attrs={'id':'main-content'})

]

remove_tags = [
classes(
'storyShare social-media-icons in_article_video static_text'
' nl-optin-mobile dk_only article-banner-adver-wrapper wb_holder'
' field-name-field-tags section-full strip--business'
)
]

feeds = [
('Nation', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fnational%2F&hl=en-IN&gl=IN&ceid=IN:en'),
('Karnataka', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fstate%2F&hl=en-IN&gl=IN&ceid=IN:en'),
('Opinion', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fopinion%2F&hl=en-IN&gl=IN&ceid=IN:en'),
('City',
'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fcity%2F&hl=en-IN&gl=IN&ceid=IN:en'),
('Business', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fbusiness%2F&hl=en-IN&gl=IN&ceid=IN:en'),
('World',
'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Finternational%2F&hl=en-IN&gl=IN&ceid=IN:en'),
('Sports',
'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fsports%2F&hl=en-IN&gl=IN&ceid=IN:en'),
('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com&hl=en-IN&gl=IN&ceid=IN:en'),
]

def parse_index(self):
    """Crawl each Deccan Herald section landing page and collect its articles."""
    root = 'https://www.deccanherald.com/'
    section_slugs = [
        'india', 'world', 'elections', 'opinion', 'specials', 'business', 'sports'
    ]
    result = []

    for slug in section_slugs:
        # Unlike single-page recipes, each section gets its own fetch here.
        page = self.index_to_soup(root + slug)
        heading = slug.capitalize()
        self.log(heading)
        entries = []
        prefix = '/' + slug + '/'
        for anchor in page.findAll('a', attrs={'href': lambda h, p=prefix: h and h.startswith(p)}):
            link = absurl(anchor['href'].split('?')[0])
            # Skip the section landing page itself.
            if link in {root + slug + '/', root + slug}:
                continue
            headline = self.tag_to_string(anchor)
            self.log('\t', headline, '\n\t\t', link)
            entries.append({'title': headline, 'url': link})
        if entries:
            result.append((heading, entries))
    return result
37 changes: 5 additions & 32 deletions recipes/firstpost.recipe
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
from calibre.ptempfile import PersistentTemporaryFile
#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes

# Firstpost feeds mix sections into other feeds, like explainers end up in opinion feed and opinions end up in India feed.
# change google_feeds to True to fetch right sections.
google_feeds = False

class firstpost(BasicNewsRecipe):
title = 'Firstpost'
Expand Down Expand Up @@ -43,35 +40,11 @@ class firstpost(BasicNewsRecipe):
'world', 'web-stories', 'tech', 'artandculture', 'health', 'health-supplement',
# 'photos', 'entertainment', 'living', 'education', 'sports', 'firstcricket',
]
if not google_feeds:
oldest_article = 1.2 # days
for sec in sections:
a = 'https://www.firstpost.com/rss/{}.xml'
feeds.append((sec.capitalize(), a.format(sec)))
else:
articles_are_obfuscated = True

def get_obfuscated_article(self, url):
br = self.get_browser()
soup = self.index_to_soup(url)
link = soup.find('a', href=True)
skip_sections =[ # add sections you want to skip
'/video/', '/videos/', '/media/', '/vantage/'
]
if any(x in link['href'] for x in skip_sections):
self.log('Aborting Article ', link['href'])
self.abort_article('skipping video links')
self.log('Downloading ', link['href'])
html = br.open(link['href']).read()
pt = PersistentTemporaryFile('.html')
pt.write(html)
pt.close()
return pt.name

for sec in sections:
a = 'https://news.google.com/rss/search?q=when:27h+allinurl:firstpost.com{}&hl=en-IN&gl=IN&ceid=IN:en'
feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
# feeds.append(('Others', a.format('')))
oldest_article = 1.2 # days
for sec in sections:
a = 'https://www.firstpost.com/rss/{}.xml'
feeds.append((sec.capitalize(), a.format(sec)))

def preprocess_html(self, soup):
if h2 := soup.find('h2', attrs={'class':'category-name'}):
Expand Down
61 changes: 27 additions & 34 deletions recipes/hamilton_spectator.recipe
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from calibre.ptempfile import PersistentTemporaryFile
#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes

'''
Hamilton Spectator Calibre Recipe
'''

def absurl(url):
    # Resolve a site-relative href against the Hamilton Spectator origin.
    # Previously a non-relative URL fell through and implicitly returned
    # None; now absolute URLs pass through unchanged (backward compatible —
    # callers only ever passed '/'-prefixed paths).
    if url.startswith('/'):
        return 'https://www.thespec.com' + url
    return url

class HamiltonSpectator(BasicNewsRecipe):
title = u'Hamilton Spectator'
Expand All @@ -21,8 +24,7 @@ class HamiltonSpectator(BasicNewsRecipe):
remove_attributes = ['style', 'height', 'width']
masthead_url = 'https://bloximages.chicago2.vip.townnews.com/thespec.com/content/tncms/custom/image/c0094646-1108-11ee-8af0-b3954ce40e5e.png'

ignore_duplicate_articles = {'title'}
articles_are_obfuscated = True
ignore_duplicate_articles = {'title', 'url'}

extra_css = '''
.caption { font-size:small; text-align:center; }
Expand Down Expand Up @@ -52,35 +54,26 @@ class HamiltonSpectator(BasicNewsRecipe):
img['src'] = x.split()[0]
return soup

def get_obfuscated_article(self, url):
br = self.get_browser()
try:
br.open(url)
except Exception as e:
url = e.hdrs.get('location')
soup = self.index_to_soup(url)
link = soup.find('a', href=True)
skip_sections =[ # add sections you want to skip
'/video/', '/videos/', '/media/', 'podcast'
def parse_index(self):
index = 'https://www.thespec.com/'
sections = [
'news', 'politics', 'opinion', 'business', 'sports', 'life', 'entertainment'
]
if any(x in link['href'] for x in skip_sections):
self.log('Aborting Article ', link['href'])
self.abort_article('skipping video links')

self.log('Downloading ', link['href'])
html = br.open(link['href']).read()
pt = PersistentTemporaryFile('.html')
pt.write(html)
pt.close()
return pt.name

feeds = []

sections = [
'news', 'politics', 'opinion', 'business', 'sports', 'life', 'entertainment'
]

for sec in sections:
a = 'https://news.google.com/rss/search?q=when:27h+allinurl:thespec.com{}&hl=en-CA&gl=IN&ceid=CA:en'
feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
feeds.append(('Others', a.format('')))
feeds = []
soup = self.index_to_soup(index)
for sec in sections:
section = sec.capitalize()
self.log(section)
articles = []
for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith('/' + sec + '/')}):
url = absurl(a['href'].split('#')[0])
if url in {index + sec + '/', index + sec}:
continue
if not url.endswith('.html'):
continue
title = self.tag_to_string(a)
self.log('\t', title, '\n\t\t', url)
articles.append({'title': title, 'url': url})
if articles:
feeds.append((section, articles))
return feeds
Loading

0 comments on commit 2443850

Please sign in to comment.