Skip to content

Commit

Permalink
Update india_today.recipe
Browse files Browse the repository at this point in the history
  • Loading branch information
unkn0w7n committed Sep 28, 2024
1 parent f0e42c3 commit 1ab50ec
Showing 1 changed file with 37 additions and 54 deletions.
91 changes: 37 additions & 54 deletions recipes/india_today.recipe
Original file line number Diff line number Diff line change
@@ -1,23 +1,9 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8

from calibre.ebooks.BeautifulSoup import Tag
import re
import json
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class.

    ``classes`` is a whitespace-separated string of class names.  The
    returned dict has a single ``attrs`` entry whose ``class`` value is a
    predicate: given an element's class attribute string it yields a truthy
    value when that element carries at least one of the requested classes,
    and a falsy value otherwise (including when the attribute is missing or
    empty).
    """
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces in the input never put an empty name into the set.
    wanted = frozenset(classes.split())
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(wanted)})


def new_tag(soup, name, attrs=()):
    """Create a new tag called ``name`` that belongs to ``soup``.

    ``attrs`` may be a mapping or an iterable of ``(key, value)`` pairs.
    When ``soup`` exposes its own ``new_tag`` factory that is used (with the
    attributes converted to a dict); otherwise the tag is built directly
    through the imported ``Tag`` constructor.
    """
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    # Fallback for soup objects without a new_tag() factory; an empty
    # attrs value is passed on as None.
    return Tag(soup, name, attrs=attrs or None)


class IndiaToday(BasicNewsRecipe):
title = u'India Today Magazine'
language = 'en_IN'
Expand All @@ -33,21 +19,13 @@ class IndiaToday(BasicNewsRecipe):
masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png'

extra_css = '''
#sub-d {font-style:italic; color:#202020;}
.story__byline {font-size:small; text-align:left;}
.body_caption, .mos__alt .caption, .caption-drupal-entity {font-size:small; text-align:center;}
blockquote{color:#404040;}
#sub-h {font-style:italic; color:#202020;}
.body_caption, #imgcap, .mos__alt .caption, .caption-drupal-entity, .calibre-nuked-tag-figcaption {font-size:small; text-align:center;}
#author, .authors__container {font-size:small;}
blockquote {color:#404040;}
'''

# Element specs assigned to remove_tags: one matching by CSS class, one by
# tag name, one by id, and one matching <div> elements whose amp-access
# attribute is 'NOT granted'.
remove_tags = [
    classes('checkout__section sharing align-center-button amp-izooto-sub ads__container inline-story-add amp-ad readmore__box'),
    {'name': ('amp-web-push-widget', 'amp-ad')},
    {'attrs': {'id': 'tab-link-wrapper-plugin'}},
    {'name': 'div', 'attrs': {'amp-access': 'NOT granted'}},
]

def preprocess_raw_html(self, raw_html, url):
    """Return ``raw_html`` with every em dash replaced by two hyphens.

    ``url`` is accepted but not used.
    """
    return raw_html.replace('—', '--')
# Spec selecting the element whose id is 'tab-link-wrapper-plugin'.  The key
# has to be the string 'id'; a bare ``id`` is the builtin function, not the
# attribute name.
remove_tags = [dict(attrs={'id': ['tab-link-wrapper-plugin']})]

recipe_specific_options = {
'date': {
Expand Down Expand Up @@ -105,32 +83,37 @@ class IndiaToday(BasicNewsRecipe):
return sorted(sections.items(), key=sort_key)

def preprocess_html(self, soup):
    """Trim the parsed page down to the article parts and tidy its markup.

    The ``<body>`` is replaced by a fresh one holding only the elements
    with the classes ``strytitle``, ``strykicker``, ``story__byline`` or
    ``srtymos``, followed by the article text: the ``<div>`` whose
    ``amp-access`` attribute is ``granted`` when the page has one, else the
    ``<div class="description">``.  Afterwards ``<amp-img>`` elements
    without a nested ``<img>`` are renamed to ``<img>``, the first ``<h2>``
    becomes ``<p id="sub-d">``, and elements with class ``quotes`` become
    ``<blockquote>``.

    Returns the same ``soup`` object, modified in place.  A document
    without a ``<body>`` skips the trimming step instead of failing.
    """
    header_spec = classes('strytitle strykicker story__byline srtymos')
    if soup.find('div', attrs={'amp-access': 'granted'}) is not None:
        content_spec = dict(name='div', attrs={'amp-access': 'granted'})
    else:
        content_spec = dict(name='div', attrs={'class': 'description'})

    old_body = soup.find('body')
    if old_body is not None:
        body = new_tag(soup, 'body')
        for spec in (header_spec, content_spec):
            for tag in old_body.findAll(**spec):
                # Append at the end so the kept elements stay in spec order.
                body.insert(len(body.contents), tag)
        old_body.replaceWith(body)

    for img in soup.findAll('amp-img'):
        if not img.find('img'):
            img.name = 'img'
    h2 = soup.find('h2')
    if h2:
        h2.name = 'p'
        # '#sub-d' is the id the recipe's extra_css styles in italics.
        h2['id'] = 'sub-d'
    for quo in soup.findAll(attrs={'class': 'quotes'}):
        quo.name = 'blockquote'
    return soup

def print_version(self, url):
    """Return ``url`` with ``amp/`` inserted after its first ``.in/``.

    Only the first occurrence is rewritten so that a later ``.in/`` inside
    the path is left untouched.  A URL without ``.in/`` is returned
    unchanged.
    """
    return url.replace('.in/', '.in/amp/', 1)
def preprocess_raw_html(self, raw, *a):
    """Rebuild the article as minimal HTML from the page's embedded JSON.

    The page is expected to contain a tag carrying
    ``id="__NEXT_DATA__" type="application/json"`` immediately followed by
    a JSON document.  The article record is read from
    ``props.pageProps.initialState.server.page_data``; ``title`` and
    ``description`` are required, while ``slug``, ``description_short``,
    ``author`` (a list of objects with a ``title``), ``city``,
    ``datetime_updated``, ``image_main`` and ``image_caption`` are optional
    and skipped when missing or empty.

    Extra positional arguments are accepted and ignored.

    Returns the generated HTML string.  Raises ``ValueError`` when the
    JSON marker is absent (or the JSON is malformed) and ``KeyError`` when
    a required key is missing.
    """
    marker = 'id="__NEXT_DATA__" type="application/json">'
    start = raw.find(marker)
    if start < 0:
        raise ValueError('__NEXT_DATA__ JSON payload not found in page')
    # raw_decode parses the JSON value at the start of the string and
    # ignores the markup after it; it does not skip leading whitespace,
    # hence the lstrip().
    payload = raw[start + len(marker):].lstrip()
    data = json.JSONDecoder().raw_decode(payload)[0]
    data = data['props']['pageProps']['initialState']['server']['page_data']

    title = data['title']
    body = '<div>' + data['description'] + '</div>'

    slug = desc = image = imagecap = ''
    # .get() truthiness treats a key holding null/'' like a missing key.
    if data.get('slug'):
        slug = '<div>' + data['slug'] + '</div>\n'
    if data.get('description_short'):
        desc = '<p id="sub-h">' + data['description_short'] + '</p>\n'
    # Separate several authors with a comma instead of gluing names together.
    author = ', '.join(person['title'] for person in data.get('author') or ())
    city = data.get('city') or ''
    date = data.get('datetime_updated') or ''
    if data.get('image_main'):
        image = '<br/><img src="{}">'.format(data['image_main'])
    if data.get('image_caption'):
        imagecap = '<div id="imgcap">' + data['image_caption'] + '</div>'

    return ''.join((
        '<html><body>', slug, '<h1>', title, '</h1>\n', desc,
        '<div id="author">', author,
        '<span> ', city, ' UPDATED: ', date, '</span></div>\n',
        image, imagecap, body,
        '</body></html>',
    ))

0 comments on commit 1ab50ec

Please sign in to comment.