Merge branch 'master' of https://github.com/unkn0w7n/calibre

kovidgoyal · Sep 17, 2024 · e9cb881 · e9cb881
2 parents c3a4cb2 + 21eca46
commit e9cb881
Showing 1 changed file with 108 additions and 84 deletions.
diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe
@@ -12,17 +12,13 @@ def extract_json(raw):
  return js['initialData']['data']['article']['sprinkledBody']['content']
 
 def parse_image(i):
- if i['__typename'] == 'Image':
- yield '<div>'
- yield '<img src="{}">'.format(i['crops'][0]['renditions'][0]['url'])
- if i.get('caption'):
- yield '<div class="cap">{}'.format(
- i['caption'].get('text', '')
- )
- if i.get('credit'):
- yield '<span class="cred"> ' + i['credit'] + '</span>'
- yield '</div>'
+ yield '<div><img src="{}">'.format(i['crops'][0]['renditions'][0]['url'])
+ if i.get('caption'):
+ yield '<div class="cap">' + ''.join(parse_types(i['caption']))
+ if i.get('credit'):
+ yield '<span class="cred"> ' + i['credit'] + '</span>'
  yield '</div>'
+ yield '</div>'
 
 def parse_img_grid(g):
  for grd in g.get('gridMedia', {}):
@@ -33,92 +29,114 @@ def parse_img_grid(g):
  yield '<span class="cred"> ' + g['credit'] + '</span>'
  yield '</div>'
 
-def parse_cnt(cnt):
- txt = ''
- if cnt['__typename'] == 'TextInline':
- if cnt.get('formats'):
- for fmt in cnt.get('formats', {}):
- if fmt['__typename'] == 'ItalicFormat':
- txt += '<i>'
- if fmt['__typename'] == 'LinkFormat':
- txt += '<a href="{}">'.format(fmt['url'])
- txt += cnt['text']
- elif cnt['__typename'] == 'LineBreakInline':
- txt += '<br/>'
- if '<i>' in txt and '<a href' in txt:
- yield txt + '</a></i>'
- elif '<i>' in txt:
- yield txt + '</i>'
- elif '<a href' in txt:
- yield txt + '</a>'
- else:
- yield txt
-
 def parse_byline(byl):
  for b in byl.get('bylines', {}):
  yield '<div>' + b['renderedRepresentation'] + '</div>'
  for rl in byl.get('role', {}):
- yield '<div><i>' + ''.join(parse_cnt(rl)) + '</i></div>'
+ if ''.join(parse_cnt(rl)).strip():
+ yield '<div><i>' + ''.join(parse_cnt(rl)) + '</i></div>'
 
 def iso_date(x):
  dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
  return dt.strftime('%b %d, %Y at %I:%M %p')
 
-def header_parse(h):
+def parse_header(h):
  if h.get('label'):
- if h['label'].get('content'):
- for cl in h['label']['content']:
- yield '<div class="lbl">' + ''.join(parse_cnt(cl)) + '</div>'
- for ch in h['headline']['content']:
- yield '<h1>' + ''.join(parse_cnt(ch)) + '</h1>'
+ yield '<div class="lbl">' + ''.join(parse_types(h['label'])) + '</div>'
+ if h.get('headline'):
+ yield ''.join(parse_types(h['headline']))
  if h.get('summary'):
- for cs in h['summary']['content']:
- yield '<p class="sub">' + ''.join(parse_cnt(cs)) + '</p>'
+ yield '<p class="sub">' + ''.join(parse_types(h['summary'])) + '</p>'
  if h.get('ledeMedia'):
- if h['ledeMedia'].get('__typename', '') == 'ImageBlock':
- yield ''.join(parse_image(h['ledeMedia']['media']))
+ yield ''.join(parse_types(h['ledeMedia']))
  if h.get('byline'):
- yield '<div class="byl"><br/>'
- yield '\t' + '\t'.join(parse_byline(h['byline']))
- if h.get('timestampBlock'):
- yield '\t<div>' + iso_date(h['timestampBlock']['timestamp']) + '</div>'
- yield '</div>'
+ yield ''.join(parse_types(h['byline']))
+ if h.get('timestampBlock'):
+ yield ''.join(parse_types(h['timestampBlock']))
+
+def parse_fmt_type(fm):
+ for f in fm.get('formats', {}):
+ if f.get('__typename', '') == 'BoldFormat':
+ yield '<strong>'
+ if f.get('__typename', '') == 'ItalicFormat':
+ yield '<em>'
+ if f.get('__typename', '') == 'LinkFormat':
+ hrf = f['url']
+ yield '<a href="{}">'.format(hrf)
+ yield fm['text']
+ for f in reversed(fm.get('formats', {})):
+ if f.get('__typename', '') == 'BoldFormat':
+ yield '</strong>'
+ if f.get('__typename', '') == 'ItalicFormat':
+ yield '</em>'
+ if f.get('__typename', '') == 'LinkFormat':
+ yield '</a>'
+
+def parse_cnt(cnt):
+ if cnt.get('formats'):
+ yield ''.join(parse_fmt_type(cnt))
+ elif cnt.get('content'):
+ for cnt_ in cnt['content']:
+ yield from parse_types(cnt_)
+ elif cnt.get('text'):
+ yield cnt['text']
+
+def parse_types(x):
+ if 'Header' in x.get('__typename', ''):
+ yield '\n'.join(parse_header(x))
+
+ elif x.get('__typename', '') == 'Heading1Block':
+ yield '<h1>' + ''.join(parse_cnt(x)) + '</h1>'
+ elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block', 'Heading4Block'}:
+ yield '<h4>' + ''.join(parse_cnt(x)) + '</h4>'
+
+ elif x.get('__typename', '') == 'ParagraphBlock':
+ yield '<p>' + ''.join(parse_cnt(x)) + '</p>'
+
+ elif x.get('__typename', '') == 'BylineBlock':
+ yield '<div class="byl"><br/>' + ''.join(parse_byline(x)) + '</div>' 
+ elif x.get('__typename', '') == 'LabelBlock':
+ yield '<div class="sc">' + ''.join(parse_cnt(x)) + '</div>'
+ elif x.get('__typename', '') == 'BlockquoteBlock':
+ yield '<blockquote>' + ''.join(parse_cnt(x)) + '</blockquote>'
+ elif x.get('__typename', '') == 'TimestampBlock':
+ yield '<div class="time">' + iso_date(x['timestamp']) + '</div>'
+ elif x.get('__typename', '') == 'LineBreakInline':
+ yield '<br/>'
+ elif x.get('__typename', '') == 'RuleBlock':
+ yield '<hr/>'
+
+ elif x.get('__typename', '') == 'Image':
+ yield ''.join(parse_image(x))
+ elif x.get('__typename', '') == 'ImageBlock':
+ yield ''.join(parse_image(x['media']))
+ elif x.get('__typename', '') == 'GridBlock':
+ yield ''.join(parse_img_grid(x))
+
+ elif x.get('__typename', '') == 'ListBlock':
+ yield '<ul>' + ''.join(parse_cnt(x)) + '</ul>'
+ elif x.get('__typename', '') == 'ListItemBlock':
+ yield '<li>' + ''.join(parse_cnt(x)) + '</li>'
+
+ elif x.get('__typename', '') == 'CapsuleBlock':
+ if x['capsuleContent'].get('body'):
+ yield ''.join(parse_cnt(x['capsuleContent']['body']))
+ elif x.get('__typename', '') == 'Capsule':
+ yield ''.join(parse_cnt(x['body']))
+
+ elif x.get('__typename', '') in {
+ 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', 'SummaryBlock'
+ }:
+ yield ''.join(parse_cnt(x))
+
+ elif x.get('__typename'):
+ if ''.join(parse_cnt(x)).strip():
+ yield '<p><i>' + ''.join(parse_cnt(x)) + '</i></p>'
 
 def article_parse(data):
  yield "<html><body>"
- for x in data:
- if x.get('__typename', '') in {'HeaderBasicBlock', 'HeaderFullBleedVerticalBlock', 'HeaderFullBleedHorizontalBlock'}:
- yield '\n'.join(header_parse(x))
- elif x.get('__typename', '') == 'ParagraphBlock':
- p_txt = ''
- for para in x['content']:
- p_txt += ''.join(parse_cnt(para))
- if p_txt.strip():
- yield '<p>' + p_txt + '</p>'
- elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block'}:
- h4_txt = ''
- for h2 in x['content']:
- h4_txt += ''.join(parse_cnt(h2))
- if h4_txt.strip():
- yield '<h4>' + h4_txt + '</h4>'
- elif x.get('__typename', '') == 'Heading1Block':
- h1_txt = ''
- for h1 in x['content']:
- h1_txt += ''.join(parse_cnt(h1))
- if h1_txt.strip():
- yield '<h1>' + h1_txt + '</h1>'
- elif x.get('__typename', '') == 'BylineBlock':
- yield '<div class="byl">\n<br/>\t' + '\t'.join(parse_byline(x)) + '</div>'
- elif x.get('__typename', '') == 'ImageBlock':
- yield ''.join(parse_image(x['media']))
- elif x.get('__typename', '') == 'GridBlock':
- yield ''.join(parse_img_grid(x))
- elif x.get('content'):
- o_txt = ''
- for i in x['content']:
- o_txt += ''.join(parse_cnt(i))
- if o_txt.strip():
- yield '<p><i>' + o_txt + '</i></p>'
+ for d in data:
+ yield from parse_types(d)
  yield "</body></html>"
 
 
@@ -159,7 +177,7 @@ class nytFeeds(BasicNewsRecipe):
  'default': 'no'
  },
  'res': {
- 'short': 'For hi-res images, select a resolution from the\nfollowing options: popup, jumbo, mobileMasterAt3x, superJumbo',
+ 'short': 'For hi-res images, select a resolution from the following\noptions: popup, jumbo, mobileMasterAt3x, superJumbo',
  'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default(articleLarge), use articleInline.',
  }
  }
@@ -179,10 +197,12 @@ class nytFeeds(BasicNewsRecipe):
  self.compress_news_images = True
 
  extra_css = '''
- .byl { font-size:small; color:#202020; }
+ .byl, .time { font-size:small; color:#202020; }
  .cap { font-size:small; text-align:center; }
  .cred { font-style:italic; font-size:small; }
  .sub { font-style:italic; }
+ em, blockquote { color: #202020; }
+ .sc { font-variant: small-caps; }
  .lbl { font-size:small; color:#404040; }
  img { display:block; margin:0 auto; }
  '''
@@ -216,7 +236,11 @@ class nytFeeds(BasicNewsRecipe):
  def preprocess_html(self, soup):
  w = self.recipe_specific_options.get('res')
  if w and isinstance(w, str):
- res = '-' + w + '.jpg'
+ res = '-' + w
  for img in soup.findAll('img', attrs={'src':True}):
- img['src'] = img['src'].rsplit('-article', 1)[0] + res
+ ext = img['src'].split('?')[0].split('.')[-1]
+ img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
+ for c in soup.findAll('div', attrs={'class':'cap'}):
+ for p in c.findAll(['p', 'div']):
+ p.name = 'span'
  return soup