Skip to content

Commit

Permalink
Browse files: browse the repository at this point in the history
  • Loading branch information
kovidgoyal committed Sep 30, 2024
2 parents 5b17a24 + 812cf96 commit 56eab0e
Show file tree
Hide file tree
Showing 8 changed files with 31 additions and 57 deletions.
32 changes: 11 additions & 21 deletions recipes/nytfeeds.recipe
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,12 @@ def parse_cnt(cnt):
yield ''.join(parse_fmt_type(cnt))
else:
for cnt_ in cnt[k]:
yield from parse_types(cnt_)
yield ''.join(parse_types(cnt_))
if isinstance(cnt[k], dict):
yield from parse_types(cnt[k])
if cnt.get('text') and 'formats' not in cnt:
yield cnt['text']
yield ''.join(parse_types(cnt[k]))
if cnt.get('text') and 'formats' not in cnt and 'content' not in cnt:
if isinstance(cnt['text'], str):
yield cnt['text']

def parse_types(x):
typename = x.get('__typename', '')
Expand Down Expand Up @@ -141,9 +142,6 @@ def parse_types(x):
elif typename == 'RuleBlock':
yield '<hr/>'

elif typename in {'ImageBlock', 'VideoBlock', 'InteractiveBlock'}:
yield "".join(parse_types(x['media']))

elif typename == 'Image':
yield "".join(parse_image(x))

Expand All @@ -161,23 +159,15 @@ def parse_types(x):
elif typename == 'ListItemBlock':
yield f'<li>{"".join(parse_cnt(x))}</li>'

elif typename == 'CapsuleBlock':
if x['capsuleContent'].get('body'):
yield "".join(parse_cnt(x['capsuleContent']['body']))
elif typename == 'Capsule':
yield "".join(parse_cnt(x['body']))

elif typename in {
'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock',
'SummaryBlock', 'VisualStackBlock'
}:
elif typename == 'TextInline':
yield "".join(parse_cnt(x))

elif typename in {'DetailBlock', 'TextRunKV'}:
yield f'<p><i>{"".join(parse_cnt(x))}</i></p>'

elif typename and typename not in {'RelatedLinksBlock', 'Dropzone'}:
if x.get('media'):
yield "".join(parse_types(x['media']))
elif "".join(parse_cnt(x)).strip():
yield f'<p><i>{"".join(parse_cnt(x))}</i></p>'
if "".join(parse_cnt(x)).strip():
yield "".join(parse_cnt(x))

def article_parse(data):
yield "<html><body>"
Expand Down
3 changes: 1 addition & 2 deletions recipes/science_advances.recipe
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ class scienceadv(BasicNewsRecipe):

feeds = []

div = soup.find('div', attrs={'class':'toc__body'})
for sec in div.findAll('section', **classes('toc__section')):
for sec in soup.findAll('section', **classes('toc__section')):
name = sec.find(**classes('sidebar-article-title--decorated'))
section = self.tag_to_string(name).strip()
self.log(section)
Expand Down
3 changes: 1 addition & 2 deletions recipes/science_journal.recipe
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,7 @@ class science(BasicNewsRecipe):

feeds = []

div = soup.find('div', attrs={'class':'toc__body'})
for sec in div.findAll('section', **classes('toc__section')):
for sec in soup.findAll('section', **classes('toc__section')):
name = sec.find(**classes('sidebar-article-title--decorated'))
section = self.tag_to_string(name).strip()
self.log(section)
Expand Down
3 changes: 1 addition & 2 deletions recipes/sciimmunol.recipe
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ class scienceadv(BasicNewsRecipe):

feeds = []

div = soup.find('div', attrs={'class':'toc__body'})
for sec in div.findAll('section', **classes('toc__section')):
for sec in soup.findAll('section', **classes('toc__section')):
name = sec.find(**classes('sidebar-article-title--decorated'))
section = self.tag_to_string(name).strip()
self.log(section)
Expand Down
3 changes: 1 addition & 2 deletions recipes/scirobotics.recipe
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ class scienceadv(BasicNewsRecipe):

feeds = []

div = soup.find('div', attrs={'class':'toc__body'})
for sec in div.findAll('section', **classes('toc__section')):
for sec in soup.findAll('section', **classes('toc__section')):
name = sec.find(**classes('sidebar-article-title--decorated'))
section = self.tag_to_string(name).strip()
self.log(section)
Expand Down
3 changes: 1 addition & 2 deletions recipes/scisignaling.recipe
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,7 @@ class scienceadv(BasicNewsRecipe):

feeds = []

div = soup.find('div', attrs={'class':'toc__body'})
for sec in div.findAll('section', **classes('toc__section')):
for sec in soup.findAll('section', **classes('toc__section')):
name = sec.find(**classes('sidebar-article-title--decorated'))
section = self.tag_to_string(name).strip()
self.log(section)
Expand Down
3 changes: 1 addition & 2 deletions recipes/scistm.recipe
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ class scienceadv(BasicNewsRecipe):

feeds = []

div = soup.find('div', attrs={'class':'toc__body'})
for sec in div.findAll('section', **classes('toc__section')):
for sec in soup.findAll('section', **classes('toc__section')):
name = sec.find(**classes('sidebar-article-title--decorated'))
section = self.tag_to_string(name).strip()
self.log(section)
Expand Down
38 changes: 14 additions & 24 deletions src/calibre/web/site_parsers/nytimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from calibre.utils.iso8601 import parse_iso8601

module_version = 7 # needed for live updates
module_version = 8 # needed for live updates
pprint


Expand Down Expand Up @@ -111,11 +111,12 @@ def parse_cnt(cnt):
yield ''.join(parse_fmt_type(cnt))
else:
for cnt_ in cnt[k]:
yield from parse_types(cnt_)
yield ''.join(parse_types(cnt_))
if isinstance(cnt[k], dict):
yield from parse_types(cnt[k])
if cnt.get('text') and 'formats' not in cnt:
yield cnt['text']
yield ''.join(parse_types(cnt[k]))
if cnt.get('text') and 'formats' not in cnt and 'content' not in cnt:
if isinstance(cnt['text'], str):
yield cnt['text']

def parse_types(x):
typename = x.get('__typename', '')
Expand Down Expand Up @@ -143,9 +144,6 @@ def parse_types(x):
elif typename == 'RuleBlock':
yield '<hr/>'

elif typename in {'ImageBlock', 'VideoBlock', 'InteractiveBlock'}:
yield "".join(parse_types(x['media']))

elif typename == 'Image':
yield "".join(parse_image(x))

Expand All @@ -161,25 +159,17 @@ def parse_types(x):
elif typename == 'ListBlock':
yield f'<ul>{"".join(parse_cnt(x))}</ul>'
elif typename == 'ListItemBlock':
yield f'<li>{"".join(parse_cnt(x))}</li>'

elif typename == 'CapsuleBlock':
if x['capsuleContent'].get('body'):
yield "".join(parse_cnt(x['capsuleContent']['body']))
elif typename == 'Capsule':
yield "".join(parse_cnt(x['body']))

elif typename in {
'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock',
'SummaryBlock', 'VisualStackBlock'
}:
yield f'\n<li>{"".join(parse_cnt(x))}</li>'

elif typename == 'TextInline':
yield "".join(parse_cnt(x))

elif typename in {'DetailBlock', 'TextRunKV'}:
yield f'<p><i>{"".join(parse_cnt(x))}</i></p>'

elif typename and typename not in {'RelatedLinksBlock', 'Dropzone'}:
if x.get('media'):
yield "".join(parse_types(x['media']))
elif "".join(parse_cnt(x)).strip():
yield f'<p><i>{"".join(parse_cnt(x))}</i></p>'
if "".join(parse_cnt(x)).strip():
yield "".join(parse_cnt(x))

def article_parse(data):
yield "<html><body>"
Expand Down

0 comments on commit 56eab0e

Please sign in to comment.