-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
78 lines (66 loc) · 2.68 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import html
import json

import requests
# Icon URL reused for both the feed's "icon" and "favicon" fields.
POST_LOGO = "https://www.washingtonpost.com/touch-icon-iphone-retina.png"

# Opening tag of the inline <script> element whose body is the page's JSON data.
START_MARKER = '<script id="__NEXT_DATA__" type="application/json">'

# Section pages that are fetched and merged into a single feed.
BOOKS = [
    "https://www.washingtonpost.com/entertainment/books/fiction/",
    "https://www.washingtonpost.com/entertainment/books/nonfiction/",
]
def lambda_handler(event, context):
    """Build the feed and wrap it in an HTTP-style response dict.

    NOTE(review): the name and ``(event, context)`` signature suggest an AWS
    Lambda entry point -- confirm against the deployment config.

    Args:
        event: Invocation payload; not read by this handler.
        context: Invocation context; not read by this handler.

    Returns:
        A dict with ``statusCode`` 200 and ``body`` set to the feed rendered
        as a JSON string (debug dumps disabled).
    """
    feed_body = get_json_feed(False)
    response = {'statusCode': 200}
    response['body'] = feed_body
    return response
def sorter(el):
    """Sort key for feed items: return the item's ``date_published`` value.

    Raises:
        KeyError: if *el* has no ``date_published`` entry.
    """
    published = el["date_published"]
    return published
def _dig(node, *path):
    """Follow *path* (dict keys / list indexes) into *node*; None if any step is missing."""
    for step in path:
        try:
            node = node[step]
        except (KeyError, IndexError, TypeError):
            return None
    return node


def _extract_next_data(page_text):
    """Parse the JSON payload that follows START_MARKER in *page_text*.

    Raises:
        ValueError: if START_MARKER or the closing ``</script>`` tag is absent,
            or if the enclosed text is not valid JSON.
    """
    start = page_text.index(START_MARKER)
    end = page_text.index('</script>', start)
    print("Found JSON " + str(start) + "-" + str(end))
    return json.loads(page_text[start + len(START_MARKER):end])


def _build_feed_item(item):
    """Convert one scraped article record into a JSON Feed item dict.

    ``_id``, ``canonical_url``, ``headlines.basic`` and ``first_publish_date``
    are required (KeyError if missing). Lead image, description and author are
    optional: a record lacking them still yields an item instead of aborting
    the whole feed.
    """
    article_title = item["headlines"]["basic"]
    print(article_title)

    article_image = _dig(item, "additional_properties", "lead_art",
                         "additional_properties", "thumbnailResizeUrl")
    article_author = _dig(item, "credits", "by", 0, "name")
    description = _dig(item, "description", "basic") or ""

    # Scraped values are escaped before being embedded in markup so a stray
    # quote, '<' or '&' cannot break (or inject into) content_html.
    # NOTE(review): assumes description.basic is plain text, not HTML -- confirm.
    body_parts = []
    if article_image:
        body_parts.append(
            "<p><img src='" + html.escape(str(article_image), quote=True) + "'/></p>"
        )
    body_parts.append("<p>" + html.escape(str(description), quote=False) + "</p>")

    feed_article = {
        'id': item["_id"],
        'title': article_title,
    }
    if article_author:
        feed_article['authors'] = [{'name': article_author}]
    feed_article['url'] = item["canonical_url"]
    feed_article['content_html'] = "".join(body_parts)
    feed_article['date_published'] = item["first_publish_date"]
    if article_image:
        feed_article['image'] = article_image
    return feed_article


def get_json_feed(debug):
    """Fetch every page in BOOKS and return one combined JSON Feed 1.1 document.

    Each page's embedded JSON is parsed, its
    ``props.pageProps.globalContent.items`` entries are converted to feed
    items, and the items are emitted newest-first (sorted with ``sorter``).

    Args:
        debug: When truthy, the parsed JSON of page N (1-based, in BOOKS
            order) is also written to ``post_N.json`` in the working directory.

    Returns:
        The feed serialized as a JSON string, indented by two spaces.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: if a page does not contain the expected embedded JSON.
        KeyError: if the JSON lacks the expected structure or a required
            article field.
    """
    feed_items = []
    for page_number, url in enumerate(BOOKS, start=1):
        print("GET " + url)
        # Without a timeout requests can block forever on a stalled connection.
        page = requests.get(url, timeout=30)
        # Fail with the real HTTP error rather than a later "substring not found".
        page.raise_for_status()
        json_data = _extract_next_data(page.text)

        if debug:
            with open("post_" + str(page_number) + ".json", "w") as post_file:
                post_file.write(json.dumps(json_data, indent=2))

        items = json_data["props"]["pageProps"]["globalContent"]["items"]
        print("Post Data Items " + str(len(items)))
        feed_items.extend(_build_feed_item(item) for item in items)

    feed = {
        'version': 'https://jsonfeed.org/version/1.1',
        'title': 'Washington Post Books',
        'home_page_url': "https://www.washingtonpost.com/entertainment/books/",
        'user_comment': 'Generated by https://github.com/prenagha/washingtonpost-books-jsonfeed',
        'icon': POST_LOGO,
        'favicon': POST_LOGO,
        'items': sorted(feed_items, key=sorter, reverse=True)
    }
    return json.dumps(feed, indent=2)
if __name__ == '__main__':
    # Script entry point: build the feed with debug dumps enabled and save the
    # result to feed.json in the current working directory.
    feed_str = get_json_feed(True)
    # The context manager closes the file even if the write raises.
    with open("feed.json", "w") as feed_file:
        feed_file.write(feed_str)