-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspiegel_async.py
44 lines (34 loc) · 1.45 KB
/
spiegel_async.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# -*- coding: utf-8 -*-
"""
Get all Spiegel articles of the last seven days; using asyncio
"""
from bs4 import BeautifulSoup
import asyncio
import aiohttp
import time
from spiegel_base import ArticleBase
class Article(ArticleBase):
    """A Spiegel article that can fetch its own body asynchronously."""

    async def get_content(self, session):
        """Download this article's HTML via the shared *session*.

        Parameters
        ----------
        session : aiohttp.ClientSession
            Shared HTTP session used for the GET request.

        On HTTP or connection failure the error is logged and the method
        returns early, so the success message below is only printed for
        articles that were actually retrieved.
        """
        print(f'Getting content for {self.date}, {self.title}')
        try:
            async with session.get(self.url) as resp:
                resp.raise_for_status()
                content = await resp.text()
        except (aiohttp.ClientResponseError, aiohttp.ClientConnectionError) as e:
            print(f'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!GET on {self.url} failed: {e}')
            # BUG FIX: without this return the success message below was
            # printed even for failed downloads.
            return
        # soup = BeautifulSoup(content, 'html.parser')
        # paragraphs = soup('p')
        # don't actually store the content (we don't want to mess up the memory footprint
        # self.content = '\n'.join((p.text for p in paragraphs))
        print(f' Got content for {self.date}, {self.title}')
async def get_spiegel_news():
    """Fetch the bodies of all Spiegel articles from the last seven days.

    The article metadata is collected synchronously; the (slow) content
    downloads then run concurrently over a single shared HTTP session,
    and the total wall-clock time is reported.
    """
    articles = Article.get_articles()
    # Metadata is already known here — only the article bodies remain.
    async with aiohttp.ClientSession() as session:
        started = time.perf_counter()
        await asyncio.gather(*(article.get_content(session) for article in articles))
        elapsed = time.perf_counter() - started
        diff = elapsed
        print(f'Got {len(articles)} articles in {diff:.03f} seconds')
# Script entry point: start a fresh event loop and run the full fetch.
if __name__ == '__main__':
    asyncio.run(get_spiegel_news())