-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspiegel_concurrent.py
58 lines (46 loc) · 2.03 KB
/
spiegel_concurrent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
"""
Get all Spiegel articles of the last seven days; concurrent (thread or process pool)
"""
import requests
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import time
from spiegel_base import ArticleBase
# Upper bound on the number of workers in the executor pool; this value is
# passed as ``max_workers`` when the pool is created.
# Experiment with it to see how the pool size changes the measured run time.
MAX_WORKERS = 20
class Article(ArticleBase):
    """Article whose page is downloaded over HTTP with ``requests``."""

    # Seconds to wait for the server before giving up. requests applies no
    # timeout unless one is passed explicitly, so a single stalled connection
    # would otherwise block its pool worker indefinitely.
    REQUEST_TIMEOUT = 30

    def get_content(self):
        """Download the page at ``self.url`` and discard the body.

        Prints one progress line before the request and one after it.

        Raises:
            requests.HTTPError: if the response carries a 4xx/5xx status.
            requests.Timeout: if the server does not answer within
                ``REQUEST_TIMEOUT`` seconds.
        """
        print(f'Getting content for {self.date}, {self.title}')
        with requests.Session() as session:
            resp = session.get(self.url, timeout=self.REQUEST_TIMEOUT)
            resp.raise_for_status()
            # Decode the body, but deliberately neither parse nor store it:
            # keeping every article's text would inflate the memory footprint.
            _ = resp.text
        print(f' Got content for {self.date}, {self.title}')
def get_spiegel_news(*, use_process_pool_executor=False):
    """Fetch the content of all articles concurrently and print the elapsed time.

    The articles come from ``Article.get_articles()``; ``get_content`` is then
    run for each of them on an executor pool with ``MAX_WORKERS`` workers.

    Args:
        use_process_pool_executor: when true, run the downloads in a
            ``ProcessPoolExecutor``; otherwise (the default) in a
            ``ThreadPoolExecutor``. With a process pool the submitted bound
            methods are pickled, so the article objects must be picklable.

    Raises:
        Exception: whatever ``get_content`` raised in a worker is re-raised
            here once that task's future is inspected.
    """
    articles = Article.get_articles()

    if use_process_pool_executor:
        pool_executor = ProcessPoolExecutor
    else:
        pool_executor = ThreadPoolExecutor

    with pool_executor(max_workers=MAX_WORKERS) as executor:
        start = time.perf_counter()
        # Submit the bound method itself. Writing ``article.get_content()``
        # here would run every download serially in the caller and hand the
        # executor ``None`` instead of a callable.
        futures = [executor.submit(article.get_content) for article in articles]
        # as_completed yields each future as soon as it finishes; result()
        # re-raises a worker's exception instead of silently dropping it.
        for future in as_completed(futures):
            future.result()
        diff = time.perf_counter() - start

    print(f'Got {len(articles)} articles in {diff:.03f} seconds')
# Script entry point: start the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    get_spiegel_news()