-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
63 lines (46 loc) · 1.72 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from gcs.gcs import GCSDownloader
from dotenv import load_dotenv
import os
import logging
import sys
from scraper.scraper import clean_content, article_to_content, download_and_clean
import argparse
import dataclasses
# Route every log record to stdout at INFO level, prefixed with a timestamp
# and the level name.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
    stream=sys.stdout,
)
log = logging.getLogger()

# load_dotenv() runs before the environment is read so that values it provides
# are visible to the lookups below. Either setting is None when unset.
load_dotenv()
project = os.environ.get('GCLOUD_PROJECT')
bucket_name = os.environ.get('GCS_BUCKET')
def clean_obj(object_name):
    """Download one object from the configured bucket, clean it and print it.

    The object is fetched with ``GCSDownloader`` from the bucket named by the
    module-level ``bucket_name``, passed through ``clean_content`` and
    ``article_to_content``, and the resulting dataclass instance is printed
    as a plain dict. A missing download or a failed clean is logged as an
    error and ends the call early.

    Args:
        object_name: Name of the object inside ``bucket_name``.

    Returns:
        None.
    """
    downloader = GCSDownloader(gcp_project=project)
    res = downloader.download(bucket_name, object_name)
    if res is None:
        log.error("resource not found: %s/%s", bucket_name, object_name)
        return

    article = clean_content(res)
    if article is None:
        log.error("resource not cleaned: %s/%s", bucket_name, object_name)
        # Stop here: there is nothing to convert, and carrying on would hand
        # None to article_to_content().
        return

    content = article_to_content(article)
    print(dataclasses.asdict(content))
    log.info("success: %s/%s", bucket_name, object_name)
def main(argv=None):
    """Command-line entry point: check a single URL or a single GCS object.

    Supported options:
      --url   download and clean the page at this URL, then print its title,
              text and the result of ``is_valid_body()``.
      --obj   clean the named GCS object via ``clean_obj``.
      --file  accepted but not implemented; a warning is logged.

    ``--url`` takes precedence over ``--obj`` when both are given. When
    neither is given the help text is printed so the run is not a silent
    no-op.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, in
            which case argparse reads ``sys.argv[1:]``.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser(description='newspaper checker')
    parser.add_argument('--obj', type=str,
                        help='gcs object id', required=False)
    parser.add_argument('--file', type=str,
                        help='file with list of objects (not implemented)')
    parser.add_argument('--url', type=str,
                        help='url to check')
    args = parser.parse_args(argv)

    if args.file is not None:
        # Tell the user instead of silently dropping the option.
        log.warning("--file is not implemented, ignoring: %s", args.file)

    if args.url is not None:
        article = download_and_clean(args.url)
        # NOTE(review): assumes download_and_clean() may return None on
        # failure, as clean_content() can - confirm against the scraper.
        if article is None:
            log.error("url not cleaned: %s", args.url)
            return
        print(article.title)
        print(article.text)
        print(article.is_valid_body())
        return

    if args.obj is not None:
        # Example id: "003503d1-e865-59d3-8607-9f99528d1740.html"
        clean_obj(args.obj)
        return

    # Nothing actionable was requested: show usage rather than exit silently.
    parser.print_help()
if __name__ == "__main__":
main()