0_download.py
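"""Download images for a list of search queries from Pexels, Flickr, or Unsplash.

Rotates through the configured API tokens when the hourly quota is hit and
checkpoints download metadata to a JSON file after every query.
"""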
import argparse
import json
import logging
import os
from datetime import datetime
from time import sleep

from tqdm import tqdm

from utils.utils_dev import get_yaml_data
from utils.utils_flickr import download_flickr
from utils.utils_pexels import download_pexels
from utils.utils_unsplash import download_unsplash

parser = argparse.ArgumentParser()
parser.add_argument("--logdir", type=str, default="logs")
parser.add_argument("--source", type=str, default="unsplash")
args = parser.parse_args()

LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
os.makedirs(args.logdir, exist_ok=True)
logging.basicConfig(
    filename=f"{args.logdir}/{args.source}-{datetime.now().timestamp()}.log",
    format=LOG_FORMAT,
)
cred_path = "credentials/credentials.yaml"
query_path = "data/queries.json"
metadata_path = f"dataset/pre_metadata-{ args.source }.json"
dataset_basedir = "dataset/images"
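# credentials.yaml is expected to hold one entry per source name, each with
# 'tokens' (a list of API keys), 'maxPerHour', and 'maxPerDay' fields
# (inferred from the accesses below; adjust if your schema differs).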
# NOTE: match/case requires Python >= 3.10 (TODO: add an if/elif fallback for <= 3.9)
match args.source:
    case "pexels":
        cred = get_yaml_data(cred_path)['pexels']
        os.environ.pop("HTTP_PROXY", None)
        os.environ.pop("HTTPS_PROXY", None)
        download = download_pexels
        from utils.utils_pexels import get_api
    case "flickr":
        download = download_flickr
        cred = get_yaml_data(cred_path)['flickr']
        # Flickr is reached through a local proxy; Pexels/Unsplash are direct.
        os.environ['HTTP_PROXY'] = "http://127.0.0.1:7890"
        os.environ['HTTPS_PROXY'] = "http://127.0.0.1:7890"
        from utils.utils_flickr import get_api
    case "unsplash":
        download = download_unsplash
        os.environ.pop("HTTP_PROXY", None)
        os.environ.pop("HTTPS_PROXY", None)
        cred = get_yaml_data(cred_path)['unsplash']
        from utils.utils_unsplash import get_api
    case _:
        raise NotImplementedError(f"Unknown source: {args.source}")
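# Each utils_<source> module is expected to provide (interface inferred from
# how the functions are called below):
#   get_api(token)                             -> an authenticated API client
#   download_<source>(query, api, basedir, n)  -> (list of metadata dicts, do_sleep flag)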
hour_amount, daily_amount = cred['maxPerHour'], cred['maxPerDay']
tokens = iter(cred['tokens'])

os.makedirs(dataset_basedir, exist_ok=True)
if os.path.exists(metadata_path):
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
else:
    metadata = {'metadata': [], 'queries': {}}

with open(query_path, "r") as f:
    queries = json.load(f)
queries = list(set(queries))  # deduplicate queries
n_each_query = 40

api = get_api(next(tokens))
hour_count = 0
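# Main loop: each query counts as one unit against the hourly quota (see the
# commented alternative below for per-image counting). When the quota is
# reached, or the downloader signals do_sleep, switch to the next token; once
# every token is exhausted, reset to the first token and sleep for an hour.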
for query in tqdm(queries):
    print(query)
    meta, do_sleep = download(query, api, dataset_basedir, n_each_query)
    metadata['metadata'] += meta
    metadata['queries'][query] = metadata['queries'].get(query, 0) + len(meta)
    # Checkpoint metadata after every query so progress survives interruptions.
    with open(metadata_path, "w") as f:
        json.dump(metadata, f)
    hour_count += 1
    # hour_count += len(meta) + 1
    if hour_count >= hour_amount or do_sleep:
        hour_count = 0
        try:
            print("Changing to the next token")
            logging.info("Changing to the next token")
            api = get_api(next(tokens))
        except StopIteration:
            print("Sleeping...")
            logging.info("Sleeping...")
            tokens = iter(cred['tokens'])
            api = get_api(next(tokens))
            sleep(3600)
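# Example usage (assumes credentials/credentials.yaml and data/queries.json exist):
#   python 0_download.py --source pexels --logdir logs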