-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
69 lines (54 loc) · 2.33 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
import os
import importlib
import time
import sys
from datetime import datetime, timedelta
from usda.usda import send_clickhouse
def run_scrapes():
    """Run every scraper module found in ``scripts/`` with one remote Firefox.

    After a fixed start-up delay, opens a ``webdriver.Remote`` Firefox session
    on the Selenium hub, then imports each ``scripts/*.py`` module and calls
    its ``run_scrape(driver)`` when the module defines one. An exception
    raised by one script is printed and the loop moves on to the next script.
    The driver is always quit before this function returns.
    """
    # Even though the container with this script waits for the selenium_hub
    # and firefox node containers to be created, connecting right away still
    # errored out. A fixed delay before connecting made it work.
    # Comment out if testing.
    # TODO: Fix this. Avoid the need to sleep.
    time.sleep(200)

    # Use on AWS container to connect to hub
    hub_url = "http://localhost:4444/wd/hub"
    # Use within docker local container to connect to hub
    # hub_url = "http://selenium_hub:4444/wd/hub"

    shared_folder = "/shared-data/"

    options = webdriver.FirefoxOptions()
    # NOTE(review): these look like Chromium-style switches; kept as-is,
    # confirm whether Firefox actually needs them.
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-extensions')

    # Set the download preferences directly on the options object. The
    # previous code stored an encoded FirefoxProfile under a preference named
    # "firefox_profile", which only defines a pref with that name and never
    # applies the download settings themselves.
    options.set_preference("browser.download.folderList", 2)
    options.set_preference("browser.download.manager.showWhenStarting", False)
    options.set_preference("browser.download.dir", shared_folder)
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")

    driver = webdriver.Remote(
        command_executor=hub_url,
        options=options
    )

    # try/finally so the remote session is released even when listing the
    # scripts directory fails or a non-Exception (e.g. KeyboardInterrupt)
    # interrupts the loop.
    try:
        scripts_directory = "scripts"
        script_files = [f for f in os.listdir(scripts_directory) if f.endswith(".py")]

        for script_file in script_files:
            try:
                module_name = script_file[:-3]  # strip the ".py" suffix
                module = importlib.import_module(f"{scripts_directory}.{module_name}")

                if hasattr(module, "run_scrape"):
                    module.run_scrape(driver)
                    print("\n\n\n")
            except Exception as error:
                # Best effort: one failing script must not stop the others.
                print(str(error))
    finally:
        driver.quit()
def run_usda():
    """Call ``send_clickhouse`` with yesterday's and today's local dates.

    Both dates are formatted as ``YYYY-MM-DD`` strings and passed as
    ``send_clickhouse(yesterday, current_date)``.
    """
    # Read the clock once so both dates come from the same instant. Two
    # separate now() calls can straddle midnight and yield identical dates.
    now = datetime.now()
    current_date = now.strftime('%Y-%m-%d')
    yesterday = (now - timedelta(days=1)).strftime('%Y-%m-%d')
    send_clickhouse(yesterday, current_date)
if __name__ == "__main__":
    # Script entry point: run the scrapers first, then the USDA step.
    # An exception in the first step prevents the second from running.
    for step in (run_scrapes, run_usda):
        step()