scrape.py
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium import webdriver
from bs4 import BeautifulSoup
from threading import Thread
from queue import Queue
from time import sleep
from tqdm import tqdm

BUFFER = 64              # reset the progress bar after every 64 newly written IDs

writer_queue = Queue()   # video IDs discovered by the scraper threads
scrape_queue = Queue()   # video IDs waiting to be visited


def writer():
    # Single consumer thread: deduplicates incoming IDs, appends new ones to
    # videos.txt, and feeds them back into the scrape queue.
    videos = set()
    writes = 0
    pbar = tqdm(total = BUFFER)
    while True:
        pbar.n = 0
        pbar.refresh()
        with open('videos.txt', 'a') as f:
            while pbar.n < BUFFER:
                video = writer_queue.get()
                if video not in videos:
                    scrape_queue.put(video)
                    videos.add(video)
                    f.write(video + '\n')   # one ID per line (newline added; the original wrote bare IDs back to back)
                    writes += 1
                    pbar.set_description('Latest: %s, Total: %d' % (video, writes))
                    pbar.update()
                    pbar.refresh()
                sleep(1 / 30)


def scrape():
    # Worker thread: drives a headless Chrome instance, loads each queued
    # video page, and pushes every linked video ID it finds to the writer.
    options = Options()
    options.add_argument("--headless")
    browser = webdriver.Chrome(options = options)
    while True:
        video = scrape_queue.get()
        url = 'https://www.youtube.com/watch?v=' + video
        browser.get(url)
        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, 'thumbnail')))
        sleep(2)   # short pause for thumbnails to render (the original's bare WebDriverWait(browser, 2) was a no-op)
        html = browser.page_source
        soup = BeautifulSoup(html, 'lxml')
        thumbnails = soup.find_all('a', id = 'thumbnail')
        for thumbnail in thumbnails:
            if thumbnail.has_attr('href'):
                href = thumbnail['href']
                if len(href) == 20:          # '/watch?v=' (9 chars) + 11-character video ID
                    video = href[-11:]
                    writer_queue.put(video)


Thread(target = writer).start()
for i in range(4):
    Thread(target = scrape).start()
scrape_queue.put('')   # seed the crawl; an empty ID means the first fetch is the bare watch URL
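
The script seeds itself with an empty ID, so the first request goes to the bare https://www.youtube.com/watch?v= URL and the crawl expands from whatever thumbnails that page returns. A minimal alternative seeding step, assuming you want to start from a specific video (the ID below is only a placeholder, not part of the original script):

# Hypothetical seed: swap in any real 11-character YouTube video ID.
scrape_queue.put('dQw4w9WgXcQ')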