fetch.py
import argparse
import requests
import time
import os
import concurrent.futures

CHUNK_SIZE = 32 * 1024 * 1024  # 32 MiB
OUTPUT_FILE = None
NUM_THREADS = os.cpu_count()


class Node:
    """Singly linked list node holding the download future for one chunk."""
    def __init__(self, chunk_id, data=None, next=None):
        self.data = data
        self.chunk_id = chunk_id
        self.next = next
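
# Note: a plain list of futures would also work here; the linked list lets
# download_video drop each node after its chunk is written, so memory held by
# finished chunks can be reclaimed while later chunks are still downloading.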


def download_chunk(url, start, end, i, num_chunks, verbose):
    """Download a chunk of the video.

    Note that i and num_chunks are only needed for verbose output.
    """
    headers = {"Range": f"bytes={start}-{end}"}
    # stream=True defers the body download until .content is read, so the
    # timing below measures the actual transfer time for this chunk.
    response = requests.get(url, headers=headers, stream=True)
    if verbose:
        print(f'Downloading chunk {i} of {num_chunks}')
    start_time = time.time()
    content = response.content
    if verbose:
        end_time = time.time()
        download_speed = len(content) / (end_time - start_time)
        print(
            f'chunk {i} download speed: {download_speed / (1024 * 1024):.2f} MiB/s')
    return content


def download_video(url, chunk_size, num_threads, output_file, verbose=False):
    """
    Downloads the video by creating a linked list of concurrent.futures jobs and writes to output_file.
    """
    head = None
    current = None
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Ask the server for the total size so the byte ranges can be computed.
        response = requests.head(url)
        file_size = int(response.headers["Content-Length"])
        # Ceiling division so a trailing partial chunk still gets its own range
        # (and files smaller than chunk_size still produce one chunk).
        num_chunks = (file_size + chunk_size - 1) // chunk_size
        chunks = [(i * chunk_size, (i + 1) * chunk_size - 1)
                  for i in range(num_chunks)]
        # Trim the last range so it ends at the final byte of the file.
        chunks[-1] = (chunks[-1][0], file_size - 1)
        i = 0
        for (start, end) in chunks:
            i += 1
            if head is None:
                head = Node(i)
                current = head
            else:
                current.next = Node(i)
                current = current.next
            current.data = executor.submit(
                download_chunk, url, start, end, i, num_chunks, verbose)
        # Use `head` instead of `current` so we can free up memory as we write to file.
        with open(output_file, "wb") as f:
            while head is not None:
                # result() blocks until that chunk has finished downloading.
                result = head.data.result()
                if verbose:
                    print(f'Writing chunk {head.chunk_id} of {num_chunks}')
                f.write(result)
                # Drop the written node so its chunk data can be garbage collected.
                head = head.next
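
# download_video can also be called directly from Python, e.g. (hypothetical values):
#     download_video("https://example.com/video.mp4", CHUNK_SIZE, 4, "video.mp4", verbose=True)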


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Download video file from url in parallel')
    parser.add_argument(
        'url', type=str, help='URL of the video file to download')
    parser.add_argument('-c', '--chunk_size', type=int, default=CHUNK_SIZE,
                        help='Size of each chunk to download, in bytes')
    parser.add_argument('-o', '--output_file', type=str, default=OUTPUT_FILE,
                        help='Output filename. Defaults to the last part of the url path')
    parser.add_argument('-t', '--num_threads', type=int, default=NUM_THREADS,
                        help='Number of threads to use for downloading')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print the current chunk number, total number of chunks and download speed')
    args = parser.parse_args()

    url = args.url
    chunk_size = args.chunk_size
    if args.output_file:
        output_file = args.output_file
    else:
        output_file = os.path.basename(url)
    num_threads = args.num_threads
    verbose = args.verbose

    download_video(url, chunk_size, num_threads, output_file, verbose)
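
# Example invocation (hypothetical URL), using 8 threads and 16 MiB chunks with
# per-chunk progress output:
#
#     python fetch.py https://example.com/video.mp4 -t 8 -c 16777216 -v
#
# The server must support HTTP Range requests and report Content-Length for the
# parallel chunked download above to work.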