This repository has been archived by the owner on Aug 13, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 10
Disk cache the manifests, fixes #392 #403
Merged
peterbe
merged 7 commits into
mozilla-services:master
from
peterbe:disk-cache-the-manifests-fixes-392
Apr 19, 2018
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
3e2a802
Disk cache the manifests, fixes #392
peterbe 8a2e063
hacking
peterbe 94f4484
fix for test_unzip_chunks
peterbe 4df3a66
avoid print
peterbe 2bd4f5e
ignore more
peterbe a2b9a5c
remove debug printing
peterbe 28d1cdc
review fixes
peterbe File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,3 +8,4 @@ buildhub-lambda*.zip | |
|
||
jobs/.cache | ||
ui/node_modules/ | ||
csv-download-directory/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,15 +5,21 @@ | |
import re | ||
import asyncio | ||
import datetime | ||
import glob | ||
import json | ||
import logging | ||
import os | ||
import pkgutil | ||
import tempfile | ||
import time | ||
import zlib | ||
from concurrent.futures import ThreadPoolExecutor | ||
|
||
import aiofiles | ||
import aiobotocore | ||
import botocore | ||
from decouple import config | ||
from aiohttp.client_exceptions import ClientPayloadError | ||
import kinto_http | ||
import raven | ||
from raven.handlers.logging import SentryHandler | ||
|
@@ -33,13 +39,19 @@ | |
'delivery-{inventory}/' | ||
) | ||
CHUNK_SIZE = 1024 * 256 # 256 KB | ||
MAX_CSV_DOWNLOAD_AGE = 60 * 60 * 24 * 2 # two days | ||
|
||
INITIALIZE_SERVER = os.getenv('INITIALIZE_SERVER', 'true').lower() == 'true' | ||
|
||
# Minimum number of hours old an entry in the CSV files needs to be | ||
# to NOT be skipped. | ||
MIN_AGE_LAST_MODIFIED_HOURS = int(os.getenv('MIN_AGE_LAST_MODIFIED_HOURS', 0)) | ||
|
||
CSV_DOWNLOAD_DIRECTORY = config( | ||
'CSV_DOWNLOAD_DIRECTORY', | ||
tempfile.gettempdir() | ||
) | ||
|
||
# Optional Sentry with synchronous client. | ||
SENTRY_DSN = os.getenv('SENTRY_DSN') | ||
sentry = raven.Client(SENTRY_DSN, transport=raven.transport.http.HTTPTransport) | ||
|
@@ -129,25 +141,82 @@ async def list_manifest_entries(loop, s3_client, inventory): | |
async with manifest['Body'] as stream: | ||
body = await stream.read() | ||
manifest_content = json.loads(body.decode('utf-8')) | ||
# Return keys of csv.gz files | ||
for f in manifest_content['files']: | ||
yield f['key'] | ||
|
||
|
||
async def download_csv(loop, s3_client, keys_stream, chunk_size=CHUNK_SIZE): | ||
# Here, each 'f' is a dictionary that looks something like this: | ||
# | ||
# { | ||
# "key" : "inventories/net-mozaw...f-b1a0-5fb25bb83752.csv.gz", | ||
# "size" : 7945521, | ||
# "MD5checksum" : "7454b0d773000f790f15b867ee152049" | ||
# } | ||
# | ||
# We yield the whole thing. The key is used to download from S3. | ||
# The MD5checksum is used to know how to store the file on | ||
# disk for caching. | ||
yield f | ||
|
||
|
||
async def download_csv( | ||
loop, | ||
s3_client, | ||
files_stream, | ||
chunk_size=CHUNK_SIZE, | ||
download_directory=CSV_DOWNLOAD_DIRECTORY, | ||
): | ||
""" | ||
Download the S3 object of each key and return deflated data chunks (CSV). | ||
:param loop: asyncio event loop. | ||
:param s3_client: Initialized S3 client. | ||
:param files_stream async generator: Stream of file dicts for | ||
the csv.gz manifests. | ||
""" | ||
async for key in keys_stream: | ||
key = 'public/' + key | ||
logger.info('Fetching inventory piece {}'.format(key)) | ||
file_csv_gz = await s3_client.get_object(Bucket=BUCKET, Key=key) | ||
|
||
# Make sure the directory exists if it wasn't already created. | ||
if not os.path.isdir(download_directory): | ||
os.makedirs(download_directory, exist_ok=True) | ||
|
||
# Look for old download junk in the download directory. | ||
too_old = MAX_CSV_DOWNLOAD_AGE | ||
for file_path in glob.glob(os.path.join(download_directory, '*.csv.gz')): | ||
age = time.time() - os.stat(file_path).st_mtime | ||
if age > too_old: | ||
logger.info( | ||
f'Delete old download file {file_path} ' | ||
f'({age} seconds old)' | ||
) | ||
os.remove(file_path) | ||
|
||
async for files in files_stream: | ||
# If it doesn't exist on disk, download to disk. | ||
file_path = os.path.join( | ||
download_directory, | ||
files['MD5checksum'] + '.csv.gz' | ||
) | ||
# Skip the download if the file already exists and has data. | ||
if os.path.isfile(file_path) and os.stat(file_path).st_size: | ||
logger.debug(f'{file_path} was already downloaded locally') | ||
else: | ||
key = 'public/' + files['key'] | ||
logger.info('Fetching inventory piece {}'.format(key)) | ||
file_csv_gz = await s3_client.get_object(Bucket=BUCKET, Key=key) | ||
try: | ||
async with aiofiles.open(file_path, 'wb') as destination: | ||
async with file_csv_gz['Body'] as source: | ||
while 'there are chunks to read': | ||
gzip_chunk = await source.read(chunk_size) | ||
if not gzip_chunk: | ||
break # End of response. | ||
await destination.write(gzip_chunk) | ||
size = os.stat(file_path).st_size | ||
logger.info(f'Downloaded {key} to {file_path} ({size} bytes)') | ||
except ClientPayloadError: | ||
if os.path.exists(file_path): | ||
os.remove(file_path) | ||
raise | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. r+ :) |
||
|
||
# Now we expect the file to exist locally. Let's read it. | ||
gzip = zlib.decompressobj(zlib.MAX_WBITS | 16) | ||
async with file_csv_gz['Body'] as stream: | ||
async with aiofiles.open(file_path, 'rb') as stream: | ||
while 'there are chunks to read': | ||
gzip_chunk = await stream.read(chunk_size) | ||
if not gzip_chunk: | ||
|
@@ -195,8 +264,8 @@ async def main(loop, inventory): | |
async with session.create_client( | ||
's3', region_name=REGION_NAME, config=boto_config | ||
) as client: | ||
keys_stream = list_manifest_entries(loop, client, inventory) | ||
csv_stream = download_csv(loop, client, keys_stream) | ||
files_stream = list_manifest_entries(loop, client, inventory) | ||
csv_stream = download_csv(loop, client, files_stream) | ||
records_stream = csv_to_records( | ||
loop, | ||
csv_stream, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: add explicit
st_size > 0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Out of curiosity; why?
Isn't it explicit already that the test is for the
st_size
to be anything greater than 0? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Matter of taste maybe :)