|
| 1 | +import os |
| 2 | +import requests |
| 3 | +import logging |
| 4 | +import config |
| 5 | +import pandas as pd |
| 6 | +import sqlite3 |
| 7 | +import py7zr |
| 8 | +import zipfile |
| 9 | + |
| 10 | +# setup logging first |
| 11 | +logging.basicConfig(format='%(asctime)s :: %(levelname)-8s :: %(name)-20s :: %(message)s', |
| 12 | + datefmt='%Y-%m-%d %H:%M:%S', |
| 13 | + level="DEBUG") |
| 14 | +logger = logging.getLogger("twitch.data.downloader") |
| 15 | + |
if not os.path.exists(config.ZIP_PATH):
    # Download the dataset archive from dataverse.
    # Stream into a temporary ".part" file and atomically rename it on
    # success: the original wrote straight to config.ZIP_PATH, so an
    # interrupted download left a truncated file there and every later
    # run skipped the download and used the corrupt archive.
    logger.info("Downloading data file: %s", config.TWITCH_DATA_URL)
    partial_path = config.ZIP_PATH + ".part"
    # timeout guards against hanging forever; requests has no default timeout.
    with requests.get(config.TWITCH_DATA_URL, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    # Atomic on POSIX and Windows; only runs if the download completed.
    os.replace(partial_path, config.ZIP_PATH)
| 24 | + |
# The downloaded 7z archive wraps a nested ZIP (Twitch_data.zip).
# Unpack the outer 7z layer once, skipping the step when the nested
# ZIP is already present in the data folder.
inner_zip_file = os.path.join(config.DATA_FOLDER, "Twitch_data.zip")
if not os.path.exists(inner_zip_file):
    logger.info("Extracting data 7z file: %s", config.ZIP_PATH)
    with py7zr.SevenZipFile(config.ZIP_PATH, mode='r') as archive:
        archive.extractall(config.DATA_FOLDER)
| 31 | + |
# Unpack the nested ZIP, which yields the ICWSM19_data directory of
# pickle files; skip when that directory already exists.
data_folder = os.path.join(config.DATA_FOLDER, "ICWSM19_data")
if not os.path.exists(data_folder):
    logger.info("Extracting data ZIP file: %s", inner_zip_file)
    with zipfile.ZipFile(inner_zip_file, 'r') as archive:
        archive.extractall(config.DATA_FOLDER)
| 37 | + |
# Load each extracted pickle file into a pandas DataFrame and mirror it
# as a table in the local SQLite database.
with sqlite3.connect(config.DB_PATH) as conn:
    for fname in os.listdir(data_folder):
        # Guard against non-pickle entries (e.g. stray READMEs): the
        # original fed every directory entry to pd.read_pickle, which
        # would crash on unrelated content.
        if not fname.endswith(".pkl"):
            logger.debug("Skipping non-pickle file: %s", fname)
            continue
        fpath = os.path.join(data_folder, fname)
        logger.info("Reading data file: %s", fpath)
        data_frame = pd.read_pickle(fpath)
        # splitext drops only the trailing extension; the original
        # str.replace(".pkl", "") removed the substring anywhere in the
        # file name.
        table_name = os.path.splitext(fname)[0]
        # SQLite has no native datetime type: serialize datetime columns
        # to ISO-8601 strings before writing.
        datetime_cols = data_frame.select_dtypes(
            include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns
        for col in datetime_cols:
            data_frame[col] = data_frame[col].astype(str)
        data_frame.to_sql(table_name, conn,
                          if_exists='replace', index=False)
# The sqlite3 context manager commits (or rolls back) the transaction
# but does NOT close the connection — close it explicitly.
conn.close()
0 commit comments