Commit 8dc3acc: initial commit (0 parents)

File tree

.gitignore
README.md
config.py
download_database.py
requirements.txt

5 files changed: +71 -0 lines changed

.gitignore

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+/temp/*
+/data/*
+/__pycache__/*
+/.vscode/*
+*.p
+*.pyc
+.bash_history
+.python_history
+
README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# Twitch data pipeline
+
+A Python data pipeline for downloading, extracting, preprocessing and analyzing Twitch chat data.
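The pipeline in this commit is driven by a single script; assuming a standard Python environment, a plausible way to run it end to end is:

    pip install -r requirements.txt
    python download_database.py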

config.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+import os
+script_dir = os.path.dirname(os.path.abspath(__file__))
+
+TWITCH_DATA_URL = "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/VE0IVQ/5VNGY6"
+DATA_FOLDER = os.path.join(script_dir, "data")
+DB_PATH = os.path.join(DATA_FOLDER, "twitch_data.db")
+ZIP_PATH = os.path.join(DATA_FOLDER, "twitch_data.7z")
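One detail worth noting: config.py only builds these paths; nothing in the commit creates the data/ directory itself, and /data/* is gitignored, so a fresh clone will not have it. A minimal guard sketch, assuming you would add it to the consuming script rather than give config.py side effects:

    import os
    import config

    # Hypothetical guard, not part of this commit: make sure the data
    # directory exists before download_database.py writes into it.
    os.makedirs(config.DATA_FOLDER, exist_ok=True)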

download_database.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+import os
+import requests
+import logging
+import config
+import pandas as pd
+import sqlite3
+import py7zr
+import zipfile
+
+# setup logging first
+logging.basicConfig(format='%(asctime)s :: %(levelname)-8s :: %(name)-20s :: %(message)s',
+                    datefmt='%Y-%m-%d %H:%M:%S',
+                    level="DEBUG")
+logger = logging.getLogger("twitch.data.downloader")
+
+if not os.path.exists(config.ZIP_PATH):
+    # download the dataset file from dataverse
+    logger.info("Downloading data file: %s", config.TWITCH_DATA_URL)
+    with requests.get(config.TWITCH_DATA_URL, stream=True) as r:
+        r.raise_for_status()
+        with open(config.ZIP_PATH, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+# Extract the data
+inner_zip_file = os.path.join(config.DATA_FOLDER, "Twitch_data.zip")
+if not os.path.exists(inner_zip_file):
+    logger.info("Extracting data 7z file: %s", config.ZIP_PATH)
+    with py7zr.SevenZipFile(config.ZIP_PATH, mode='r') as z:
+        z.extractall(config.DATA_FOLDER)
+
+data_folder = os.path.join(config.DATA_FOLDER, "ICWSM19_data")
+if not os.path.exists(data_folder):
+    logger.info("Extracting data ZIP file: %s", inner_zip_file)
+    with zipfile.ZipFile(inner_zip_file, 'r') as zip_ref:
+        zip_ref.extractall(config.DATA_FOLDER)
+
+# load the pickle files into pandas dataframes and write into local SQL database
+with sqlite3.connect(config.DB_PATH) as conn:
+    for f in os.listdir(data_folder):
+        fpath = os.path.join(data_folder, f)
+        logger.info("Reading data file: %s", fpath)
+        data_frame = pd.read_pickle(fpath)
+        table_name = os.path.basename(fpath).replace(".pkl", "")
+        # Identify datetime columns
+        datetime_cols = data_frame.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns
+        # Convert datetime columns to ISO 8601 strings
+        for col in datetime_cols:
+            data_frame[col] = data_frame[col].astype(str)
+        data_frame.to_sql(table_name, conn,
+                          if_exists='replace', index=False)
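Once the script has run, the resulting database can be inspected directly with sqlite3. A minimal sketch; the exact table names depend on the pickle files shipped inside ICWSM19_data:

    import sqlite3
    import config

    # List the tables download_database.py created (one per pickle file),
    # along with their row counts.
    with sqlite3.connect(config.DB_PATH) as conn:
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'").fetchall()
        for (name,) in rows:
            count = conn.execute(f'SELECT COUNT(*) FROM "{name}"').fetchone()[0]
            print(name, count)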

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+py7zr
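Note that download_database.py also imports requests and pandas (os, logging, sqlite3 and zipfile are standard library), so a fuller requirements.txt would presumably read:

    pandas
    py7zr
    requests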
