Commit 8dc3acc: initial commit (0 parents)

File tree

.gitignore
README.md
config.py
download_database.py
requirements.txt

5 files changed: +71 -0 lines changed

.gitignore

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+/temp/*
+/data/*
+/__pycache__/*
+/.vscode/*
+*.p
+*.pyc
+.bash_history
+.python_history
+
README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# Twitch data pipeline
+
+A Python data pipeline for downloading, extracting, preprocessing and analyzing Twitch chat data.
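The pipeline in this commit is driven by a single script; assuming a standard Python environment, a plausible way to run it end to end is:

    pip install -r requirements.txt
    python download_database.py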

config.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+import os
+script_dir = os.path.dirname(os.path.abspath(__file__))
+
+TWITCH_DATA_URL = "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/VE0IVQ/5VNGY6"
+DATA_FOLDER = os.path.join(script_dir, "data")
+DB_PATH = os.path.join(DATA_FOLDER, "twitch_data.db")
+ZIP_PATH = os.path.join(DATA_FOLDER, "twitch_data.7z")
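One detail worth noting: config.py only builds these paths; nothing in the commit creates the data/ directory itself, and /data/* is gitignored, so a fresh clone will not have it. A minimal guard sketch, assuming you would add it to the consuming script rather than give config.py side effects:

    import os
    import config

    # Hypothetical guard, not part of this commit: make sure the data
    # directory exists before download_database.py writes into it.
    os.makedirs(config.DATA_FOLDER, exist_ok=True)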

download_database.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+import os
+import requests
+import logging
+import config
+import pandas as pd
+import sqlite3
+import py7zr
+import zipfile
+
+# setup logging first
+logging.basicConfig(format='%(asctime)s :: %(levelname)-8s :: %(name)-20s :: %(message)s',
+                    datefmt='%Y-%m-%d %H:%M:%S',
+                    level="DEBUG")
+logger = logging.getLogger("twitch.data.downloader")
+
+if not os.path.exists(config.ZIP_PATH):
+    # download the dataset file from dataverse
+    logger.info("Downloading data file: %s", config.TWITCH_DATA_URL)
+    with requests.get(config.TWITCH_DATA_URL, stream=True) as r:
+        r.raise_for_status()
+        with open(config.ZIP_PATH, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+# Extract the data
+inner_zip_file = os.path.join(config.DATA_FOLDER, "Twitch_data.zip")
+if not os.path.exists(inner_zip_file):
+    logger.info("Extracting data 7z file: %s", config.ZIP_PATH)
+    with py7zr.SevenZipFile(config.ZIP_PATH, mode='r') as z:
+        z.extractall(config.DATA_FOLDER)
+
+data_folder = os.path.join(config.DATA_FOLDER, "ICWSM19_data")
+if not os.path.exists(data_folder):
+    logger.info("Extracting data ZIP file: %s", inner_zip_file)
+    with zipfile.ZipFile(inner_zip_file, 'r') as zip_ref:
+        zip_ref.extractall(config.DATA_FOLDER)
+
+# load the pickle files into pandas dataframes and write into local SQL database
+with sqlite3.connect(config.DB_PATH) as conn:
+    for f in os.listdir(data_folder):
+        fpath = os.path.join(data_folder, f)
+        logger.info("Reading data file: %s", fpath)
+        data_frame = pd.read_pickle(fpath)
+        table_name = os.path.basename(fpath).replace(".pkl", "")
+        # Identify datetime columns
+        datetime_cols = data_frame.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns
+        # Convert datetime columns to ISO 8601 strings
+        for col in datetime_cols:
+            data_frame[col] = data_frame[col].astype(str)
+        data_frame.to_sql(table_name, conn,
+                          if_exists='replace', index=False)
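Once the script has run, the resulting database can be inspected directly with sqlite3. A minimal sketch; the exact table names depend on the pickle files shipped inside ICWSM19_data:

    import sqlite3
    import config

    # List the tables download_database.py created (one per pickle file),
    # along with their row counts.
    with sqlite3.connect(config.DB_PATH) as conn:
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'").fetchall()
        for (name,) in rows:
            count = conn.execute(f'SELECT COUNT(*) FROM "{name}"').fetchone()[0]
            print(name, count)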

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+py7zr
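Note that download_database.py also imports requests and pandas (os, logging, sqlite3 and zipfile are standard library), so a fuller requirements.txt would presumably read:

    pandas
    py7zr
    requests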
