1714 migrate old data #1742

Merged · 10 commits · Jun 4, 2024
2 changes: 1 addition & 1 deletion components/DateSelector/DateSelector.jsx
@@ -48,7 +48,7 @@ function DateSelector({
           <p className={tooltipParagraph}>
             <strong>
               Currently, 311-Data loads only 311 service
-              request data from 2023 onward.
+              request data from 2022 onward.
             </strong>
           </p>
           <p className={tooltipParagraph}>
2 changes: 1 addition & 1 deletion components/common/ReactDayPicker/ReactDayPicker.jsx
@@ -222,7 +222,7 @@ function ReactDayPicker({
       onDayClick={handleDayClick}
       onDayMouseEnter={handleDayMouseEnter}
       weekdayElement={<WeekDay />}
-      fromMonth={new Date(2022, 12)}
+      fromMonth={new Date(2021, 12)}
     />
   </>
 );
17 changes: 9 additions & 8 deletions components/db/DbProvider.jsx
@@ -13,14 +13,9 @@ const datasets = {
     hfYtd2024:
       'https://huggingface.co/datasets/311-data/2024/resolve/main/2024.parquet', // 2024 year-to-date
     hfYtd2023:
-      'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 year-to-date
-    hfLastMonth:
-      'https://huggingface.co/datasets/edwinjue/311-data-last-month/resolve/refs%2Fconvert%2Fparquet/edwinjue--311-data-last-month/csv-train.parquet', // last month
-  },
-  csv: {
-    // huggingface
-    hfYtd:
-      'https://huggingface.co/datasets/edwinjue/311-data-2023/resolve/main/2023.csv', // year-to-date
+      'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 entire year
+    hfYtd2022:
+      'https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet', // 2022 entire year
   },
 };

@@ -71,6 +66,12 @@ function DbProvider({ children, startDate }) {
         4,
       );

+      await newDb.registerFileURL(
+        'requests2022.parquet',
+        datasets.parquet.hfYtd2022,
+        4,
+      );
+
       // Create db connection
       const newConn = await newDb.connect();
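Side note, not part of the diff: the registered files are ordinary parquet files served over HTTP, so the new 2022 dataset can be smoke-tested outside the browser. A minimal sketch, assuming the duckdb Python package (recent versions autoload the httpfs extension for https URLs), counting rows at the same URL that DbProvider registers as requests2022.parquet:

import duckdb

# Query the published 2022 parquet directly from its Hugging Face URL.
URL = "https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet"

con = duckdb.connect()
row_count = con.execute(f"SELECT COUNT(*) FROM read_parquet('{URL}')").fetchone()[0]
print(f"2022 rows: {row_count}")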
120 changes: 120 additions & 0 deletions scripts/cleanOldHfDataset.py
@@ -0,0 +1,120 @@
'''
This script downloads the selected year's csv file from Edwin's HuggingFace (which we no longer use),
transforms the csv into a parquet file, creates the selected year's repo on 311-Data's HuggingFace, and
uploads the parquet file.

It is only used for migrating older years' data on a case-by-case basis, not to be confused with the
daily cron job.

To process an older year's data, run the script with Python in the terminal, passing the year as input,
e.g.: `python3 cleanOldHfDataset.py 2022`
'''

import duckdb
import requests
import os
import glob
from tqdm import tqdm
from huggingface_hub import HfApi, login
from dotenv import load_dotenv
import sys

load_dotenv()

def dlData(year):
    '''
    Download the dataset from Edwin's huggingface
    '''
    url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
    outfile = f"{year}.csv"
    response = requests.get(url, stream=True)
    response.raise_for_status()  # fail early if the requested year's file does not exist

    # Save downloaded file, streaming in chunks to keep memory use low
    with open(outfile, "wb") as file:
        for data in tqdm(response.iter_content(chunk_size=1024 * 1024)):
            file.write(data)


def hfClean(year):
    '''
    Clean the dataset by removing problematic string combinations and updating timestamps to ISO format
    '''
    infile = f"{year}.csv"
    fixed_filename = f"{year}-fixed.csv"
    clean_filename = f"{year}-clean.parquet"

    # List of problematic strings to be replaced with ""
    replace_strings = ["VE, 0"]

    conn = duckdb.connect(database=':memory:')

    try:
        # Clean and save modified file
        with open(infile, "r") as input_file, open(fixed_filename, "w") as output_file:
            for line in input_file:
                for replace_string in replace_strings:
                    line = line.replace(replace_string, "")
                output_file.write(line)

        # Round-trip the modified file through duckdb to ensure timestamps are formatted correctly
        conn.execute(
            f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p');")
        conn.execute(
            f"copy (select * from requests) to '{clean_filename}' with (FORMAT PARQUET);")

    except FileNotFoundError:
        print(f"File {infile} not found.")


def hfUpload(year):
    '''
    Upload the clean dataset to huggingface.co
    '''
    local_filename = f"{year}-clean.parquet"
    dest_filename = f"{year}.parquet"
    username = '311-data'
    repo_name = str(year)
    repo_type = 'dataset'

    repo_id = f"{username}/{repo_name}"
    TOKEN = os.getenv('HUGGINGFACE_LOGIN_TOKEN')

    login(TOKEN)
    api = HfApi()

    # Check if the repository exists, and create it if it doesn't
    try:
        api.repo_info(repo_id, repo_type=repo_type)
    except Exception:
        api.create_repo(repo_id, repo_type=repo_type, exist_ok=True)

    # Upload the file to the repository
    api.upload_file(
        path_or_fileobj=local_filename,
        path_in_repo=dest_filename,
        repo_id=repo_id,
        repo_type=repo_type,
    )


def cleanUp():
    for file in glob.glob('*.csv'):
        os.remove(file)
    for file in glob.glob('*.parquet'):
        os.remove(file)


def process_data(year):
    dlData(year)
    hfClean(year)
    hfUpload(year)
    cleanUp()


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python3 cleanOldHfDataset.py <year>")
        sys.exit(1)

    year = sys.argv[1]
    process_data(year)
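Side note, not part of the PR: after a migration run, the upload can be double-checked before trusting the new repo in DbProvider. A minimal sketch using the same huggingface_hub package the script already imports, with the 2022 repo as the assumed example:

from huggingface_hub import HfApi

# Confirm the migrated parquet is present in the 311-data dataset repo,
# e.g. 311-data/2022 should now contain 2022.parquet.
api = HfApi()
files = api.list_repo_files("311-data/2022", repo_type="dataset")
assert "2022.parquet" in files, "parquet upload did not complete"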