diff --git a/components/DateSelector/DateSelector.jsx b/components/DateSelector/DateSelector.jsx
index a6be11aff..cfd5cd01c 100644
--- a/components/DateSelector/DateSelector.jsx
+++ b/components/DateSelector/DateSelector.jsx
@@ -48,7 +48,7 @@ function DateSelector({

       Currently, 311-Data loads only 311 service
-      request data from 2023 onward.
+      request data from 2022 onward.

diff --git a/components/common/ReactDayPicker/ReactDayPicker.jsx b/components/common/ReactDayPicker/ReactDayPicker.jsx
index ece94e899..500b14b61 100644
--- a/components/common/ReactDayPicker/ReactDayPicker.jsx
+++ b/components/common/ReactDayPicker/ReactDayPicker.jsx
@@ -222,7 +222,7 @@ function ReactDayPicker({
       onDayClick={handleDayClick}
       onDayMouseEnter={handleDayMouseEnter}
       weekdayElement={}
-      fromMonth={new Date(2022, 12)}
+      fromMonth={new Date(2021, 12)}
     />
   );
diff --git a/components/db/DbProvider.jsx b/components/db/DbProvider.jsx
index df90de9e5..ad8b38d14 100644
--- a/components/db/DbProvider.jsx
+++ b/components/db/DbProvider.jsx
@@ -13,14 +13,9 @@ const datasets = {
     hfYtd2024:
       'https://huggingface.co/datasets/311-data/2024/resolve/main/2024.parquet', // 2024 year-to-date
     hfYtd2023:
-      'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 year-to-date
-    hfLastMonth:
-      'https://huggingface.co/datasets/edwinjue/311-data-last-month/resolve/refs%2Fconvert%2Fparquet/edwinjue--311-data-last-month/csv-train.parquet', // last month
-  },
-  csv: {
-    // huggingface
-    hfYtd:
-      'https://huggingface.co/datasets/edwinjue/311-data-2023/resolve/main/2023.csv', // year-to-date
+      'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 entire year
+    hfYtd2022:
+      'https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet', // 2022 entire year
   },
 };
@@ -71,6 +66,12 @@ function DbProvider({ children, startDate }) {
         4,
       );
 
+      await newDb.registerFileURL(
+        'requests2022.parquet',
+        datasets.parquet.hfYtd2022,
+        4,
+      );
+
       // Create db connection
       const newConn = await newDb.connect();
diff --git a/scripts/cleanOldHfDataset.py b/scripts/cleanOldHfDataset.py
new file mode 100644
index 000000000..16c11f2b2
--- /dev/null
+++ b/scripts/cleanOldHfDataset.py
@@ -0,0 +1,120 @@
+'''
+This script downloads the selected year's csv file from Edwin's HuggingFace (which we no longer use),
+transforms the csv into a parquet file, creates the selected year's repo on 311-Data's HuggingFace, and
+uploads the parquet file.
+
+This is only used for migrating older years' data on a case-by-case basis, not to be confused with the
+daily cron job.
+
+To process an older year's data, run the script from the terminal with the year as an argument,
+e.g. `python3 cleanOldHfDataset.py 2022`
+'''
+
+import duckdb
+import requests
+import os
+import glob
+from tqdm import tqdm
+from huggingface_hub import HfApi, login
+from dotenv import load_dotenv
+import sys
+
+load_dotenv()
+
+
+def dlData(year):
+    '''
+    Download the dataset from Edwin's huggingface
+    '''
+    url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
+    outfile = f"{year}.csv"
+    response = requests.get(url, stream=True)
+
+    # Save downloaded file
+    with open(outfile, "wb") as file:
+        for data in tqdm(response.iter_content()):
+            file.write(data)
+
+
+def hfClean(year):
+    '''
+    Clean the dataset by removing problematic string combinations and updating timestamps to ISO format
+    '''
+    infile = f"{year}.csv"
+    fixed_filename = f"{year}-fixed.csv"
+    clean_filename = f"{year}-clean.parquet"
+
+    # List of problematic strings to be replaced with ""
+    replace_strings = ["VE, 0"]
+
+    conn = duckdb.connect(database=':memory:')
+
+    try:
+        # Clean and save modified file
+        with open(infile, "r") as input_file, open(fixed_filename, "w") as output_file:
+            for line in input_file:
+                for replace_string in replace_strings:
+                    line = line.replace(replace_string, "")
+                output_file.write(line)
+
+        # Open modified file and perform an import/export to duckdb to ensure timestamps are formatted correctly
+        conn.execute(
+            f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p');")
+        conn.execute(
+            f"copy (select * from requests) to '{clean_filename}' with (FORMAT PARQUET);")
+
+    except FileNotFoundError:
+        print(f"File {infile} not found.")
+
+
+def hfUpload(year):
+    '''
+    Upload the clean dataset to huggingface.co
+    '''
+    local_filename = f"{year}-clean.parquet"
+    dest_filename = f"{year}.parquet"
+    username = '311-data'
+    repo_name = str(year)
+    repo_type = 'dataset'
+
+    repo_id = f"{username}/{repo_name}"
+    TOKEN = os.getenv('HUGGINGFACE_LOGIN_TOKEN')
+
+    login(TOKEN)
+    api = HfApi()
+
+    # Check if the repository exists, and create it if it doesn't
+    try:
+        api.repo_info(repo_id)
+    except:
+        api.create_repo(repo_id, repo_type=repo_type, exist_ok=True)
+
+    # Upload the file to the repository
+    api.upload_file(
+        path_or_fileobj=local_filename,
+        path_in_repo=dest_filename,
+        repo_id=repo_id,
+        repo_type=repo_type,
+    )
+
+
+def cleanUp():
+    for file in glob.glob('*.csv'):
+        os.remove(file)
+    for file in glob.glob('*.parquet'):
+        os.remove(file)
+
+
+def process_data(year):
+    dlData(year)
+    hfClean(year)
+    hfUpload(year)
+    cleanUp()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python cleanOldHfDataset.py <year>")
+        sys.exit(1)
+
+    year = sys.argv[1]
+    process_data(year)
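A minimal sanity-check sketch (not part of the diff above), assuming the 311-data/2022 dataset referenced in DbProvider.jsx is public and that DuckDB's httpfs extension is available locally. It reads the uploaded parquet over HTTP so the row count can be eyeballed before the frontend starts loading it:

    import duckdb

    # URL taken from the hfYtd2022 entry added in DbProvider.jsx above
    URL = "https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet"

    conn = duckdb.connect(database=':memory:')
    conn.execute("INSTALL httpfs; LOAD httpfs;")  # required to read parquet over https
    rows = conn.execute(f"SELECT count(*) FROM read_parquet('{URL}')").fetchone()[0]
    print(f"2022 service requests: {rows}")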