From d0412accfe3efa0b765ccc2dfdde02973aaee2a0 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Thu, 30 May 2024 17:23:32 -0700
Subject: [PATCH 1/9] Feat: set up script to manually clean old dataset from Edwin's HF with year parameter and upload cleaned dataset to 311's HF

---
 scripts/cleanHfDataset.py | 100 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 scripts/cleanHfDataset.py

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanHfDataset.py
new file mode 100644
index 000000000..d2eaf7763
--- /dev/null
+++ b/scripts/cleanHfDataset.py
@@ -0,0 +1,100 @@
+import duckdb
+import requests
+import os
+import glob
+from tqdm import tqdm
+from huggingface_hub import HfApi, login
+from dotenv import load_dotenv
+import sys
+
+load_dotenv()
+
+
+def dlData(year):
+    '''
+    Download the dataset from huggingface
+    '''
+    url = f"https://huggingface.co/datasets/edwinjue/311-data-2022/resolve/main/{year}.csv"
+    outfile = f"{year}.csv"
+    response = requests.get(url, stream=True)
+
+    # Save downloaded file
+    with open(outfile, "wb") as file:
+        for data in tqdm(response.iter_content()):
+            file.write(data)
+
+
+def hfClean(year):
+    '''
+    Clean the dataset by removing problematic string combinations and updating timestamps to ISO format
+    '''
+    infile = f"{year}.csv"
+    fixed_filename = f"{year}-fixed.csv"
+    clean_filename = f"{year}-clean.parquet"
+
+    # List of problematic strings to be replaced with ""
+    replace_strings = ["VE, 0"]
+
+    conn = duckdb.connect(database=':memory:')
+
+    try:
+        # Clean and save modified file
+        with open(infile, "r") as input_file, open(fixed_filename, "w") as output_file:
+            for line in input_file:
+                for replace_string in replace_strings:
+                    line = line.replace(replace_string, "")
+                output_file.write(line)
+
+        # Open modified file and perform an import/export to duckdb to ensure timestamps are formatted correctly
+        conn.execute(
+            f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p');")
+        conn.execute(
+            f"copy (select * from requests) to '{clean_filename}' with (FORMAT PARQUET);")
+
+    except FileNotFoundError:
+        print(f"File {infile} not found.")
+
+
+def hfUpload(year):
+    '''
+    Upload the cleaned dataset to huggingface.co
+    '''
+    local_filename = f"{year}-clean.parquet"
+    dest_filename = f"{year}.parquet"
+    username = '311-data'
+    repo_name = str(year)
+    repo_type = 'dataset'
+
+    repo_id = f"{username}/{repo_name}"
+    TOKEN = os.getenv('HUGGINGFACE_LOGIN_TOKEN')
+
+    login(TOKEN)
+    api = HfApi()
+    api.upload_file(
+        path_or_fileobj=local_filename,
+        path_in_repo=dest_filename,
+        repo_id=repo_id,
+        repo_type=repo_type,
+    )
+
+
+def cleanUp():
+    for file in glob.glob('*.csv'):
+        os.remove(file)
+    for file in glob.glob('*.parquet'):
+        os.remove(file)
+
+
+def process_data(year):
+    dlData(year)
+    hfClean(year)
+    hfUpload(year)
+    cleanUp()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python one_time_script.py <year>")
+        sys.exit(1)
+
+    year = sys.argv[1]
+    process_data(year)
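A note on the download step in PATCH 1/9: `response.iter_content()` with no arguments yields one byte at a time, so the tqdm bar ticks once per byte and shows no byte total. Below is a minimal chunked sketch, assuming the same `requests` and `tqdm` dependencies the script already imports; the helper name `dl_data_chunked`, the 1 MiB chunk size, and the Content-Length handling are illustrative choices, not part of the committed script.

    import requests
    from tqdm import tqdm

    def dl_data_chunked(url, outfile):
        response = requests.get(url, stream=True)
        response.raise_for_status()  # fail fast instead of saving an HTML error page
        # Content-Length can be absent on chunked responses; tqdm accepts total=None
        total = int(response.headers.get("content-length", 0)) or None
        with open(outfile, "wb") as file, tqdm(total=total, unit="B", unit_scale=True) as bar:
            for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1 MiB chunks
                file.write(chunk)
                bar.update(len(chunk))

Taking the URL and output path as parameters also keeps the sketch independent of the dataset-naming fix that PATCH 2/9 makes below.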
From 40935910f1aa2f5481bb1f1d9c2ba6676dceccf3 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Thu, 30 May 2024 17:35:26 -0700
Subject: [PATCH 2/9] Feat: add check if HF repo exists on upload and create it if missing

---
 scripts/cleanHfDataset.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanHfDataset.py
index d2eaf7763..798689c06 100644
--- a/scripts/cleanHfDataset.py
+++ b/scripts/cleanHfDataset.py
@@ -13,7 +13,7 @@ def dlData(year):
     '''
     Download the dataset from huggingface
     '''
-    url = f"https://huggingface.co/datasets/edwinjue/311-data-2022/resolve/main/{year}.csv"
+    url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
     outfile = f"{year}.csv"
     response = requests.get(url, stream=True)
 
     # Save downloaded file
@@ -69,6 +69,14 @@ def hfUpload(year):
 
     login(TOKEN)
     api = HfApi()
+
+    # Check if the repository exists, and create it if it doesn't
+    try:
+        api.repo_info(repo_id)
+    except:
+        api.create_repo(repo_id, repo_type=repo_type, exist_ok=True)
+
+    # Upload the file to the repository
     api.upload_file(
         path_or_fileobj=local_filename,
         path_in_repo=dest_filename,

From 8b71fad0bed0c7d0feed92b6b4c5dc5331e92956 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Thu, 30 May 2024 17:37:58 -0700
Subject: [PATCH 3/9] docs: update dlData comment

---
 scripts/cleanHfDataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanHfDataset.py
index 798689c06..b8a9b8feb 100644
--- a/scripts/cleanHfDataset.py
+++ b/scripts/cleanHfDataset.py
@@ -11,7 +11,7 @@
 
 def dlData(year):
     '''
-    Download the dataset from huggingface
+    Download the dataset from Edwin's huggingface
     '''
     url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
     outfile = f"{year}.csv"

From 45147375eabe68c291b3b1f89628d6112f2def53 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Fri, 31 May 2024 16:10:17 -0700
Subject: [PATCH 4/9] ops: add dataset registration

---
 components/db/DbProvider.jsx | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/components/db/DbProvider.jsx b/components/db/DbProvider.jsx
index df90de9e5..ad8b38d14 100644
--- a/components/db/DbProvider.jsx
+++ b/components/db/DbProvider.jsx
@@ -13,14 +13,9 @@ const datasets = {
   hfYtd2024:
     'https://huggingface.co/datasets/311-data/2024/resolve/main/2024.parquet', // 2024 year-to-date
   hfYtd2023:
-    'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 year-to-date
-  hfLastMonth:
-    'https://huggingface.co/datasets/edwinjue/311-data-last-month/resolve/refs%2Fconvert%2Fparquet/edwinjue--311-data-last-month/csv-train.parquet', // last month
-  },
-  csv: {
-    // huggingface
-    hfYtd:
-      'https://huggingface.co/datasets/edwinjue/311-data-2023/resolve/main/2023.csv', // year-to-date
+    'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 entire year
+  hfYtd2022:
+    'https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet', // 2022 entire year
   },
 };
@@ -71,6 +66,12 @@ function DbProvider({ children, startDate }) {
       4,
     );
 
+    await newDb.registerFileURL(
+      'requests2022.parquet',
+      datasets.parquet.hfYtd2022,
+      4,
+    );
+
     // Create db connection
     const newConn = await newDb.connect();

From 92b8ffe51c8775f3ebd39406c02ebc0a60745b42 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Fri, 31 May 2024 16:18:20 -0700
Subject: [PATCH 5/9] feat: enable calendar to browse 2022

---
 components/common/ReactDayPicker/ReactDayPicker.jsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/common/ReactDayPicker/ReactDayPicker.jsx b/components/common/ReactDayPicker/ReactDayPicker.jsx
index ece94e899..500b14b61 100644
--- a/components/common/ReactDayPicker/ReactDayPicker.jsx
+++ b/components/common/ReactDayPicker/ReactDayPicker.jsx
@@ -222,7 +222,7 @@ function ReactDayPicker({
       onDayClick={handleDayClick}
       onDayMouseEnter={handleDayMouseEnter}
       weekdayElement={}
-      fromMonth={new Date(2022, 12)}
+      fromMonth={new Date(2021, 12)}
     />
   );
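A note on the two frontend patches above: once a year's parquet file is registered with `registerFileURL`, DuckDB-wasm fetches it over HTTP on demand. The same file can be spot-checked from Python with DuckDB's httpfs extension; a sketch, assuming network access and that the 2022 repo already holds `2022.parquet` at the URL referenced in DbProvider.jsx.

    import duckdb

    conn = duckdb.connect()
    conn.execute("INSTALL httpfs;")  # one-time install of the HTTP(S) filesystem extension
    conn.execute("LOAD httpfs;")
    url = "https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet"
    # read_parquet issues HTTP range requests rather than downloading the whole file
    rows = conn.execute(f"select count(*) from read_parquet('{url}')").fetchone()[0]
    print(f"2022.parquet rows: {rows}")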
From e0a7cf07adaa856854ad0d8ba83155fb8d9feb Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Fri, 31 May 2024 16:23:30 -0700
Subject: [PATCH 6/9] feat: update tooltip text

---
 components/DateSelector/DateSelector.jsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/DateSelector/DateSelector.jsx b/components/DateSelector/DateSelector.jsx
index a6be11aff..cfd5cd01c 100644
--- a/components/DateSelector/DateSelector.jsx
+++ b/components/DateSelector/DateSelector.jsx
@@ -48,7 +48,7 @@ function DateSelector({
 
           Currently, 311-Data loads only 311 service
-          request data from 2023 onward.
+          request data from 2022 onward.
 
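One caveat on the hfClean step that the documentation patches below describe: the script's timestampformat '%m/%d/%Y %H:%M:%S %p' pairs a 24-hour field (%H) with an AM/PM marker (%p), while 12-hour timestamps normally pair %p with %I. The conversion is easy to test in isolation; a sketch, where the %I-based format string is an assumption to verify against the raw 311 csv, not the committed behavior.

    import duckdb

    def csv_to_parquet(csv_path, parquet_path, ts_format="%m/%d/%Y %I:%M:%S %p"):
        conn = duckdb.connect(database=':memory:')
        # Import with an explicit timestamp format, then export as parquet,
        # mirroring the two statements hfClean runs
        conn.execute(
            f"create table requests as select * from read_csv_auto('{csv_path}', header=True, timestampformat='{ts_format}');")
        conn.execute(
            f"copy (select * from requests) to '{parquet_path}' with (FORMAT PARQUET);")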
From ed44f81e7dc12fe0e2b6f3e8e6ffe1e6f6052586 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Fri, 31 May 2024 17:47:09 -0700
Subject: [PATCH 7/9] docs: add comment

---
 scripts/cleanHfDataset.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanHfDataset.py
index b8a9b8feb..83e100835 100644
--- a/scripts/cleanHfDataset.py
+++ b/scripts/cleanHfDataset.py
@@ -1,3 +1,15 @@
+'''
+This script downloads the selected year's csv file from Edwin's HuggingFace (which we no longer use),
+and transforms the csv into a parquet file, creates selected the year's repo on 311-Data's HuggingFace and
+uploads the parquet file.
+
+This is only used for migrating older years' data on a case-by-case basis, not to be confused with the
+daily cron job.
+
+Run the script with Python in the terminal with input year:
+e.g. `python3 cleanHfDataset.py 2022`
+'''
+
 import duckdb
 import requests
 import os

From e373cc7313a4f2145a9187924c86967427da739b Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Fri, 31 May 2024 17:50:02 -0700
Subject: [PATCH 8/9] fix: typo

---
 scripts/cleanHfDataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanHfDataset.py
index 83e100835..16c11f2b2 100644
--- a/scripts/cleanHfDataset.py
+++ b/scripts/cleanHfDataset.py
@@ -1,12 +1,12 @@
 '''
 This script downloads the selected year's csv file from Edwin's HuggingFace (which we no longer use),
-and transforms the csv into a parquet file, creates selected the year's repo on 311-Data's HuggingFace and
+and transforms the csv into a parquet file, creates the selected year's repo on 311-Data's HuggingFace and
 uploads the parquet file.
 
 This is only used for migrating older years' data on a case-by-case basis, not to be confused with the
 daily cron job.
 
-Run the script with Python in the terminal with input year:
+To process an older year's data, run the script with Python in the terminal with the year as input:
 e.g. `python3 cleanHfDataset.py 2022`
 '''
 

From 9f50d62364bf663041761f73669611c8ea6e5674 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Mon, 3 Jun 2024 10:14:35 -0700
Subject: [PATCH 9/9] fix: update file name for more clarity

---
 scripts/{cleanHfDataset.py => cleanOldHfDataset.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename scripts/{cleanHfDataset.py => cleanOldHfDataset.py} (100%)

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanOldHfDataset.py
similarity index 100%
rename from scripts/cleanHfDataset.py
rename to scripts/cleanOldHfDataset.py
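A closing note on the upload step from PATCH 2/9: the bare `except:` around `repo_info` also swallows `KeyboardInterrupt` and `SystemExit`, and since `create_repo` is already called with `exist_ok=True`, the probe can be narrowed to the specific not-found error or dropped entirely. A sketch, assuming a `huggingface_hub` version that exports `RepositoryNotFoundError` from `huggingface_hub.utils`.

    from huggingface_hub import HfApi
    from huggingface_hub.utils import RepositoryNotFoundError

    def ensure_repo(api, repo_id, repo_type="dataset"):
        '''Create the repo only when it is actually missing.'''
        try:
            api.repo_info(repo_id, repo_type=repo_type)
        except RepositoryNotFoundError:
            # exist_ok=True keeps this safe even if another process creates it first
            api.create_repo(repo_id, repo_type=repo_type, exist_ok=True)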