diff --git a/components/DateSelector/DateSelector.jsx b/components/DateSelector/DateSelector.jsx
index a6be11aff..cfd5cd01c 100644
--- a/components/DateSelector/DateSelector.jsx
+++ b/components/DateSelector/DateSelector.jsx
@@ -48,7 +48,7 @@ function DateSelector({
Currently, 311-Data loads only 311 service
- request data from 2023 onward.
+ request data from 2022 onward.
diff --git a/components/common/ReactDayPicker/ReactDayPicker.jsx b/components/common/ReactDayPicker/ReactDayPicker.jsx
index ece94e899..500b14b61 100644
--- a/components/common/ReactDayPicker/ReactDayPicker.jsx
+++ b/components/common/ReactDayPicker/ReactDayPicker.jsx
@@ -222,7 +222,7 @@ function ReactDayPicker({
onDayClick={handleDayClick}
onDayMouseEnter={handleDayMouseEnter}
weekdayElement={}
- fromMonth={new Date(2022, 12)}
+ fromMonth={new Date(2021, 12)}
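+ // JavaScript Date months are zero-indexed, so new Date(2021, 12) resolves to January 2022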
/>
>
);
diff --git a/components/db/DbProvider.jsx b/components/db/DbProvider.jsx
index df90de9e5..ad8b38d14 100644
--- a/components/db/DbProvider.jsx
+++ b/components/db/DbProvider.jsx
@@ -13,14 +13,9 @@ const datasets = {
hfYtd2024:
'https://huggingface.co/datasets/311-data/2024/resolve/main/2024.parquet', // 2024 year-to-date
hfYtd2023:
- 'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 year-to-date
- hfLastMonth:
- 'https://huggingface.co/datasets/edwinjue/311-data-last-month/resolve/refs%2Fconvert%2Fparquet/edwinjue--311-data-last-month/csv-train.parquet', // last month
- },
- csv: {
- // huggingface
- hfYtd:
- 'https://huggingface.co/datasets/edwinjue/311-data-2023/resolve/main/2023.csv', // year-to-date
+ 'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 entire year
+ hfYtd2022:
+ 'https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet', // 2022 entire year
},
};
@@ -71,6 +66,12 @@ function DbProvider({ children, startDate }) {
4,
);
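+ // Register the 2022 parquet hosted on HuggingFace so queries can reference 'requests2022.parquet'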
+ await newDb.registerFileURL(
+ 'requests2022.parquet',
+ datasets.parquet.hfYtd2022,
+ 4,
+ );
+
// Create db connection
const newConn = await newDb.connect();
diff --git a/scripts/cleanOldHfDataset.py b/scripts/cleanOldHfDataset.py
new file mode 100644
index 000000000..16c11f2b2
--- /dev/null
+++ b/scripts/cleanOldHfDataset.py
@@ -0,0 +1,120 @@
+'''
+This script downloads the selected year's csv file from Edwin's HuggingFace (which we no longer use),
+transforms the csv into a parquet file, creates the selected year's repo on 311-Data's HuggingFace, and
+uploads the parquet file.
+
+This is only used for migrating older years' data on a case-by-case basis; it is not to be confused with the
+daily cron job.
+
+To process an older year's data, run the script with Python in the terminal, passing the year as input:
+e.g.: `python3 cleanOldHfDataset.py 2022`
+'''
+
+import duckdb
+import requests
+import os
+import glob
+from tqdm import tqdm
+from huggingface_hub import HfApi, login
+from dotenv import load_dotenv
+import sys
+
+load_dotenv()
+
+def dlData(year):
+ '''
+ Download the dataset from Edwin's huggingface
+ '''
+ url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
+ outfile = f"{year}.csv"
+ response = requests.get(url, stream=True)
+
+ # Save downloaded file
+ with open(outfile, "wb") as file:
+ for data in tqdm(response.iter_content(chunk_size=1024 * 1024)):
+ file.write(data)
+
+
+def hfClean(year):
+ '''
+ Clean the dataset by removing problematic string combinations and updating timestamps to ISO format
+ '''
+ infile = f"{year}.csv"
+ fixed_filename = f"{year}-fixed.csv"
+ clean_filename = f"{year}-clean.parquet"
+
+ # List of problematic strings to be replaced with ""
+ replace_strings = ["VE, 0"]
+
+ conn = duckdb.connect(database=':memory:')
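+
+ # In-memory DuckDB instance used only for the csv -> parquet conversion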
+
+ try:
+ # Clean and save modified file
+ with open(infile, "r") as input_file, open(fixed_filename, "w") as output_file:
+ for line in input_file:
+ for replace_string in replace_strings:
+ line = line.replace(replace_string, "")
+ output_file.write(line)
+
+ # Open modified file and perform an import/export to duckdb to ensure timestamps are formatted correctly
+ conn.execute(
+ f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p');")
+ conn.execute(
+ f"copy (select * from requests) to '{clean_filename}' with (FORMAT PARQUET);")
+
+ except FileNotFoundError:
+ print(f"File {infile} not found.")
+
+
+def hfUpload(year):
+ '''
+ Upload the cleaned dataset to huggingface.co
+ '''
+ local_filename = f"{year}-clean.parquet"
+ dest_filename = f"{year}.parquet"
+ username = '311-data'
+ repo_name = str(year)
+ repo_type = 'dataset'
+
+ repo_id = f"{username}/{repo_name}"
+ TOKEN = os.getenv('HUGGINGFACE_LOGIN_TOKEN')
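+
+ # Requires a HuggingFace access token with write permission, read from .env as HUGGINGFACE_LOGIN_TOKEN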
+
+ login(TOKEN)
+ api = HfApi()
+
+ # Check if the repository exists, and create it if it doesn't
+ try:
+ api.repo_info(repo_id, repo_type=repo_type)
+ except Exception:
+ api.create_repo(repo_id, repo_type=repo_type, exist_ok=True)
+
+ # Upload the file to the repository
+ api.upload_file(
+ path_or_fileobj=local_filename,
+ path_in_repo=dest_filename,
+ repo_id=repo_id,
+ repo_type=repo_type,
+ )
+
+
+def cleanUp():
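+ '''
+ Remove intermediate csv and parquet files from the working directory
+ '''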
+ for file in glob.glob('*.csv'):
+ os.remove(file)
+ for file in glob.glob('*.parquet'):
+ os.remove(file)
+
+
+def process_data(year):
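+ '''
+ Run the full migration for one year: download, clean and convert, upload, then clean up local files
+ '''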
+ dlData(year)
+ hfClean(year)
+ hfUpload(year)
+ cleanUp()
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+ print("Usage: python one_time_script.py ")
+ sys.exit(1)
+
+ year = sys.argv[1]
+ process_data(year)