1714 migrate old data #1742

Merged · 10 commits · Jun 4, 2024
2 changes: 1 addition & 1 deletion components/DateSelector/DateSelector.jsx
@@ -48,7 +48,7 @@ function DateSelector({
           <p className={tooltipParagraph}>
             <strong>
               Currently, 311-Data loads only 311 service
-              request data from 2023 onward.
+              request data from 2022 onward.
             </strong>
           </p>
           <p className={tooltipParagraph}>
2 changes: 1 addition & 1 deletion components/common/ReactDayPicker/ReactDayPicker.jsx
@@ -222,7 +222,7 @@ function ReactDayPicker({
       onDayClick={handleDayClick}
       onDayMouseEnter={handleDayMouseEnter}
       weekdayElement={<WeekDay />}
-      fromMonth={new Date(2022, 12)}
+      fromMonth={new Date(2021, 12)}
     />
   </>
 );
17 changes: 9 additions & 8 deletions components/db/DbProvider.jsx
@@ -13,14 +13,9 @@ const datasets = {
     hfYtd2024:
       'https://huggingface.co/datasets/311-data/2024/resolve/main/2024.parquet', // 2024 year-to-date
     hfYtd2023:
-      'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 year-to-date
-    hfLastMonth:
-      'https://huggingface.co/datasets/edwinjue/311-data-last-month/resolve/refs%2Fconvert%2Fparquet/edwinjue--311-data-last-month/csv-train.parquet', // last month
-  },
-  csv: {
-    // huggingface
-    hfYtd:
-      'https://huggingface.co/datasets/edwinjue/311-data-2023/resolve/main/2023.csv', // year-to-date
+      'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 entire year
+    hfYtd2022:
+      'https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet', // 2022 entire year
   },
 };

@@ -71,6 +66,12 @@ function DbProvider({ children, startDate }) {
         4,
       );

+      await newDb.registerFileURL(
+        'requests2022.parquet',
+        datasets.parquet.hfYtd2022,
+        4,
+      );
+
       // Create db connection
       const newConn = await newDb.connect();
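Side note, not part of the diff: the registered files are ordinary parquet files served over HTTP, so the new 2022 dataset can be smoke-tested outside the browser. A minimal sketch, assuming the duckdb Python package (recent versions autoload the httpfs extension for https URLs), counting rows at the same URL that DbProvider registers as requests2022.parquet:

import duckdb

# Query the published 2022 parquet directly from its Hugging Face URL.
URL = "https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet"

con = duckdb.connect()
row_count = con.execute(f"SELECT COUNT(*) FROM read_parquet('{URL}')").fetchone()[0]
print(f"2022 rows: {row_count}")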
120 changes: 120 additions & 0 deletions scripts/cleanOldHfDataset.py
@@ -0,0 +1,120 @@
'''
This script downloads the selected year's csv file from Edwin's HuggingFace (which we no longer use),
transforms the csv into a parquet file, creates the selected year's repo on 311-Data's HuggingFace, and
uploads the parquet file.

It is only used for migrating older years' data on a case-by-case basis, not to be confused with the
daily cron job.

To process an older year's data, run the script with Python in the terminal, passing the year as input,
e.g.: `python3 cleanOldHfDataset.py 2022`
'''

import duckdb
import requests
import os
import glob
from tqdm import tqdm
from huggingface_hub import HfApi, login
from dotenv import load_dotenv
import sys

load_dotenv()

def dlData(year):
    '''
    Download the dataset from Edwin's huggingface
    '''
    url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
    outfile = f"{year}.csv"
    response = requests.get(url, stream=True)
    response.raise_for_status()  # fail early if the requested year's file does not exist

    # Save downloaded file, streaming in chunks to keep memory use low
    with open(outfile, "wb") as file:
        for data in tqdm(response.iter_content(chunk_size=1024 * 1024)):
            file.write(data)


def hfClean(year):
    '''
    Clean the dataset by removing problematic string combinations and updating timestamps to ISO format
    '''
    infile = f"{year}.csv"
    fixed_filename = f"{year}-fixed.csv"
    clean_filename = f"{year}-clean.parquet"

    # List of problematic strings to be replaced with ""
    replace_strings = ["VE, 0"]

    conn = duckdb.connect(database=':memory:')

    try:
        # Clean and save modified file
        with open(infile, "r") as input_file, open(fixed_filename, "w") as output_file:
            for line in input_file:
                for replace_string in replace_strings:
                    line = line.replace(replace_string, "")
                output_file.write(line)

        # Round-trip the modified file through duckdb to ensure timestamps are formatted correctly
        conn.execute(
            f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p');")
        conn.execute(
            f"copy (select * from requests) to '{clean_filename}' with (FORMAT PARQUET);")

    except FileNotFoundError:
        print(f"File {infile} not found.")


def hfUpload(year):
    '''
    Upload the clean dataset to huggingface.co
    '''
    local_filename = f"{year}-clean.parquet"
    dest_filename = f"{year}.parquet"
    username = '311-data'
    repo_name = str(year)
    repo_type = 'dataset'

    repo_id = f"{username}/{repo_name}"
    TOKEN = os.getenv('HUGGINGFACE_LOGIN_TOKEN')

    login(TOKEN)
    api = HfApi()

    # Check if the repository exists, and create it if it doesn't
    try:
        api.repo_info(repo_id, repo_type=repo_type)
    except Exception:
        api.create_repo(repo_id, repo_type=repo_type, exist_ok=True)

    # Upload the file to the repository
    api.upload_file(
        path_or_fileobj=local_filename,
        path_in_repo=dest_filename,
        repo_id=repo_id,
        repo_type=repo_type,
    )


def cleanUp():
    for file in glob.glob('*.csv'):
        os.remove(file)
    for file in glob.glob('*.parquet'):
        os.remove(file)


def process_data(year):
    dlData(year)
    hfClean(year)
    hfUpload(year)
    cleanUp()


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python3 cleanOldHfDataset.py <year>")
        sys.exit(1)

    year = sys.argv[1]
    process_data(year)
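Side note, not part of the PR: after a migration run, the upload can be double-checked before trusting the new repo in DbProvider. A minimal sketch using the same huggingface_hub package the script already imports, with the 2022 repo as the assumed example:

from huggingface_hub import HfApi

# Confirm the migrated parquet is present in the 311-data dataset repo,
# e.g. 311-data/2022 should now contain 2022.parquet.
api = HfApi()
files = api.list_repo_files("311-data/2022", repo_type="dataset")
assert "2022.parquet" in files, "parquet upload did not complete"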