From d0412accfe3efa0b765ccc2dfdde02973aaee2a0 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Thu, 30 May 2024 17:23:32 -0700
Subject: [PATCH 1/9] Feat: set up script to manually clean old dataset from Edwin's HF with year parameter and upload cleaned dataset to 311's HF

---
 scripts/cleanHfDataset.py | 100 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 scripts/cleanHfDataset.py

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanHfDataset.py
new file mode 100644
index 000000000..d2eaf7763
--- /dev/null
+++ b/scripts/cleanHfDataset.py
@@ -0,0 +1,100 @@
+import duckdb
+import requests
+import os
+import glob
+from tqdm import tqdm
+from huggingface_hub import HfApi, login
+from dotenv import load_dotenv
+import sys
+
+load_dotenv()
+
+
+def dlData(year):
+    '''
+    Download the dataset from huggingface
+    '''
+    url = f"https://huggingface.co/datasets/edwinjue/311-data-2022/resolve/main/{year}.csv"
+    outfile = f"{year}.csv"
+    response = requests.get(url, stream=True)
+
+    # Save downloaded file
+    with open(outfile, "wb") as file:
+        for data in tqdm(response.iter_content()):
+            file.write(data)
+
+
+def hfClean(year):
+    '''
+    Clean the dataset by removing problematic string combinations and updating timestamps to ISO format
+    '''
+    infile = f"{year}.csv"
+    fixed_filename = f"{year}-fixed.csv"
+    clean_filename = f"{year}-clean.parquet"
+
+    # List of problematic strings to be replaced with ""
+    replace_strings = ["VE, 0"]
+
+    conn = duckdb.connect(database=':memory:')
+
+    try:
+        # Clean and save modified file
+        with open(infile, "r") as input_file, open(fixed_filename, "w") as output_file:
+            for line in input_file:
+                for replace_string in replace_strings:
+                    line = line.replace(replace_string, "")
+                output_file.write(line)
+
+        # Open modified file and perform an import/export to duckdb to ensure timestamps are formatted correctly
+        conn.execute(
+            f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p');")
+        conn.execute(
+            f"copy (select * from requests) to '{clean_filename}' with (FORMAT PARQUET);")
+
+    except FileNotFoundError:
+        print(f"File {infile} not found.")
+
+
+def hfUpload(year):
+    '''
+    Upload the cleaned dataset to huggingface.co
+    '''
+    local_filename = f"{year}-clean.parquet"
+    dest_filename = f"{year}.parquet"
+    username = '311-data'
+    repo_name = str(year)
+    repo_type = 'dataset'
+
+    repo_id = f"{username}/{repo_name}"
+    TOKEN = os.getenv('HUGGINGFACE_LOGIN_TOKEN')
+
+    login(TOKEN)
+    api = HfApi()
+    api.upload_file(
+        path_or_fileobj=local_filename,
+        path_in_repo=dest_filename,
+        repo_id=repo_id,
+        repo_type=repo_type,
+    )
+
+
+def cleanUp():
+    for file in glob.glob('*.csv'):
+        os.remove(file)
+    for file in glob.glob('*.parquet'):
+        os.remove(file)
+
+
+def process_data(year):
+    dlData(year)
+    hfClean(year)
+    hfUpload(year)
+    cleanUp()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python one_time_script.py <year>")
+        sys.exit(1)
+
+    year = sys.argv[1]
+    process_data(year)
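A note on the download step in PATCH 1/9: `response.iter_content()` with no arguments yields one byte at a time, so the tqdm bar ticks once per byte and shows no byte total. Below is a minimal chunked sketch, assuming the same `requests` and `tqdm` dependencies the script already imports; the helper name `dl_data_chunked`, the 1 MiB chunk size, and the Content-Length handling are illustrative choices, not part of the committed script.

    import requests
    from tqdm import tqdm

    def dl_data_chunked(url, outfile):
        response = requests.get(url, stream=True)
        response.raise_for_status()  # fail fast instead of saving an HTML error page
        # Content-Length can be absent on chunked responses; tqdm accepts total=None
        total = int(response.headers.get("content-length", 0)) or None
        with open(outfile, "wb") as file, tqdm(total=total, unit="B", unit_scale=True) as bar:
            for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1 MiB chunks
                file.write(chunk)
                bar.update(len(chunk))

Taking the URL and output path as parameters also keeps the sketch independent of the dataset-naming fix that PATCH 2/9 makes below.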
From 40935910f1aa2f5481bb1f1d9c2ba6676dceccf3 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Thu, 30 May 2024 17:35:26 -0700
Subject: [PATCH 2/9] Feat: add check if HF repo exists on upload and create it if missing

---
 scripts/cleanHfDataset.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanHfDataset.py
index d2eaf7763..798689c06 100644
--- a/scripts/cleanHfDataset.py
+++ b/scripts/cleanHfDataset.py
@@ -13,7 +13,7 @@ def dlData(year):
     '''
     Download the dataset from huggingface
     '''
-    url = f"https://huggingface.co/datasets/edwinjue/311-data-2022/resolve/main/{year}.csv"
+    url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
     outfile = f"{year}.csv"
     response = requests.get(url, stream=True)
 
     # Save downloaded file
@@ -69,6 +69,14 @@ def hfUpload(year):
 
     login(TOKEN)
     api = HfApi()
+
+    # Check if the repository exists, and create it if it doesn't
+    try:
+        api.repo_info(repo_id)
+    except:
+        api.create_repo(repo_id, repo_type=repo_type, exist_ok=True)
+
+    # Upload the file to the repository
     api.upload_file(
         path_or_fileobj=local_filename,
         path_in_repo=dest_filename,

From 8b71fad0bed0c7d0feed92b6b4c5dc5331e92956 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Thu, 30 May 2024 17:37:58 -0700
Subject: [PATCH 3/9] docs: update dlData comment

---
 scripts/cleanHfDataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanHfDataset.py
index 798689c06..b8a9b8feb 100644
--- a/scripts/cleanHfDataset.py
+++ b/scripts/cleanHfDataset.py
@@ -11,7 +11,7 @@
 
 def dlData(year):
     '''
-    Download the dataset from huggingface
+    Download the dataset from Edwin's huggingface
     '''
     url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
     outfile = f"{year}.csv"

From 45147375eabe68c291b3b1f89628d6112f2def53 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Fri, 31 May 2024 16:10:17 -0700
Subject: [PATCH 4/9] ops: add dataset registration

---
 components/db/DbProvider.jsx | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/components/db/DbProvider.jsx b/components/db/DbProvider.jsx
index df90de9e5..ad8b38d14 100644
--- a/components/db/DbProvider.jsx
+++ b/components/db/DbProvider.jsx
@@ -13,14 +13,9 @@ const datasets = {
   hfYtd2024:
     'https://huggingface.co/datasets/311-data/2024/resolve/main/2024.parquet', // 2024 year-to-date
   hfYtd2023:
-    'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 year-to-date
-  hfLastMonth:
-    'https://huggingface.co/datasets/edwinjue/311-data-last-month/resolve/refs%2Fconvert%2Fparquet/edwinjue--311-data-last-month/csv-train.parquet', // last month
-  },
-  csv: {
-    // huggingface
-    hfYtd:
-      'https://huggingface.co/datasets/edwinjue/311-data-2023/resolve/main/2023.csv', // year-to-date
+    'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 entire year
+  hfYtd2022:
+    'https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet', // 2022 entire year
   },
 };
@@ -71,6 +66,12 @@ function DbProvider({ children, startDate }) {
       4,
     );
 
+    await newDb.registerFileURL(
+      'requests2022.parquet',
+      datasets.parquet.hfYtd2022,
+      4,
+    );
+
     // Create db connection
     const newConn = await newDb.connect();

From 92b8ffe51c8775f3ebd39406c02ebc0a60745b42 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Fri, 31 May 2024 16:18:20 -0700
Subject: [PATCH 5/9] feat: enable calendar to browse 2022

---
 components/common/ReactDayPicker/ReactDayPicker.jsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/common/ReactDayPicker/ReactDayPicker.jsx b/components/common/ReactDayPicker/ReactDayPicker.jsx
index ece94e899..500b14b61 100644
--- a/components/common/ReactDayPicker/ReactDayPicker.jsx
+++ b/components/common/ReactDayPicker/ReactDayPicker.jsx
@@ -222,7 +222,7 @@ function ReactDayPicker({
       onDayClick={handleDayClick}
       onDayMouseEnter={handleDayMouseEnter}
       weekdayElement={}
-      fromMonth={new Date(2022, 12)}
+      fromMonth={new Date(2021, 12)}
     />
   );
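A note on the two frontend patches above: once a year's parquet file is registered with `registerFileURL`, DuckDB-wasm fetches it over HTTP on demand. The same file can be spot-checked from Python with DuckDB's httpfs extension; a sketch, assuming network access and that the 2022 repo already holds `2022.parquet` at the URL referenced in DbProvider.jsx.

    import duckdb

    conn = duckdb.connect()
    conn.execute("INSTALL httpfs;")  # one-time install of the HTTP(S) filesystem extension
    conn.execute("LOAD httpfs;")
    url = "https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet"
    # read_parquet issues HTTP range requests rather than downloading the whole file
    rows = conn.execute(f"select count(*) from read_parquet('{url}')").fetchone()[0]
    print(f"2022.parquet rows: {rows}")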
From e0a7cf07adaa856854ad0d8ba83155fb8d9feb Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Fri, 31 May 2024 16:23:30 -0700
Subject: [PATCH 6/9] feat: update tooltip text

---
 components/DateSelector/DateSelector.jsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/DateSelector/DateSelector.jsx b/components/DateSelector/DateSelector.jsx
index a6be11aff..cfd5cd01c 100644
--- a/components/DateSelector/DateSelector.jsx
+++ b/components/DateSelector/DateSelector.jsx
@@ -48,7 +48,7 @@ function DateSelector({
 
           Currently, 311-Data loads only 311 service
-          request data from 2023 onward.
+          request data from 2022 onward.
 
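One caveat on the hfClean step that the documentation patches below describe: the script's timestampformat '%m/%d/%Y %H:%M:%S %p' pairs a 24-hour field (%H) with an AM/PM marker (%p), while 12-hour timestamps normally pair %p with %I. The conversion is easy to test in isolation; a sketch, where the %I-based format string is an assumption to verify against the raw 311 csv, not the committed behavior.

    import duckdb

    def csv_to_parquet(csv_path, parquet_path, ts_format="%m/%d/%Y %I:%M:%S %p"):
        conn = duckdb.connect(database=':memory:')
        # Import with an explicit timestamp format, then export as parquet,
        # mirroring the two statements hfClean runs
        conn.execute(
            f"create table requests as select * from read_csv_auto('{csv_path}', header=True, timestampformat='{ts_format}');")
        conn.execute(
            f"copy (select * from requests) to '{parquet_path}' with (FORMAT PARQUET);")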
From ed44f81e7dc12fe0e2b6f3e8e6ffe1e6f6052586 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Fri, 31 May 2024 17:47:09 -0700
Subject: [PATCH 7/9] docs: add comment

---
 scripts/cleanHfDataset.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanHfDataset.py
index b8a9b8feb..83e100835 100644
--- a/scripts/cleanHfDataset.py
+++ b/scripts/cleanHfDataset.py
@@ -1,3 +1,15 @@
+'''
+This script downloads the selected year's csv file from Edwin's HuggingFace (which we no longer use),
+and transforms the csv into a parquet file, creates selected the year's repo on 311-Data's HuggingFace and
+uploads the parquet file.
+
+This is only used for migrating older years' data on a case-by-case basis, not to be confused with the
+daily cron job.
+
+Run the script with Python in the terminal with input year:
+e.g. `python3 cleanHfDataset.py 2022`
+'''
+
 import duckdb
 import requests
 import os

From e373cc7313a4f2145a9187924c86967427da739b Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Fri, 31 May 2024 17:50:02 -0700
Subject: [PATCH 8/9] fix: typo

---
 scripts/cleanHfDataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanHfDataset.py
index 83e100835..16c11f2b2 100644
--- a/scripts/cleanHfDataset.py
+++ b/scripts/cleanHfDataset.py
@@ -1,12 +1,12 @@
 '''
 This script downloads the selected year's csv file from Edwin's HuggingFace (which we no longer use),
-and transforms the csv into a parquet file, creates selected the year's repo on 311-Data's HuggingFace and
+and transforms the csv into a parquet file, creates the selected year's repo on 311-Data's HuggingFace and
 uploads the parquet file.
 
 This is only used for migrating older years' data on a case-by-case basis, not to be confused with the
 daily cron job.
 
-Run the script with Python in the terminal with input year:
+To process an older year's data, run the script with Python in the terminal with the year as input:
 e.g. `python3 cleanHfDataset.py 2022`
 '''
 

From 9f50d62364bf663041761f73669611c8ea6e5674 Mon Sep 17 00:00:00 2001
From: Johnny Wu
Date: Mon, 3 Jun 2024 10:14:35 -0700
Subject: [PATCH 9/9] fix: update file name for more clarity

---
 scripts/{cleanHfDataset.py => cleanOldHfDataset.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename scripts/{cleanHfDataset.py => cleanOldHfDataset.py} (100%)

diff --git a/scripts/cleanHfDataset.py b/scripts/cleanOldHfDataset.py
similarity index 100%
rename from scripts/cleanHfDataset.py
rename to scripts/cleanOldHfDataset.py
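A closing note on the upload step from PATCH 2/9: the bare `except:` around `repo_info` also swallows `KeyboardInterrupt` and `SystemExit`, and since `create_repo` is already called with `exist_ok=True`, the probe can be narrowed to the specific not-found error or dropped entirely. A sketch, assuming a `huggingface_hub` version that exports `RepositoryNotFoundError` from `huggingface_hub.utils`.

    from huggingface_hub import HfApi
    from huggingface_hub.utils import RepositoryNotFoundError

    def ensure_repo(api, repo_id, repo_type="dataset"):
        '''Create the repo only when it is actually missing.'''
        try:
            api.repo_info(repo_id, repo_type=repo_type)
        except RepositoryNotFoundError:
            # exist_ok=True keeps this safe even if another process creates it first
            api.create_repo(repo_id, repo_type=repo_type, exist_ok=True)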