-
-
Notifications
You must be signed in to change notification settings - Fork 63
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
1714-2 migrate 2020-2021 data #1747
Changes from 11 commits
2953b79
c27efe3
31a3e8a
1f7e125
f6f9393
2313d78
cf5f027
cc08dac
cf0e15d
8189ca2
8437812
2ffcb5e
c3a8d7d
ae0b84d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
'''
Add a placeholder 'N/A' value to the 'CreatedByUserOrganization' column
(column index 8) for rows that are exactly one field short.

The 2021 data was missing values in that entire column, which shifted all
columns after it forward.  Saving this for future similar situations.
'''

import csv
import sys


def add_missing_column(input_file, output_file, insert_index=8, placeholder='N/A'):
    """Copy *input_file* to *output_file*, repairing short rows.

    A row that is exactly one field shorter than the header gets
    *placeholder* inserted at *insert_index*; any other length mismatch
    is reported and the row is written through unchanged.

    Returns the number of rows that were repaired.
    """
    fixed = 0
    with open(input_file, "r", newline='', encoding='utf-8') as infile, \
            open(output_file, "w", newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Read and pass through the header; its length defines "correct".
        header = next(reader)
        writer.writerow(header)

        # start=2: line 1 is the header, so data begins on file line 2.
        for line_number, row in enumerate(reader, start=2):
            if len(row) != len(header):
                if len(row) == len(header) - 1:
                    # Exactly one field missing: assume it is the known
                    # empty column and insert the placeholder there.
                    row.insert(insert_index, placeholder)
                    fixed += 1
                else:
                    print(f"Line {line_number} has an incorrect number of columns: {len(row)} instead of {len(header)}")
            writer.writerow(row)
    return fixed


if __name__ == "__main__":
    # Defaults preserve the original hard-coded behavior; both paths can
    # now be overridden on the command line for reuse with other years.
    src = sys.argv[1] if len(sys.argv) > 1 else "2021.csv"
    dst = sys.argv[2] if len(sys.argv) > 2 else "2021_with_na.csv"
    add_missing_column(src, dst)
    print(f"Processed {src} and saved to {dst}")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
def get_correct_column_count(file_path):
    """Return the number of columns in the header row of *file_path*.

    Parses the header with csv.reader so that quoted fields containing
    commas count as a single column (a plain str.split(',') would
    over-count them).
    """
    import csv
    with open(file_path, "r", newline='') as file:
        return len(next(csv.reader(file)))


if __name__ == "__main__":
    # Guarded so importing this helper from another script does not
    # try to read the hard-coded file.
    correct_columns = get_correct_column_count("2022.csv")
    print(f"Correct number of columns: {correct_columns}")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
def find_problematic_line(file_path, num_lines=5, expected_columns=34):
    """Report rows of *file_path* whose column count differs from
    *expected_columns*.

    Parses with csv.reader so quoted fields containing commas count as a
    single column.  Prints each offending row and returns the list of
    its 1-based line numbers.

    *num_lines* was unused in the original and is kept only for
    backward compatibility with existing callers.
    """
    import csv
    bad_lines = []
    with open(file_path, "r", newline='') as file:
        for line_number, columns in enumerate(csv.reader(file), start=1):
            if len(columns) != expected_columns:
                print(f"Problematic line {line_number}: {','.join(columns)}")
                bad_lines.append(line_number)
    return bad_lines


if __name__ == "__main__":
    # Guarded so importing this helper does not scan the hard-coded file.
    find_problematic_line("2021-fixed.csv")
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,7 @@ | |
daily cron-job. | ||
|
||
To process an older year's data, run the script with Python in the terminal with input year: | ||
ie.: `python3 cleanHfDataset.py 2022` | ||
e.g.: `python3 cleanOldHfDataset.py 2022` — make sure to change the year to your intended year | ||
''' | ||
|
||
import duckdb | ||
|
@@ -18,21 +18,30 @@ | |
from huggingface_hub import HfApi, login | ||
from dotenv import load_dotenv | ||
import sys | ||
import logging | ||
|
||
load_dotenv() | ||
|
||
# Configure logging | ||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | ||
|
||
def dlData(year):
    '''
    Download the {year}.csv dataset from Edwin's huggingface and save it
    as "{year}.csv" in the working directory.

    Streams the response in 1 MB chunks so large files never have to fit
    in memory, and skips empty keep-alive chunks that HTTP may send to
    hold the connection open.
    '''
    url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
    outfile = f"{year}.csv"
    chunk_size = 1024 * 1024  # 1 MB

    response = requests.get(url, stream=True)
    # Fail loudly on HTTP errors instead of saving an error page as CSV.
    response.raise_for_status()

    # Save downloaded file
    with open(outfile, "wb") as file:
        for chunk in tqdm(response.iter_content(chunk_size=chunk_size), desc="Downloading data"):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)

    logging.info(f"Downloaded {outfile} successfully.")
|
||
|
||
def hfClean(year): | ||
|
@@ -58,7 +67,7 @@ def hfClean(year): | |
|
||
# Open modified file and perform an import/export to duckdb to ensure timestamps are formatted correctly | ||
conn.execute( | ||
f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p');") | ||
f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p', parallel=false);") | ||
conn.execute( | ||
f"copy (select * from requests) to '{clean_filename}' with (FORMAT PARQUET);") | ||
|
||
|
@@ -104,17 +113,24 @@ def cleanUp(): | |
os.remove(file) | ||
|
||
|
||
def process_data(year, skip_download=False, skip_clean=False, stop_after_clean=False):
    """Run the full pipeline for *year*: download, clean, upload, clean up.

    The flags let a re-run resume after a partial failure:
      skip_download   -- reuse an already-downloaded {year}.csv
      skip_clean      -- reuse already-cleaned output
      stop_after_clean -- stop before uploading (inspect results locally)
    """
    if not skip_download:
        dlData(year)
    if not skip_clean:
        hfClean(year)
    if stop_after_clean:
        logging.info("Stopping after hfClean as requested.")
        return
    hfUpload(year)
    cleanUp()
|
||
|
||
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Use argv[0] so the usage line stays correct if the script is
        # renamed (it previously hard-coded "one_time_script.py", which
        # does not match this file's actual name).
        print(f"Usage: python {sys.argv[0]} <year> [--skip-download] [--skip-clean] [--stop-after-clean]")
        sys.exit(1)

    year = sys.argv[1]
    skip_download = '--skip-download' in sys.argv
    skip_clean = '--skip-clean' in sys.argv
    stop_after_clean = '--stop-after-clean' in sys.argv
    process_data(year, skip_download, skip_clean, stop_after_clean)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not exactly requesting a change here since these are clearly helper methods. But would be nice to generalize these for use in other scripts, or make them command-line friendly (specifying input files, etc)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
requested change implemented