1714-2 migrate 2020-2021 data #1747

Merged · 14 commits · Jun 8, 2024
13 changes: 9 additions & 4 deletions components/DateSelector/DateSelector.jsx
@@ -35,9 +35,11 @@ function DateSelector({
setExpanded(false);
}, []);

-const { option, selected, label, iconStyle, tooltipParagraph } = classes;
+const {
+  option, selected, label, iconStyle, tooltipParagraph,
+} = classes;

-const linkedinPageLink = <a href='https://www.linkedin.com/company/hack-for-la/'>LinkedIn Page</a>
+const linkedinPageLink = <a href="https://www.linkedin.com/company/hack-for-la/">LinkedIn Page</a>;

return (
<>
@@ -48,12 +50,15 @@
<p className={tooltipParagraph}>
<strong>
Currently, 311-Data loads only 311 service
-request data from 2022 onward.
+request data from 2020 onward.
</strong>
</p>
<p className={tooltipParagraph}>
For updates on the release of available 311
-Data, please follow our {linkedinPageLink}.
+Data, please follow our
+{' '}
+{linkedinPageLink}
+.
</p>
</div>
</ArrowToolTip>
2 changes: 1 addition & 1 deletion components/common/ReactDayPicker/ReactDayPicker.jsx
@@ -222,7 +222,7 @@ function ReactDayPicker({
onDayClick={handleDayClick}
onDayMouseEnter={handleDayMouseEnter}
weekdayElement={<WeekDay />}
-fromMonth={new Date(2021, 12)}
+fromMonth={new Date(2019, 12)}
/>
</>
);
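(Note on the fromMonth change: JavaScript Date months are zero-indexed, so new Date(2019, 12) rolls over to January 2020, the first month of the newly loaded data.)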
16 changes: 16 additions & 0 deletions components/db/DbProvider.jsx
@@ -16,6 +16,10 @@ const datasets = {
'https://huggingface.co/datasets/311-data/2023/resolve/main/2023.parquet', // 2023 entire year
hfYtd2022:
'https://huggingface.co/datasets/311-data/2022/resolve/main/2022.parquet', // 2022 entire year
+hfYtd2021:
+  'https://huggingface.co/datasets/311-data/2021/resolve/main/2021.parquet', // 2021 entire year
+hfYtd2020:
+  'https://huggingface.co/datasets/311-data/2020/resolve/main/2020.parquet', // 2020 entire year
},
};

@@ -72,6 +76,18 @@ function DbProvider({ children, startDate }) {
4,
);

+await newDb.registerFileURL(
+  'requests2021.parquet',
+  datasets.parquet.hfYtd2021,
+  4,
+);
+
+await newDb.registerFileURL(
+  'requests2020.parquet',
+  datasets.parquet.hfYtd2020,
+  4,
+);

// Create db connection
const newConn = await newDb.connect();

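(For context: in duckdb-wasm, the third argument to registerFileURL is the file protocol; the 4 here presumably corresponds to DuckDBDataProtocol.HTTP, so the 2020 and 2021 parquet files are streamed over HTTP like the existing years.)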
30 changes: 30 additions & 0 deletions scripts/csv_debug_tools/add_na_column.py
@@ -0,0 +1,30 @@
'''
This script inserts an 'N/A' value into the 'CreatedByUserOrganization' column (index 8).
The 2021 data was missing values in that entire column, which shifted every later
column forward by one position.
Keeping this for similar situations in the future.
'''

import csv

input_file = "2021.csv"
output_file = "2021_with_na.csv"

with open(input_file, "r", newline='', encoding='utf-8') as infile, open(output_file, "w", newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Read and copy the header
    header = next(reader)
    writer.writerow(header)

    for line_number, row in enumerate(reader, start=2):
        # Ensure the row has the correct length by inserting 'N/A' at index 8 if necessary
        if len(row) != len(header):
            if len(row) == len(header) - 1:
                row.insert(8, 'N/A')
            else:
                print(f"Line {line_number} has an incorrect number of columns: {len(row)} instead of {len(header)}")
        writer.writerow(row)

print(f"Processed {input_file} and saved to {output_file}")
7 changes: 7 additions & 0 deletions scripts/csv_debug_tools/correct_column_count.py
@@ -0,0 +1,7 @@
def get_correct_column_count(file_path):
Member: Not exactly requesting a change here since these are clearly helper methods, but it would be nice to generalize these for use in other scripts, or make them command-line friendly (specifying input files, etc.).

Member (Author): Requested change implemented.

    with open(file_path, "r") as file:
        header = file.readline().strip()
        return len(header.split(','))

correct_columns = get_correct_column_count("2022.csv")
print(f"Correct number of columns: {correct_columns}")
8 changes: 8 additions & 0 deletions scripts/csv_debug_tools/inspect_csv.py
@@ -0,0 +1,8 @@
def find_problematic_line(file_path, num_lines=5):
Member: Similar comment as above.

Member (Author): Requested change implemented.

    with open(file_path, "r") as file:
        for line_number, line in enumerate(file, start=1):
            columns = line.strip().split(',')
            if len(columns) != 34:
                print(f"Problematic line {line_number}: {line.strip()}")

find_problematic_line("2021-fixed.csv")
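The same hypothetical treatment for this helper: the input file and expected column count become arguments, and the unused num_lines parameter goes away (again a sketch, not the merged code):

import argparse
import csv

def find_problematic_lines(file_path, expected_columns):
    # Report every row whose column count differs from the expected width.
    with open(file_path, "r", newline='', encoding='utf-8') as file:
        for line_number, row in enumerate(csv.reader(file), start=1):
            if len(row) != expected_columns:
                print(f"Problematic line {line_number}: {row}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Flag CSV rows with an unexpected column count")
    parser.add_argument("file", help="path to the CSV file")
    parser.add_argument("--columns", type=int, default=34, help="expected number of columns")
    args = parser.parse_args()
    find_problematic_lines(args.file, args.columns)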
38 changes: 27 additions & 11 deletions scripts/cleanOldHfDataset.py → scripts/migrateOldHfDataset.py
@@ -7,7 +7,7 @@
daily cron-job.

To process an older year's data, run the script with Python in the terminal with input year:
-ie.: `python3 cleanHfDataset.py 2022`
+ie.: `python3 cleanOldHfDataset.py 2022`, make sure to change the year to your intended year
'''

import duckdb
@@ -18,21 +18,30 @@
from huggingface_hub import HfApi, login
from dotenv import load_dotenv
import sys
+import logging

load_dotenv()

+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def dlData(year):
    '''
    Download the dataset from Edwin's huggingface
    '''
    url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
    outfile = f"{year}.csv"
+    chunk_size = 1024 * 1024  # 1 MB

    response = requests.get(url, stream=True)

    # Save downloaded file
    with open(outfile, "wb") as file:
-        for data in tqdm(response.iter_content()):
-            file.write(data)
+        for chunk in tqdm(response.iter_content(chunk_size=chunk_size), desc="Downloading data"):
+            if chunk:  # filter out keep-alive new chunks
Member: Were you encountering keep-alive chunks while using this script? Just curious how this was affecting the resulting CSV.

Member (Author): This check handles HTTP's keep-alive mechanism, which may send empty chunks to keep the connection open, especially when downloading larger files. I didn't verify whether empty chunks were actually coming through before; checking that the current chunk is non-empty is a precaution so that only non-empty chunks are written.

+                file.write(chunk)

logging.info(f"Downloaded {outfile} successfully.")


def hfClean(year):
@@ -58,7 +67,7 @@ def hfClean(year):

    # Open modified file and perform an import/export to duckdb to ensure timestamps are formatted correctly
    conn.execute(
-        f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p');")
+        f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p', parallel=false);")
    conn.execute(
        f"copy (select * from requests) to '{clean_filename}' with (FORMAT PARQUET);")

@@ -104,17 +113,24 @@ def cleanUp():
        os.remove(file)


-def process_data(year):
-    dlData(year)
-    hfClean(year)
+def process_data(year, skip_download=False, skip_clean=False, stop_after_clean=False):
+    if not skip_download:
+        dlData(year)
+    if not skip_clean:
+        hfClean(year)
+    if stop_after_clean:
+        logging.info("Stopping after hfClean as requested.")
+        return
    hfUpload(year)
    cleanUp()


if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print("Usage: python one_time_script.py <year>")
+    if len(sys.argv) < 2:
+        print("Usage: python one_time_script.py <year> [--skip-download] [--skip-clean] [--stop-after-clean]")
        sys.exit(1)

    year = sys.argv[1]
-    process_data(year)
+    skip_download = '--skip-download' in sys.argv
+    skip_clean = '--skip-clean' in sys.argv
+    stop_after_clean = '--stop-after-clean' in sys.argv
+    process_data(year, skip_download, skip_clean, stop_after_clean)
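With the new flags, a rerun that already has the year's CSV on disk can skip the download, e.g. `python3 migrateOldHfDataset.py 2021 --skip-download` (invocation assumed from the argv parsing above, using the script's new name).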