-
-
Notifications
You must be signed in to change notification settings - Fork 63
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
1714-2 migrate 2020-2021 data #1747
Changes from 11 commits
2953b79
c27efe3
31a3e8a
1f7e125
f6f9393
2313d78
cf5f027
cc08dac
cf0e15d
8189ca2
8437812
2ffcb5e
c3a8d7d
ae0b84d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
'''
Add a placeholder 'N/A' value to the 'CreatedByUserOrganization' column
(column index 8) for rows that are exactly one field short.

The 2021 data was missing values in that entire column, which shifted all
columns after it forward.  Saving this for future similar situations.
'''

import csv
import sys


def add_missing_column(input_file, output_file, insert_index=8, placeholder='N/A'):
    """Copy *input_file* to *output_file*, repairing short rows.

    A row that is exactly one field shorter than the header gets
    *placeholder* inserted at *insert_index*; any other length mismatch
    is reported and the row is written through unchanged.

    Returns the number of rows that were repaired.
    """
    fixed = 0
    with open(input_file, "r", newline='', encoding='utf-8') as infile, \
            open(output_file, "w", newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Read and pass through the header; its length defines "correct".
        header = next(reader)
        writer.writerow(header)

        # start=2: line 1 is the header, so data begins on file line 2.
        for line_number, row in enumerate(reader, start=2):
            if len(row) != len(header):
                if len(row) == len(header) - 1:
                    # Exactly one field missing: assume it is the known
                    # empty column and insert the placeholder there.
                    row.insert(insert_index, placeholder)
                    fixed += 1
                else:
                    print(f"Line {line_number} has an incorrect number of columns: {len(row)} instead of {len(header)}")
            writer.writerow(row)
    return fixed


if __name__ == "__main__":
    # Defaults preserve the original hard-coded behavior; both paths can
    # now be overridden on the command line for reuse with other years.
    src = sys.argv[1] if len(sys.argv) > 1 else "2021.csv"
    dst = sys.argv[2] if len(sys.argv) > 2 else "2021_with_na.csv"
    add_missing_column(src, dst)
    print(f"Processed {src} and saved to {dst}")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
def get_correct_column_count(file_path):
    """Return the number of columns in the header row of *file_path*.

    Parses the header with csv.reader so that quoted fields containing
    commas count as a single column (a plain str.split(',') would
    over-count them).
    """
    import csv
    with open(file_path, "r", newline='') as file:
        return len(next(csv.reader(file)))


if __name__ == "__main__":
    # Guarded so importing this helper from another script does not
    # try to read the hard-coded file.
    correct_columns = get_correct_column_count("2022.csv")
    print(f"Correct number of columns: {correct_columns}")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
def find_problematic_line(file_path, num_lines=5, expected_columns=34):
    """Report rows of *file_path* whose column count differs from
    *expected_columns*.

    Parses with csv.reader so quoted fields containing commas count as a
    single column.  Prints each offending row and returns the list of
    its 1-based line numbers.

    *num_lines* was unused in the original and is kept only for
    backward compatibility with existing callers.
    """
    import csv
    bad_lines = []
    with open(file_path, "r", newline='') as file:
        for line_number, columns in enumerate(csv.reader(file), start=1):
            if len(columns) != expected_columns:
                print(f"Problematic line {line_number}: {','.join(columns)}")
                bad_lines.append(line_number)
    return bad_lines


if __name__ == "__main__":
    # Guarded so importing this helper does not scan the hard-coded file.
    find_problematic_line("2021-fixed.csv")
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,7 @@ | |
daily cron-job. | ||
|
||
To process an older year's data, run the script with Python in the terminal with input year: | ||
ie.: `python3 cleanHfDataset.py 2022` | ||
e.g.: `python3 cleanOldHfDataset.py 2022` — make sure to change the year to your intended year | ||
''' | ||
|
||
import duckdb | ||
|
@@ -18,21 +18,30 @@ | |
from huggingface_hub import HfApi, login | ||
from dotenv import load_dotenv | ||
import sys | ||
import logging | ||
|
||
load_dotenv() | ||
|
||
# Configure logging | ||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | ||
|
||
def dlData(year):
    '''
    Download the {year}.csv dataset from Edwin's huggingface and save it
    as "{year}.csv" in the working directory.

    Streams the response in 1 MB chunks so large files never have to fit
    in memory, and skips empty keep-alive chunks that HTTP may send to
    hold the connection open.
    '''
    url = f"https://huggingface.co/datasets/edwinjue/311-data-{year}/resolve/main/{year}.csv"
    outfile = f"{year}.csv"
    chunk_size = 1024 * 1024  # 1 MB

    response = requests.get(url, stream=True)
    # Fail loudly on HTTP errors instead of saving an error page as CSV.
    response.raise_for_status()

    # Save downloaded file
    with open(outfile, "wb") as file:
        for chunk in tqdm(response.iter_content(chunk_size=chunk_size), desc="Downloading data"):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)

    logging.info(f"Downloaded {outfile} successfully.")
|
||
|
||
def hfClean(year): | ||
|
@@ -58,7 +67,7 @@ def hfClean(year): | |
|
||
# Open modified file and perform an import/export to duckdb to ensure timestamps are formatted correctly | ||
conn.execute( | ||
f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p');") | ||
f"create table requests as select * from read_csv_auto('{fixed_filename}', header=True, timestampformat='%m/%d/%Y %H:%M:%S %p', parallel=false);") | ||
conn.execute( | ||
f"copy (select * from requests) to '{clean_filename}' with (FORMAT PARQUET);") | ||
|
||
|
@@ -104,17 +113,24 @@ def cleanUp(): | |
os.remove(file) | ||
|
||
|
||
def process_data(year, skip_download=False, skip_clean=False, stop_after_clean=False):
    """Run the full pipeline for *year*: download, clean, upload, clean up.

    The flags let a re-run resume after a partial failure:
      skip_download   -- reuse an already-downloaded {year}.csv
      skip_clean      -- reuse already-cleaned output
      stop_after_clean -- stop before uploading (inspect results locally)
    """
    if not skip_download:
        dlData(year)
    if not skip_clean:
        hfClean(year)
    if stop_after_clean:
        logging.info("Stopping after hfClean as requested.")
        return
    hfUpload(year)
    cleanUp()
|
||
|
||
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Use argv[0] so the usage line stays correct if the script is
        # renamed (it previously hard-coded "one_time_script.py", which
        # does not match this file's actual name).
        print(f"Usage: python {sys.argv[0]} <year> [--skip-download] [--skip-clean] [--stop-after-clean]")
        sys.exit(1)

    year = sys.argv[1]
    skip_download = '--skip-download' in sys.argv
    skip_clean = '--skip-clean' in sys.argv
    stop_after_clean = '--stop-after-clean' in sys.argv
    process_data(year, skip_download, skip_clean, stop_after_clean)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not exactly requesting a change here since these are clearly helper methods. But would be nice to generalize these for use in other scripts, or make them command-line friendly (specifying input files, etc)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
requested change implemented