feat: add schema lint script (#61)
NoopDog committed Nov 28, 2024
1 parent e5130eb commit d9f1c96
Showing 8 changed files with 519 additions and 19 deletions.
2 changes: 2 additions & 0 deletions .prettierignore
@@ -19,3 +19,5 @@ node_modules
# python
.venv/
venv
+
+catalog-schema/docs
84 changes: 84 additions & 0 deletions catalog-build/alignments-columns.py
@@ -0,0 +1,84 @@
import pandas as pd
import os


def process_alignments():
"""
Reads 'source/alignments.csv', processes columns including 'version', 'reference_coordinates',
and adds an 'alignment' column. Special cases:
- If 'pipeline' is 'minigraph-cactus', use 'mc' instead of the pipeline name in 'alignment'.
- Skip determining 'reference_coordinates' if the file name contains 'all'.
Writes the result to 'source/alignments2.csv'.
"""
# Define the directory and file paths
directory = "source"
input_file = os.path.join(directory, "alignments.csv")
output_file = os.path.join(directory, "alignments2.csv")

try:
# Read the alignments file into a DataFrame
df = pd.read_csv(input_file)

# Ensure the necessary columns exist
if "file" not in df.columns:
raise ValueError("The input file must have a 'file' column.")
if "pipeline" not in df.columns:
raise ValueError("The input file must have a 'pipeline' column.")
if "reference_coordinates" not in df.columns:
# If the 'reference_coordinates' column doesn't exist, add it
df["reference_coordinates"] = ""

# Define a function to determine the version
def determine_version(file_name):
if "v1.0" in file_name:
return "v1.0"
elif "v1.1" in file_name:
return "v1.1"
else:
return ""

# Define a function to extract reference coordinates from the file name
def extract_reference_coordinates(file_name):
if "all" in file_name.lower():
return "" # Skip if 'all' is in the file name
if "chm13" in file_name:
return "chm13"
elif "grch38" in file_name:
return "grch38"
else:
return ""

# Add the 'version' column
df["version"] = df["file"].apply(determine_version)

# Update 'reference_coordinates' where 'pipeline' is 'pggb' and 'all' is not in the file name
df.loc[df["pipeline"] == "pggb", "reference_coordinates"] = df.loc[
df["pipeline"] == "pggb", "file"
].apply(extract_reference_coordinates)

# Define a function to build the 'alignment' column
def build_alignment(row):
# Use 'mc' instead of 'minigraph-cactus' for the pipeline
pipeline_value = (
"mc" if row["pipeline"] == "minigraph-cactus" else row["pipeline"]
)
base = f"hprc-{row['version']}-{pipeline_value}"
if row["reference_coordinates"]:
return f"{base}-{row['reference_coordinates']}"
return base

# Add the 'alignment' column
df["alignment"] = df.apply(build_alignment, axis=1)

# Write the updated DataFrame to the output file
df.to_csv(output_file, index=False)
print(f"Processed file successfully written to '{output_file}'")
except FileNotFoundError:
print(f"Error: The input file '{input_file}' was not found.")
except Exception as e:
print(f"An error occurred: {e}")


# Run the function
if __name__ == "__main__":
process_alignments()
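
For illustration, the naming rule above combines version, pipeline, and reference coordinates into one identifier. A minimal sketch of how the script should behave on invented input (the file names below are hypothetical, not from the catalog):

import pandas as pd

# Hypothetical rows; running process_alignments() on this input should yield
# alignment values "hprc-v1.0-pggb-chm13" (pggb keeps its name, chm13 appended)
# and "hprc-v1.1-mc" ("mc" substituted; coordinates are only extracted for pggb rows).
rows = pd.DataFrame(
    {
        "file": ["hprc-v1.0-pggb.chm13.vcf.gz", "hprc-v1.1-mc.all.vcf.gz"],
        "pipeline": ["pggb", "minigraph-cactus"],
    }
)
rows.to_csv("source/alignments.csv", index=False)  # assumes a local source/ directory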
155 changes: 142 additions & 13 deletions catalog-build/build-sequencing-data.py
@@ -9,6 +9,7 @@
# Define paths relative to the script's directory
STORAGE_FOLDER_PATH = os.path.join(BASE_DIR, "unprocessed_files/")
OUTPUT_PATH = os.path.join(BASE_DIR, "source/sequencing-data.csv")
+FIELDS_CSV_PATH = os.path.join(BASE_DIR, "source/fields.csv")

HIC_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/hprc-data-explorer-tables/HPRC_HiC.tsv"
ONT_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/hprc-data-explorer-tables/HPRC_ONT.tsv"
@@ -19,38 +20,166 @@
METADATA_URLS = [HIC_URL, ONT_URL, PACBIO_HIFI_URL]
BIOSAMPLES_TABLE_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/production/hprc-production-biosample-table.tsv"


def downloadSourceFiles(urls, outputFolderPath):
paths = []
for url in urls:
# Get the filename and the path where the output will be saved
paths.append(downloadFile(url, outputFolderPath))
return paths


def joinSamples(metadataPaths, biosamplesTablePath):
# Generate each column across all provided sheets
-    metadataList = [pd.read_csv(path, sep="\t", keep_default_na=False).drop_duplicates() for path in metadataPaths]
+    metadataList = [
+        pd.read_csv(path, sep="\t", keep_default_na=False).drop_duplicates()
+        for path in metadataPaths
+    ]
metadataColumns = np.unique([col for df in metadataList for col in df.columns])
# Concatenate all the provided metadata sheets
-    allMetadata = pd.concat(
-        metadataList,
-        axis=0,
-        ignore_index=True
-    ).reindex(columns=metadataColumns).fillna("N/A")
+    allMetadata = (
+        pd.concat(metadataList, axis=0, ignore_index=True)
+        .reindex(columns=metadataColumns)
+        .fillna("N/A")
+    )
# Join the concatenated sheets with the table
biosamplesTable = pd.read_csv(biosamplesTablePath, sep="\t")
joined = allMetadata.merge(
biosamplesTable,
-        left_on='sample_ID',
-        right_on='Sample',
-        how='left',
-        validate="many_to_one"
+        left_on="sample_ID",
+        right_on="Sample",
+        how="left",
+        validate="many_to_one",
)
print("The following biosamples did not have corresponding metadata:")
-    print(", ".join(allMetadata[~allMetadata["sample_ID"].isin(biosamplesTable["Sample"])]["sample_ID"].unique()))
+    print(
+        ", ".join(
+            allMetadata[~allMetadata["sample_ID"].isin(biosamplesTable["Sample"])][
+                "sample_ID"
+            ].unique()
+        )
+    )
return joined


if __name__ == "__main__":
metadataFiles = downloadSourceFiles(METADATA_URLS, STORAGE_FOLDER_PATH)
-    biosamplesTableFile = downloadSourceFiles([BIOSAMPLES_TABLE_URL], STORAGE_FOLDER_PATH)[0]
+    biosamplesTableFile = downloadSourceFiles(
+        [BIOSAMPLES_TABLE_URL], STORAGE_FOLDER_PATH
+    )[0]
joined = joinSamples(metadataFiles, biosamplesTableFile)
    joined.to_csv(OUTPUT_PATH, index=False)

# Read columns from each file
hic_columns = set(pd.read_csv(HIC_URL, sep="\t", keep_default_na=False).columns)
ont_columns = set(pd.read_csv(ONT_URL, sep="\t", keep_default_na=False).columns)
pacbio_hifi_columns = set(
pd.read_csv(PACBIO_HIFI_URL, sep="\t", keep_default_na=False).columns
)

# Calculate common columns across all input files
common_columns = hic_columns & ont_columns & pacbio_hifi_columns

# Calculate unique columns for each file
unique_hic_columns = hic_columns - ont_columns - pacbio_hifi_columns
unique_ont_columns = ont_columns - hic_columns - pacbio_hifi_columns
unique_pacbio_hifi_columns = pacbio_hifi_columns - hic_columns - ont_columns

# Calculate columns that are in two but not all three files
in_two_files_columns = (
(hic_columns & ont_columns)
| (hic_columns & pacbio_hifi_columns)
| (ont_columns & pacbio_hifi_columns)
)
in_two_files_columns -= common_columns

print("")
print("")
print("**************************************")
print("**************************************")
print("**************************************")
print("")
print("")

# Read columns from each file
hic_columns = set(pd.read_csv(HIC_URL, sep="\t", keep_default_na=False).columns)
ont_columns = set(pd.read_csv(ONT_URL, sep="\t", keep_default_na=False).columns)
pacbio_hifi_columns = set(
pd.read_csv(PACBIO_HIFI_URL, sep="\t", keep_default_na=False).columns
)

# Calculate all unique columns across all files
all_columns = sorted(hic_columns | ont_columns | pacbio_hifi_columns)

# Create a DataFrame to store the presence of each column in each file
df = pd.DataFrame(
index=all_columns,
columns=["HPRC_HiC.tsv", "HPRC_ONT.tsv", "HPRC_PacBio_HiFi.tsv"],
)
df.index.name = "Field"

# Mark presence of columns in each file
for column in all_columns:
if column in hic_columns:
df.at[column, "HPRC_HiC.tsv"] = "x"
if column in ont_columns:
df.at[column, "HPRC_ONT.tsv"] = "x"
if column in pacbio_hifi_columns:
df.at[column, "HPRC_PacBio_HiFi.tsv"] = "x"

# Fill NaN values with empty strings
df = df.fillna("")

# Add a column to count the number of files that have each field
df["Count"] = df.apply(lambda row: row.str.count("x").sum(), axis=1)

# Sort the DataFrame by the count (descending) and then by the field name (alphabetically)
df = df.sort_values(by=["Count", "Field"], ascending=[False, True])

# Drop the count column
df = df.drop(columns=["Count"])

# Save the DataFrame to a CSV file
df.to_csv(FIELDS_CSV_PATH)

# Print out common columns across all input files
common_columns = hic_columns & ont_columns & pacbio_hifi_columns
print("Common columns across all input files:")
for column in sorted(common_columns):
print(column)

# Print out columns specific to the HPRC_HiC.tsv file
unique_hic_columns = hic_columns - ont_columns - pacbio_hifi_columns
print("\nColumns specific to HPRC_HiC.tsv:")
for column in sorted(unique_hic_columns):
print(column)

# Print out columns specific to the HPRC_PacBio_HiFi.tsv file
unique_pacbio_hifi_columns = pacbio_hifi_columns - hic_columns - ont_columns
print("\nColumns specific to HPRC_PacBio_HiFi.tsv:")
for column in sorted(unique_pacbio_hifi_columns):
print(column)

# Print out columns specific to the HPRC_ONT.tsv file
unique_ont_columns = ont_columns - hic_columns - pacbio_hifi_columns
print("\nColumns specific to HPRC_ONT.tsv:")
for column in sorted(unique_ont_columns):
print(column)

# Print out columns that are in two but not all three files
in_two_files_columns = (
(hic_columns & ont_columns)
| (hic_columns & pacbio_hifi_columns)
| (ont_columns & pacbio_hifi_columns)
)
in_two_files_columns -= common_columns
print("\nColumns in two but not all three files:")
for column in sorted(in_two_files_columns):
files_containing_column = []
if column in hic_columns:
files_containing_column.append("HPRC_HiC.tsv")
if column in ont_columns:
files_containing_column.append("HPRC_ONT.tsv")
if column in pacbio_hifi_columns:
files_containing_column.append("HPRC_PacBio_HiFi.tsv")
print(f"{column}: {', '.join(files_containing_column)}")
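
The presence-matrix logic above can be illustrated on toy data. A minimal sketch, with invented column sets standing in for the three TSVs:

import pandas as pd

# Invented column sets; the real ones come from the HPRC TSVs above.
hic = {"filename", "coverage", "assembly"}
ont = {"filename", "coverage", "whales"}
hifi = {"filename", "total_reads"}

all_columns = sorted(hic | ont | hifi)
df = pd.DataFrame("", index=all_columns, columns=["HiC", "ONT", "HiFi"])
df.index.name = "Field"
for name, cols in [("HiC", hic), ("ONT", ont), ("HiFi", hifi)]:
    df.loc[df.index.isin(cols), name] = "x"

# Sort by how many files share each field, then alphabetically, as the script does.
df["Count"] = (df == "x").sum(axis=1)
df = df.sort_values(by=["Count", "Field"], ascending=[False, True]).drop(columns="Count")
print(df)  # "filename" first (in all three), then "coverage", then the singletons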
58 changes: 58 additions & 0 deletions catalog-build/source/fields.csv
@@ -0,0 +1,58 @@
Field,HPRC_HiC.tsv,HPRC_ONT.tsv,HPRC_PacBio_HiFi.tsv
data_type,x,x,x
design_description,x,x,x
filename,x,x,x
generator_contact,x,x,x
generator_facility,x,x,x
instrument_model,x,x,x
library_ID,x,x,x
library_layout,x,x,x
library_selection,x,x,x
library_source,x,x,x
library_strategy,x,x,x
path,x,x,x
platform,x,x,x
shear_method,x,x,x
total_Gbp,x,x,x
N50,,x,x
accession,x,,x
biosample_accession,x,,x
coverage,x,x,
filetype,x,x,
notes,,x,x
ntsm_score,x,x,
sample_ID,,x,x
size_selection,,x,x
study,x,,x
total_bp,x,,x
100kb+,,x,
1Mb+,,x,
200kb+,,x,
300kb+,,x,
400kb+,,x,
500kb+,,x,
DeepConsensus_version,,,x
MM_tag,,,x
N25,,,x
N75,,,x
assembly,x,,
basecaller,,x,
basecaller_model,,x,
basecaller_version,,x,
bioproject_accession,x,,
ccs_algorithm,,,x
max,,,x
mean,,,x
min,,,x
ntsm_result,x,,
polymerase_version,,,x
quartile_25,,,x
quartile_50,,,x
quartile_75,,,x
result,,x,
sample_id,x,,
seq_kit,,x,
seq_plate_chemistry_version,,,x
title,x,,
total_reads,,,x
whales,,x,
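
As a usage sketch, the generated matrix can be read back to list the fields shared by all three TSVs (the path assumes the repository layout above):

import pandas as pd

fields = pd.read_csv("catalog-build/source/fields.csv", index_col="Field")
shared = fields.notna().all(axis=1)  # blank cells load as NaN, "x" cells as strings
print(fields.index[shared].tolist())
# ['data_type', 'design_description', 'filename', ..., 'total_Gbp']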
1 change: 1 addition & 0 deletions catalog-schema/.gitignore
@@ -0,0 +1 @@
docs/*