Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patch that fixes several bugs in metadata validation and submission #244

Merged
merged 8 commits into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified assets/sample_metadata/bacteria_test_metadata.xlsx
Binary file not shown.
15 changes: 9 additions & 6 deletions bin/submission_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ def submission_main():
report_fetched = False # Flag to indicate if a report has been fetched

while time.time() - start_time < timeout:
if sample.ftp_upload:
# if user is submitting to genbank via ftp and provided the necessary files
if parameters['genbank'] and 'genbank' not in databases_to_skip and sample.ftp_upload:
submission_objects = {'biosample': biosample_submission, 'sra': sra_submission, 'genbank': genbank_submission}
else:
submission_objects = {'biosample': biosample_submission, 'sra': sra_submission}
Expand Down Expand Up @@ -605,11 +606,13 @@ def add_action_block(self, submission):
spuid = ET.SubElement(sample_id, 'SPUID', {'spuid_namespace': f"{spuid_namespace_value}_BS"})
spuid.text = self.safe_text(self.top_metadata['ncbi-spuid'])
descriptor = ET.SubElement(biosample, 'Descriptor')
title = ET.SubElement(descriptor, 'Title')
title.text = self.safe_text(self.top_metadata['title'])
# BioSample XSD will not accept a description here, even though the example submission.xml has one
#description = ET.SubElement(descriptor, 'Description')
#description.text = self.safe_text(self.top_metadata['description'])
if 'title' in self.top_metadata and self.top_metadata['title']:
title = ET.SubElement(descriptor, 'Title')
title.text = self.safe_text(self.top_metadata['title'])
# BioSample XSD will not accept a description here, although the example submission.xml has one ("white space not allowed, attribute is element-only")
#if 'description' in self.top_metadata and self.top_metadata['description']:
# description = ET.SubElement(descriptor, 'Description')
# description.text = self.safe_text(self.top_metadata['description'])
organism = ET.SubElement(biosample, 'Organism')
organismName = ET.SubElement(organism, 'OrganismName')
organismName.text = self.safe_text(self.biosample_metadata['organism'])
Expand Down
24 changes: 17 additions & 7 deletions bin/validate_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,13 +213,11 @@ def populate_fields(self):
)
# Validate populated fields
try:
# Ensure no null values in specified columns
assert not any(final_df[field].isnull().any() for field in existing_terms)
# Ensure all values are either empty or "Not Provided"
# Ensure all values, if absent, are either empty or "Not Provided"
for field in existing_terms:
final_df[field] = final_df[field].apply(
lambda x: "Not Provided" if str(x).strip().lower() == "not provided" else x)
assert all(value == "" or value == "Not Provided" for value in final_df[field].values)
lambda x: "Not Provided" if str(x).strip().lower() == "not provided" else x)
assert all(isinstance(value, str) for value in final_df[field].values)
except AssertionError:
raise AssertionError(
f'Populating certain fields in the metadata df with "" or "Not Provided" was unsuccessful'
Expand Down Expand Up @@ -271,9 +269,21 @@ def __init__(self, filled_df, parameters):
# get the main utility class
self.main_util = main_util()

# normalize authors column
self.normalize_author_columns()

def normalize_author_columns(self):
    """Normalize the author column of ``self.metadata_df`` to 'authors'.

    Three cases are handled, all mutating ``self.metadata_df`` in place:

    * only 'author' exists  -> it is renamed to 'authors';
    * both columns exist    -> they are merged into 'authors' (which takes
      priority) and 'author' is dropped;
    * 'author' is absent    -> nothing is changed.

    When merging, an 'authors' cell is replaced by the 'author' cell if it
    is NaN, or if it is a blank / whitespace-only string while 'author'
    holds a non-blank value. Treating blanks as missing keeps a value that
    was only entered under 'author' from being discarded with the column.

    Returns:
        None
    """
    columns = self.metadata_df.columns

    # Nothing to normalize when there is no singular 'author' column
    if 'author' not in columns:
        return

    # Only the singular spelling is present: a plain rename is enough
    if 'authors' not in columns:
        self.metadata_df.rename(columns={'author': 'authors'}, inplace=True)
        return

    def _is_blank(series):
        # NaN/None, or a string that is empty once whitespace is stripped
        return series.isna() | series.astype(str).str.strip().eq('')

    primary = self.metadata_df['authors']
    fallback = self.metadata_df['author']

    # NaN cells are always filled from 'author' (same as fillna); blank
    # strings are only replaced when 'author' actually has something to offer
    take_fallback = primary.isna() | (_is_blank(primary) & ~_is_blank(fallback))

    # Merge both columns, prioritizing 'authors', then drop the duplicate
    self.metadata_df['authors'] = primary.where(~take_fallback, fallback)
    self.metadata_df.drop(columns=['author'], inplace=True)

def validate_main(self):
""" Main validation function for the metadata
"""
""" Main validation function for the metadata """
# check if user would like to validate custom fields
metadata_samp_names = self.metadata_df['sample_name'].tolist()

Expand Down
2 changes: 1 addition & 1 deletion modules/local/update_submission/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ process UPDATE_SUBMISSION {
--metadata_file $validated_meta_path \
--species $params.species \
--output_dir . \
--fasta_file $fasta_path \
${fasta_path ? "--fasta_file $fasta_path" : ""} \
${annotations_path ? "--annotation_file $annotations_path" : ""} \
${fastq_1 ? "--fastq1 $fastq_1" : ""} \
${fastq_2 ? "--fastq2 $fastq_2" : ""} \
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ params {
vadr_output_dir = "vadr_outputs"
final_liftoff_output_dir = "liftoff_outputs"
val_output_dir = "validation_outputs"
vadr_models_dir = " "
vadr_models_dir = "${projectDir}/vadr_files/rsv-models"

// environment params
env_yml = "${projectDir}/environment.yml"
Expand Down
2 changes: 1 addition & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"description": "Map only if its child features align with sequence identity > this value",
"default": 0.5
},
"lift_unmapped_feature_file_name": {
"lift_unmapped_features_file_name": {
"type": "string",
"description": "Name of unmapped features file name",
"default": "output.unmapped_features.txt"
Expand Down
Loading