Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge-dev-to-main-66.0.0 #1319

Merged
merged 10 commits into from
Feb 12, 2025
2 changes: 1 addition & 1 deletion cdk/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ Log in at [SSO login portal](https://igvf-dacc.awsapps.com/start/#), choose `Acc
Open a terminal window and run the AWS SSO configuration command:

```bash
$ aws sso configure
$ aws configure sso
```

Choose the `igvf-dev` account, enter the `SSO start URL` and `SSO region`, and choose the `PowerUserAccess` role. The `CLI Profile name` will default to `PowerUserAccess-xyz`; you might want to enter something easier to remember, such as `igvf-dev-sso`.
Expand Down
2 changes: 1 addition & 1 deletion src/igvfd/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '65.0.0'
__version__ = '66.0.0'


import igvfd.schema_formats # needed to import before snovault to add FormatCheckers
Expand Down
2 changes: 1 addition & 1 deletion src/igvfd/audit/analysis_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def audit_analysis_set_multiplexed_samples(value, system):
{
"audit_description": "Analysis sets with multiplexed samples are expected to specify a demultiplexed sample.",
"audit_category": "missing demultiplexed sample",
"audit_level": "WARNING"
"audit_level": "NOT_COMPLIANT"
},
{
"audit_description": "Analysis sets are only expected to specify a demultiplexed sample if it has samples and they are all multiplexed.",
Expand Down
4 changes: 2 additions & 2 deletions src/igvfd/audit/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def audit_external_identifiers(value, system):
{
"audit_description": "Externally hosted files are expected to have identifiers from external resources in dbxrefs.",
"audit_category": "missing dbxrefs",
"audit_level": "WARNING"
"audit_level": "NOT_COMPLIANT"
}
]
'''
Expand All @@ -102,7 +102,7 @@ def audit_external_reference_files(value, system):
{
"audit_description": "Reference files uploaded from external resources are expected to have external identifiers in dbxrefs.",
"audit_category": "missing dbxrefs",
"audit_level": "WARNING"
"audit_level": "NOT_COMPLIANT"
}
]
'''
Expand Down
92 changes: 77 additions & 15 deletions src/igvfd/audit/file_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,25 +32,36 @@ def single_cell_check(system, value, object_type):
assay_term = value.get('assay_term')
return assay_term in single_cell_assay_terms
elif object_type == 'Auxiliary set':
assay_terms = []
measurement_sets = value.get('measurement_sets')
for measurement_set in measurement_sets:
measurement_set_obj = system.get('request').embed(measurement_set, '@@object?skip_calculated=true')
assay_term = measurement_set_obj.get('assay_term')
assay_terms.append(assay_term)
return any(assay in single_cell_assay_terms for assay in assay_terms)
if assay_term in single_cell_assay_terms:
return True
return False
elif object_type == 'Construct library set':
assay_terms = []
samples = value.get('applied_to_samples')
for sample in samples:
sample_obj = system.get('request').embed(sample)
sample_obj = system.get('request').embed(
sample, '@@object_with_select_calculated_properties?field=file_sets')
file_sets = sample_obj.get('file_sets')
for file_set in file_sets:
if file_set.startswith('/measurement-sets/'):
measurement_set_obj = system.get('request').embed(measurement_set, '@@object?skip_calculated=true')
assay_term = measurement_set_obj.get('assay_term')
assay_terms.append(assay_term)
return any(assay in single_cell_assay_terms for assay in assay_terms)
if assay_term in single_cell_assay_terms:
return True
return False
elif object_type == 'Analysis set':
for input_file_set in value.get('input_file_sets', []):
if input_file_set.startswith('/measurement-sets/'):
measurement_set_obj = system.get('request').embed(input_file_set, '@@object?skip_calculated=true')
assay_term = measurement_set_obj.get('assay_term')
if assay_term in single_cell_assay_terms:
return True
return False
else:
return False


@audit_checker('FileSet', frame='object')
Expand All @@ -60,13 +71,13 @@ def audit_no_files(value, system):
{
"audit_description": "File sets are expected to have files.",
"audit_category": "missing files",
"audit_level": "WARNING"
"audit_level": "NOT_COMPLIANT"
}
]
'''
object_type = space_in_words(value['@type'][0]).capitalize()
audit_message_missing_files = get_audit_message(audit_no_files, index=0)
if not (value.get('files', '')):
if not (value.get('files', '')) and object_type != 'Construct library set':
detail = (
f'{object_type} {audit_link(path_to_text(value["@id"]), value["@id"])} '
f'has no `files`.'
Expand All @@ -79,14 +90,18 @@ def audit_missing_seqspec(value, system):
'''
[
{
"audit_description": "Sequence files in a file set are expected to link to a sequence specification file.",
"audit_description": "Sequence files in a file set associated with bulk data are expected to link to a sequence specification file.",
"audit_category": "missing sequence specification",
"audit_level": "INTERNAL_ACTION"
},
{
"audit_description": "Sequence files in a file set associated with single cell data are expected to link to a sequence specification file.",
"audit_category": "missing sequence specification",
"audit_level": "NOT_COMPLIANT"
}
]
'''
object_type = space_in_words(value['@type'][0]).capitalize()
audit_message = get_audit_message(audit_missing_seqspec)
if 'files' in value:
no_seqspec = []
for file in value['files']:
Expand All @@ -97,6 +112,10 @@ def audit_missing_seqspec(value, system):
if no_seqspec:
no_seqspec = ', '.join([audit_link(path_to_text(file_no_seqspec), file_no_seqspec)
for file_no_seqspec in no_seqspec])
if single_cell_check(system, value, object_type):
audit_message = get_audit_message(audit_missing_seqspec, index=1)
else:
audit_message = get_audit_message(audit_missing_seqspec, index=0)
detail = (
f'{object_type} {audit_link(path_to_text(value["@id"]), value["@id"])} has sequence file(s): '
f'{no_seqspec} which do not have any `seqspecs`.'
Expand Down Expand Up @@ -287,15 +306,19 @@ def audit_inconsistent_sequencing_kit(value, system):
"audit_level": "ERROR"
},
{
"audit_description": "Sequence files should specify a sequencing kit.",
"audit_description": "Sequence files in a file set associated with bulk data should specify a sequencing kit.",
"audit_category": "missing sequencing kit",
"audit_level": "INTERNAL_ACTION"
},
{
"audit_description": "Sequence files in a file set associated with single cell data should specify a sequencing kit.",
"audit_category": "missing sequencing kit",
"audit_level": "NOT_COMPLIANT"
}
]
'''
object_type = space_in_words(value['@type'][0]).capitalize()
audit_message_inconsistent_kit = get_audit_message(audit_inconsistent_sequencing_kit, index=0)
audit_message_missing_kit = get_audit_message(audit_inconsistent_sequencing_kit, index=1)
if 'files' in value:
file_info = {}
for file in value['files']:
Expand Down Expand Up @@ -327,6 +350,10 @@ def audit_inconsistent_sequencing_kit(value, system):
yield AuditFailure(audit_message_inconsistent_kit.get('audit_category', ''), f'{detail} {audit_message_inconsistent_kit.get("audit_description", "")}', level=audit_message_inconsistent_kit.get('audit_level', ''))

if missing_kit:
if single_cell_check(system, value, object_type):
audit_message_missing_kit = get_audit_message(audit_inconsistent_sequencing_kit, index=2)
else:
audit_message_missing_kit = get_audit_message(audit_inconsistent_sequencing_kit, index=1)
detail = (
f'{object_type} {audit_link(path_to_text(value["@id"]), value["@id"])} has sequence '
f'file(s) {", ".join([audit_link(path_to_text(f), f) for f in missing_kit])} '
Expand Down Expand Up @@ -423,7 +450,7 @@ def audit_input_for(value, system):
{
"audit_description": "Raw data sets with files are expected to be associated with at least one analysis set.",
"audit_category": "missing analysis",
"audit_level": "NOT_COMPLIANT"
"audit_level": "WARNING"
}
]
'''
Expand Down Expand Up @@ -484,7 +511,7 @@ def audit_MPRA_read_names(value, system):
{
"audit_description": "MPRA measurement set and auxiliary set sequence files are expected to specify a read name.",
"audit_category": "missing read names",
"audit_level": "WARNING"
"audit_level": "NOT_COMPLIANT"
},
{
"audit_description": "MPRA measurement set and auxiliary set sequence files are only expected to specify read names: Barcode forward, UMI, or Barcode reverse.",
Expand Down Expand Up @@ -588,3 +615,38 @@ def audit_single_cell_read_names(value, system):
f'pipeline.'
)
yield AuditFailure(audit_message_unexpected_read_names.get('audit_category', ''), f'{detail} {audit_message_unexpected_read_names.get("audit_description", "")}', level=audit_message_unexpected_read_names.get('audit_level', ''))


@audit_checker('FileSet', frame='object')
def audit_control_for_control_type(value, system):
    '''
    [
        {
            "audit_description": "File sets that are controls for other file sets are expected to define a control type.",
            "audit_category": "missing control type",
            "audit_level": "NOT_COMPLIANT"
        },
        {
            "audit_description": "File sets that specify a control type are expected to be a control for other file sets.",
            "audit_category": "missing control for",
            "audit_level": "NOT_COMPLIANT"
        }
    ]
    '''
    # NOTE(review): the docstring above is handed to get_audit_message via the
    # function object and looks like machine-read JSON -- keep it valid JSON
    # and put human-facing documentation in comments instead (confirm).
    #
    # Flags a file set on which exactly one of `control_for` / `control_type`
    # is populated; yields nothing when both or neither are present.
    type_label = space_in_words(value['@type'][0]).capitalize()
    missing_type_message = get_audit_message(audit_control_for_control_type, index=0)
    missing_for_message = get_audit_message(audit_control_for_control_type, index=1)

    is_control = bool(value.get('control_for', ''))
    has_control_type = bool(value.get('control_type', ''))
    if is_control == has_control_type:
        # Both set or both empty: the two properties agree, nothing to report.
        return

    item_link = audit_link(path_to_text(value["@id"]), value["@id"])
    if is_control:
        # Serves as a control for other file sets but declares no type.
        message = missing_type_message
        detail = f'{type_label} {item_link} has no `control_type`.'
    else:
        # Declares a control type but nothing lists it as a control.
        message = missing_for_message
        detail = (
            f'{type_label} {item_link} '
            f'has no `control_for`. The `control_file_sets` should be patched on '
            f'the file sets it serves as a control for and this property, `control_for`, '
            f'will be reverse calculated.'
        )
    yield AuditFailure(
        message.get('audit_category', ''),
        f'{detail} {message.get("audit_description", "")}',
        level=message.get('audit_level', ''),
    )
2 changes: 1 addition & 1 deletion src/igvfd/audit/in_vitro_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def audit_cell_fate_change_treatments_purpose(value, system):
{
"audit_description": "Cell fate change treatments are expected to have a purpose associated with cell fate change.",
"audit_category": "inconsistent treatment purpose",
"audit_level": "WARNING"
"audit_level": "ERROR"
}
]
'''
Expand Down
2 changes: 1 addition & 1 deletion src/igvfd/audit/matrix_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def audit_matrix_file_dimensions(value, system):
{
"audit_description": "Matrix files, with the exception of .hic, .cool, and .mcool files, are expected to have different values for each dimension.",
"audit_category": "inconsistent dimensions",
"audit_level": "WARNING"
"audit_level": "ERROR"
}
]
'''
Expand Down
16 changes: 8 additions & 8 deletions src/igvfd/audit/measurement_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def audit_preferred_assay_title(value, system):
{
"audit_description": "Measurement sets are expected to specify an appropriate preferred assay title for its respective assay term.",
"audit_category": "inconsistent preferred assay title",
"audit_level": "WARNING"
"audit_level": "ERROR"
}
]
'''
Expand Down Expand Up @@ -266,7 +266,7 @@ def audit_missing_auxiliary_set_link(value, system):
{
"audit_description": "Measurement sets are expected to link to auxiliary sets if they share the same sample.",
"audit_category": "missing auxiliary set",
"audit_level": "WARNING"
"audit_level": "ERROR"
}
]
'''
Expand Down Expand Up @@ -297,7 +297,7 @@ def audit_targeted_genes(value, system):
{
"audit_description": "Only ChIP-seq and CRISPR flow cytometry assays are expected to specify targeted gene(s).",
"audit_category": "unexpected targeted genes",
"audit_level": "WARNING"
"audit_level": "ERROR"
}
]
'''
Expand Down Expand Up @@ -490,12 +490,12 @@ def audit_onlist(value, system):
{
"audit_description": "Measurement sets to be processed via the single cell uniform pipeline are expected to have onlist files and onlist methods indicated.",
"audit_category": "missing barcode onlist",
"audit_level": "WARNING"
"audit_level": "NOT_COMPLIANT"
},
{
"audit_description": "Measurement sets not intended for the single cell uniform pipeline are expected not to have onlist files or onlist methods.",
"audit_category": "unexpected barcode onlist",
"audit_level": "WARNING"
"audit_level": "ERROR"
}
]
'''
Expand Down Expand Up @@ -536,12 +536,12 @@ def audit_inconsistent_onlist_info(value, system):
{
"audit_description": "Measurement sets with 2 or more barcode onlist files are expected to have an onlist method of either product or multi.",
"audit_category": "inconsistent barcode onlist",
"audit_level": "WARNING"
"audit_level": "ERROR"
},
{
"audit_description": "Measurement sets with only 1 barcode onlist files are expected to have an onlist method of no combination.",
"audit_category": "inconsistent barcode onlist",
"audit_level": "WARNING"
"audit_level": "ERROR"
}
]
'''
Expand All @@ -566,7 +566,7 @@ def audit_unexpected_onlist_content(value, system):
{
"audit_description": "Onlist files are expected to be tabular files with barcode onlist as the content type.",
"audit_category": "unexpected onlist files",
"audit_level": "WARNING"
"audit_level": "ERROR"
}
]
'''
Expand Down
2 changes: 1 addition & 1 deletion src/igvfd/audit/model_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def audit_external_input_data_content_type(value, system):
{
"audit_description": "Tabular files linked as `external_input_data` must be of type external source data.",
"audit_category": "inconsistent external input data",
"audit_level": "WARNING"
"audit_level": "ERROR"
}
]
'''
Expand Down
23 changes: 22 additions & 1 deletion src/igvfd/audit/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def audit_construct_library_sets_types(value, system):
{
"audit_description": "Samples are expected to link to a construct library sets with the same file set type.",
"audit_category": "inconsistent construct library sets",
"audit_level": "WARNING"
"audit_level": "ERROR"
}
]
'''
Expand Down Expand Up @@ -215,3 +215,24 @@ def audit_parent_sample_with_singular_child(value, system):
f'in `{child_sample_type}`.'
)
yield AuditFailure(audit_message.get('audit_category', ''), f'{detail} {audit_message.get("audit_description", "")}', level=audit_message.get('audit_level', ''))


@audit_checker('Sample', frame='object')
def audit_missing_nucleic_acid_delivery(value, system):
    '''
    [
        {
            "audit_description": "Samples linked to construct library sets are expected to specify nucleic acid delivery method.",
            "audit_category": "missing nucleic acid delivery",
            "audit_level": "WARNING"
        }
    ]
    '''
    # NOTE(review): the docstring above is handed to get_audit_message via the
    # function object and looks like machine-read JSON -- keep it valid JSON
    # and put human-facing documentation in comments instead (confirm).
    #
    # Flags a sample that carries the `construct_library_sets` key but lacks
    # the `nucleic_acid_delivery` key. Only key presence is checked, not
    # whether the stored values are empty.
    type_label = space_in_words(value['@type'][0]).capitalize()
    message = get_audit_message(audit_missing_nucleic_acid_delivery)

    if 'construct_library_sets' not in value or 'nucleic_acid_delivery' in value:
        return

    item_link = audit_link(path_to_text(value["@id"]), value["@id"])
    detail = (
        f'{type_label} {item_link} '
        f'has `construct_library_sets` but is missing `nucleic_acid_delivery`.'
    )
    yield AuditFailure(
        message.get('audit_category', ''),
        f'{detail} {message.get("audit_description", "")}',
        level=message.get('audit_level', ''),
    )
7 changes: 6 additions & 1 deletion src/igvfd/loadxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@
'tabular_file',
'index_file',
'genome_browser_annotation_file',
'image_file'
'image_file',
'mpra_quality_metric',
'perturb_seq_quality_metric',
'single_cell_atac_seq_quality_metric',
'single_cell_rna_seq_quality_metric',
'starr_seq_quality_metric'
]

IS_ATTACHMENT = [
Expand Down
7 changes: 5 additions & 2 deletions src/igvfd/mappings/alignment_file.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"hash": "19e93a3331de4afde65b629878761208",
"index_name": "alignment_file_19e93a33",
"hash": "ec6a9f459d2d7882011e2dbca2eb1505",
"index_name": "alignment_file_ec6a9f45",
"item_type": "alignment_file",
"mapping": {
"dynamic_templates": [
Expand Down Expand Up @@ -1719,6 +1719,9 @@
"notes": {
"type": "text"
},
"quality_metrics": {
"type": "keyword"
},
"read_count": {
"store": true,
"type": "long"
Expand Down
Loading