Skip to content

Commit

Permalink
Replace input CSV with YAML and parse sample ID from BAM (#106)
Browse files Browse the repository at this point in the history
* update submodules

* remove CSV and add YAML input

* add custom schema

* update output dir structure set up

* change patient_id to sample_id

* update pipeval

* remove empty line

* update channels to parse input from YAML

* Update CHANGELOG.md

* fix YAML linting

* fix YAML linting

* update help comments

* remove redundant code

* update comment in config

* update logging

* use params.samples_to_process to parse sample id

* set sample ID from tumor ID extracted using params.samples_to_process

* remove redundant code

* check for multiple tumor samples before setting sample param

* fix typo in error

---------

Co-authored-by: Mootor <mmootor@ip-0A125250.rhxrlfvjyzbupc03cc22jkch3c.xx.internal.cloudapp.net>
  • Loading branch information
Faizal-Eeman and Mootor authored Aug 1, 2023
1 parent 0c8def0 commit 9917c17
Show file tree
Hide file tree
Showing 11 changed files with 229 additions and 131 deletions.
10 changes: 9 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,15 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
---

## [Unreleased]
- Update `README.md` to clarify adjustable parameters and note lab default values.
### Added
- YAML input

### Changed
- Parse sample ID from tumor BAM for output directory naming
- Update `README.md` to clarify adjustable parameters and note lab default values

### Removed
- CSV input

---

Expand Down
91 changes: 91 additions & 0 deletions config/custom_schema_types.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/**
 * This custom schema namespace implements a custom type for checking input BAMs for call-sSV.
 * Each closure below is registered in `types` and invoked by the external schema validator
 * (schema.validate_parameter) when the matching type name appears in config/schema.yaml.
 */
custom_schema_types {
    // Top-level keys allowed under `input`
    allowed_input_types = [
        'BAM'
    ]
    // Sample types allowed under `input.BAM`
    allowed_bam_types = [
        'normal',
        'tumor'
    ]

    /**
     * Check that every given key is in the allowed list.
     * @param given   keys found in the input namespace
     * @param name    parameter name, used for the error message
     * @param choices allowed keys (defaults to allowed_input_types)
     * @throws Exception when a key is not in `choices`
     */
    check_input_type_keys = { List given, String name, List choices=custom_schema_types.allowed_input_types ->
        for (elem in given) {
            if (!(elem in choices)) {
                // Fixed typo: "paramter" -> "parameter"
                throw new Exception("Invalid parameter ${name}. Valid types: ${choices}.")
            }
        }
    }

    /**
     * Check if given input is a Namespace (Groovy Map).
     * @throws Exception when `val` is not a Map
     */
    check_if_namespace = { val, String name ->
        if (!(val in Map)) {
            throw new Exception("${name} should be a Namespace, not ${val.getClass()}.")
        }
    }

    /**
     * Check if given input is a list (List or Set both accepted).
     * @throws Exception when `val` is neither
     */
    check_if_list = { val, String name ->
        if (!(val in List || val in Set)) {
            throw new Exception("${name} should be a List, not ${val.getClass()}.")
        }
    }

    /**
     * Check that input is a namespace of expected types, then validate each
     * entry recursively against the schema's `elements` definition.
     */
    check_input_namespace = { Map options, String name, Map properties ->
        // Check parameter keys against allowed_input_types (default choices)
        custom_schema_types.check_if_namespace(options[name], name)
        def given_keys = options[name].keySet() as ArrayList
        custom_schema_types.check_input_type_keys(given_keys, name)

        options[name].each { entry ->
            def entry_as_map = [:]
            entry_as_map[entry.key] = entry.value
            schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key])
        }
    }

    /**
     * Check the BAM namespace: must be non-empty, keys restricted to
     * allowed_bam_types ('normal'/'tumor'), each entry validated recursively.
     */
    check_bam_namespace = { Map options, String name, Map properties ->
        custom_schema_types.check_if_namespace(options[name], name)
        def given_keys = options[name].keySet() as ArrayList
        if (given_keys.size() <= 0) {
            // CSV input was removed from this pipeline; only YAML input is supported
            throw new Exception("No inputs provided! Please provide inputs in the input YAML.")
        }
        custom_schema_types.check_input_type_keys(given_keys, name, custom_schema_types.allowed_bam_types)

        options[name].each { entry ->
            def entry_as_map = [:]
            entry_as_map[entry.key] = entry.value
            schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key])
        }
    }

    /**
     * Check that the value is a proper BAM entry list and that every
     * listed path exists and is readable.
     */
    check_bam_list = { Map options, String name, Map properties ->
        custom_schema_types.check_if_list(options[name], name)
        for (item in options[name]) {
            schema.check_path(item, 'r')
        }
    }

    // Type-name -> validator mapping consumed by schema.load_custom_types
    types = [
        'InputNamespace': custom_schema_types.check_input_namespace,
        'InputBAMNamespace': custom_schema_types.check_bam_namespace,
        'BAMEntryList': custom_schema_types.check_bam_list
    ]
}
2 changes: 1 addition & 1 deletion config/default.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ params {
delly_version = '1.1.3'
manta_version = '1.6.0'
bcftools_version = '1.15.1'
pipeval_version = '3.0.0'
pipeval_version = '4.0.0-rc.2'

// Docker tool versions
docker_image_delly = "${-> params.docker_container_registry}/delly:${params.delly_version}"
Expand Down
48 changes: 33 additions & 15 deletions config/methods.config
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
includeConfig "../external/pipeline-Nextflow-config/config/retry/retry.config"
import nextflow.util.SysHelper
includeConfig "../external/pipeline-Nextflow-config/config/bam/bam_parser.config"
includeConfig "../external/pipeline-Nextflow-config/config/methods/common_methods.config"
includeConfig "../external/pipeline-Nextflow-config/config/schema/schema.config"

includeConfig "../external/pipeline-Nextflow-config/config/retry/retry.config"

methods {
check_permissions = { path ->
Expand All @@ -19,15 +21,31 @@ methods {
}
}

set_ids_from_bams = {
params.samples_to_process = [] as Set
params.input.BAM.each { k, v ->
v.each { bam_path ->
def bam_header = bam_parser.parse_bam_header(bam_path)
def sm_tags = bam_header['read_group'].collect{ it['SM'] }.unique()

if (sm_tags.size() != 1) {
throw new Exception("${bam_path} contains multiple samples! Please run pipeline with single sample BAMs.")
}
params.samples_to_process.add(['id': sm_tags[0], 'path': bam_path, 'sample_type': k])
}
}
}

set_output_dir = {
def sample
sample = params.samples_to_process
.findAll{ it.sample_type == 'tumor' }
.collect{ it.id }

// assumes that project and samples name are in the pipeline.config
def reader = new FileReader(params.input_csv)
reader.splitEachLine(',') { parts -> [sample = parts[1].split('/')[-1].split('.bam')[0]] }
if (sample.size() != 1) {
throw new Exception("${params.samples_to_process}\n\n Multiple Tumor BAMs found in the input! Please run pipeline one Tumor sample at a time.")
}

params.sample = "${sample}"
params.sample = sample[0]

params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample}"
}
Expand Down Expand Up @@ -98,7 +116,7 @@ methods {

set_resources_allocation = {
// Function to ensure that resource requirements don't go beyond
// a maximum limit
// a maximum limit
node_cpus = params.max_cpus
node_memory_GB = params.max_memory.toGiga()
// Load base.config by default for all pipelines
Expand All @@ -125,7 +143,6 @@ methods {
}
}


/**
* Check the permissions and existence of workDir.
* If it doesn't exist, recursively find first existing directory and check write permission.
Expand Down Expand Up @@ -185,7 +202,7 @@ methods {

timeline.enabled = true
timeline.file = "${params.log_output_dir}/nextflow-log/timeline.html"

report.enabled = true
report.file = "${params.log_output_dir}/nextflow-log/report.html"
}
Expand All @@ -202,16 +219,17 @@ methods {

// Set up env, timeline, trace, and report above.
setup = {
methods.set_env()
schema.load_custom_types("${projectDir}/config/custom_schema_types.config")
schema.validate()
methods.set_ids_from_bams()
methods.set_resources_allocation()
methods.set_output_dir()
methods.set_log_output_dir()
methods.check_permissions(params.log_output_dir)
methods.set_env()
methods.set_resources_allocation()
methods.set_pipeline_logs()
methods.set_process()
methods.set_docker_sudo()
methods.set_pipeline_logs()
retry.setup_retry()
schema.validate()
}
}

99 changes: 58 additions & 41 deletions config/schema.yaml
Original file line number Diff line number Diff line change
@@ -1,51 +1,68 @@
---
input_csv:
type: 'Path'
mode: 'r'
required: true
help: 'Absolute path to the input CSV file'
sample_id:
type: 'String'
required: true
help: 'Sample ID'
reference_fasta:
type: 'Path'
mode: 'r'
required: true
help: 'Absolute path to a reference FASTA file'
type: 'Path'
mode: 'r'
required: true
help: 'Absolute path to a reference FASTA file'
exclusion_file:
type: 'Path'
mode: 'r'
required: true
help: 'Absoulte path to an exclusion file'
type: 'Path'
mode: 'r'
required: true
help: 'Absoulte path to an exclusion file'
algorithm:
type: 'List'
required: true
help: 'List of SV caller(s) for calling'
default:
- delly
- manta
choices:
- delly
- manta
type: 'List'
required: true
help: 'List of available somatic SV callers'
default:
- delly
- manta
choices:
- delly
- manta
output_dir:
type: 'Path'
mode: 'w'
required: true
help: 'Absolute path to output directory'
type: 'Path'
mode: 'w'
required: true
help: 'Absolute path to output directory'
dataset_id:
type: 'String'
required: true
help: 'Dataset identifier'
type: 'String'
required: true
help: 'Dataset identifier'
map_qual:
type: 'Integer'
required: true
default: 20
type: 'Integer'
required: true
default: 20
min_clique_size:
type: 'Integer'
required: true
default: 5
type: 'Integer'
required: true
default: 5
mad_cutoff:
type: 'Integer'
required: true
default: 15
type: 'Integer'
required: true
default: 15
filter_condition:
type: 'String'
required: true
default: "FILTER=='PASS'"
type: 'String'
required: true
default: "FILTER=='PASS'"
input:
type: 'InputNamespace'
required: true
help: 'Input samples'
elements:
BAM:
type: 'InputBAMNamespace'
required: true
help: 'Input BAMs for somatic structural variant calling'
elements:
normal:
type: 'BAMEntryList'
required: false
help: 'Input normal BAMs'
tumor:
type: 'BAMEntryList'
required: false
help: 'Input tumor BAMs'
2 changes: 1 addition & 1 deletion external/pipeline-Nextflow-config
2 changes: 0 additions & 2 deletions input/call-sSV-input.csv

This file was deleted.

8 changes: 8 additions & 0 deletions input/call-sSV-input.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# Example input YAML for call-sSV; replace the placeholder paths below.
sample_id: "sample_id"
input:
  BAM:
    normal:
      - "/absolute/path/to/BAM"
    tumor:
      # Fixed typo: "abosolute" -> "absolute" (matches the normal entry)
      - "/absolute/path/to/BAM"
Loading

0 comments on commit 9917c17

Please sign in to comment.