Skip to content

Commit

Permalink
Merge pull request #28 from UCLOrengoGroup/25-abstract-processes
Browse files Browse the repository at this point in the history
Processes abstracted into a shared file
  • Loading branch information
Rachel-Alcraft authored Jan 25, 2023
2 parents c7f5ad1 + 3e17886 commit 7206f94
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 116 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,4 +151,5 @@ dataset100k
kinfams-results/
kinfams-backup/
datasets/
cath-test-hpc-dataset100/
cath-test-hpc-dataset100/
results-*/
113 changes: 113 additions & 0 deletions workflows/cath-shared.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

params.chunk_size = 10

params.cath_af_cli = 'cath-af-cli'
params.cath_version = 'v4_3_0'

params.publish_dir = "$workflow.launchDir/results-${params.dataset_name}"
params.af_version = 4
params.af_download_stem = "gs://public-datasets-deepmind-alphafold-v${params.af_version}"
params.af_cif_raw_dir = "cif_raw"
params.af_cif_chopped_dir = "cif_chopped"

process create_af_manifest_file {
input:
path uniprot_id_file

output:
path 'manifest.txt'

"""
cat ${uniprot_id_file} | awk '{print "${params.af_download_stem}/AF-"\$1"-F*.cif"}' > manifest.txt
"""
}

process retrieve_af_chain_cif_files {
publishDir "${params.publish_dir}/${params.af_cif_raw_dir}", mode: 'copy'

input:
path af_model_urls_file

output:
path "AF-*.cif", optional: true

// If Google returns 401 errors then make sure you have logged in:
//
// gcloud auth application-default login
//
// see: https://www.nextflow.io/docs/latest/google.html

"""
cat ${af_model_urls_file} | (gsutil -o GSUtil:parallel_process_count=1 -m cp -I . || echo "Ignoring non-zero exit code: \$?")
"""
}

process chop_cif {
publishDir params.publish_dir, mode: 'copy'

input:
path raw_cif_files
path full_uniprot_domain_ids_file

output:
path "${params.af_cif_chopped_dir}/*.cif"

"""
mkdir ${params.af_cif_chopped_dir}
# get a list of uniprot accessions from CIF files
find . -name "*.cif" | tr '-' ' ' | awk '{print \$2}' | sort | uniq > uniprot_ids.txt
# extract the uniprot domain ids for which we have CIF files
grep -F -f uniprot_ids.txt ${full_uniprot_domain_ids_file} > uniprot_domain_ids.txt
# do the chopping
cath-af-cli chop-cif \
--cif_in_dir . \
--id_file uniprot_domain_ids.txt \
--cif_out_dir ${params.af_cif_chopped_dir}/ \
--id_type uniprot \
--input_file_policy skip \
--output_file_policy skip \
--af_version ${params.af_version}
"""
}

process uniprot_domain_to_uniprot {
input:
path uniprot_domain_id

output:
path 'uniprot_ids.txt'

"""
cat ${uniprot_domain_id} | tr '/' ' ' | awk '{print \$1}' | sort | uniq > uniprot_ids.txt
"""
}

process cif_paths_to_uniprot_accessions {
input:
path 'cif_paths.txt'

output:
path 'uniprot_ids.txt'

"""
cat cif_paths.txt | perl -ne '/AF-(\\w+)-/ && print "\$1\\n"' | sort | uniq > uniprot_ids.txt
"""
}

process create_missing_uniprot_domain_ids {
input:
path 'found_uniprot_ids.txt'
path 'all_uniprot_domain_ids.txt'

output:
path 'missing_uniprot_domain_ids.txt'

"""
grep -F -v -f found_uniprot_ids.txt all_uniprot_domain_ids.txt > missing_uniprot_domain_ids.txt
"""
}
125 changes: 10 additions & 115 deletions workflows/cath-test-workflow.nf
Original file line number Diff line number Diff line change
@@ -1,126 +1,21 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

params.chunk_size = 10

params.cath_af_cli = 'cath-af-cli'
params.cath_version = 'v4_3_0'
// Params defined before include are also used in the processes
params.dataset_name = 'dataset100'

params.dataset_dir = "$workflow.launchDir/tests/fixtures/dataset100"
params.publish_dir = "$workflow.launchDir/cath-test-hpc-${params.dataset_name}"
params.af_version = 4
params.af_download_stem = "gs://public-datasets-deepmind-alphafold-v${params.af_version}"
// Processes to include from the shared module
include { uniprot_domain_to_uniprot } from './cath-shared'
include { create_af_manifest_file } from './cath-shared'
include { retrieve_af_chain_cif_files } from './cath-shared'
include { chop_cif } from './cath-shared'
include { cif_paths_to_uniprot_accessions } from './cath-shared'
include { create_missing_uniprot_domain_ids } from './cath-shared'

params.uniprot_ids_csv_fn = "${params.dataset_name}.uniprot_ids.csv"
// Params defined after include are used locally
params.dataset_dir = "$workflow.launchDir/tests/fixtures/${params.dataset_name}"
params.uniprot_domain_ids_csv_fn = "${params.dataset_name}.uniprot_domain_ids.csv"

params.af_manifest_fn = "af_manifest.txt"
params.af_cif_raw_dir = "cif_raw"
params.af_cif_chopped_dir = "cif_chopped"

params.uniprot_ids_downloaded_csv_fn = "uniprot_ids.downloaded.csv"

process create_af_manifest_file {
input:
path uniprot_id_file

output:
path params.af_manifest_fn

"""
cat ${uniprot_id_file} | awk '{print "${params.af_download_stem}/AF-"\$1"-F*.cif"}' > ${params.af_manifest_fn}
"""
}

process retrieve_af_chain_cif_files {
publishDir "${params.publish_dir}/${params.af_cif_raw_dir}", mode: 'copy'

input:
path af_model_urls_file

output:
path "AF-*.cif", optional: true

// If Google returns 401 errors then make sure you have logged in:
//
// gcloud auth application-default login
//
// see: https://www.nextflow.io/docs/latest/google.html

"""
cat ${af_model_urls_file} | (gsutil -o GSUtil:parallel_process_count=1 -m cp -I . || echo "Ignoring non-zero exit code: \$?")
"""
}

process chop_cif {
publishDir params.publish_dir, mode: 'copy'

input:
path raw_cif_files
path full_uniprot_domain_ids_file

output:
path "${params.af_cif_chopped_dir}/*.cif"

"""
mkdir ${params.af_cif_chopped_dir}
# get a list of uniprot accessions from CIF files
find . -name "*.cif" | tr '-' ' ' | awk '{print \$2}' | sort | uniq > uniprot_ids.txt
# extract the uniprot domain ids for which we have CIF files
grep -F -f uniprot_ids.txt ${full_uniprot_domain_ids_file} > uniprot_domain_ids.txt
# do the chopping
cath-af-cli chop-cif \
--cif_in_dir . \
--id_file uniprot_domain_ids.txt \
--cif_out_dir ${params.af_cif_chopped_dir}/ \
--id_type uniprot \
--input_file_policy skip \
--output_file_policy skip \
--af_version ${params.af_version}
"""
}

process uniprot_domain_to_uniprot {
input:
path uniprot_domain_id

output:
path params.uniprot_ids_csv_fn

"""
cat ${uniprot_domain_id} | tr '/' ' ' | awk '{print \$1}' | sort | uniq > ${params.uniprot_ids_csv_fn}
"""
}

process cif_paths_to_uniprot_accessions {
input:
path 'cif_paths.txt'

output:
path 'uniprot_ids.txt'

"""
cat cif_paths.txt | perl -ne '/AF-(\\w+)-/ && print "\$1\\n"' | sort | uniq > uniprot_ids.txt
"""
}

process create_missing_uniprot_domain_ids {
input:
path 'found_uniprot_ids.txt'
path 'all_uniprot_domain_ids.txt'

output:
path 'missing_uniprot_domain_ids.txt'

"""
grep -F -v -f found_uniprot_ids.txt all_uniprot_domain_ids.txt > missing_uniprot_domain_ids.txt
"""
}

workflow {
def uniprot_domain_ids_path = "${params.dataset_dir}/${params.uniprot_domain_ids_csv_fn}"

Expand Down

0 comments on commit 7206f94

Please sign in to comment.