<?xml version="1.0" encoding="UTF-8"?>
<!-- pipeline.xml -->

<!-- This is an example file of a compi pipeline -->
<pipeline xmlns="http://www.sing-group.org/compi/pipeline-1.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
	<version>1.0.1</version>

	<params>
		<!-- Directories and input files. Declared global: the tasks use them without listing them in `params`. -->
		<param name="host_working_dir" shortName="hw" global="true">Path of the working directory in the host.</param>
		<param name="working_dir" shortName="w" defaultValue="/working_dir" global="true">Path of the working directory.</param>
		<param name="input_fasta" shortName="if" defaultValue="input.fasta" global="true">Name of the input FASTA file containing the sequences to be annotated (relative to the working directory)</param>
		<param name="reference_fasta" shortName="rf" defaultValue="ref.fasta" global="true">Name of the FASTA file containing the reference sequence (relative to the working directory)</param>

		<!-- Non-global tuning parameters: the first four are requested by join-exons, the last one by predict. -->
		<param name="max_dist" shortName="md">Maximum distance between exons (in this case sequences identified by getorf) from the same gene. It only applies to large genome sequences where there is some chance that two genes with similar features are present.</param>
		<param name="intron_bp" shortName="ibp">Distance around the junction point between two sequences where to look for splicing signals.</param>
		<param name="selection_criterion" shortName="scr">The selection model to be used (1, 2, or 3): 1) similarity with reference sequence first, in case of a tie, percentage of gaps relative to reference sequence; 2) percentage of gaps relative to reference sequence first, in case of a tie, similarity with reference sequence; 3) a mixed model with similarity with reference sequence first, but if fewer gaps relative to reference sequence, similarity gets a bonus defined by the user. Currently, a bonus of 20, means 2%.</param>
		<param name="selection_correction" shortName="sco" defaultValue="0">A bonus percentage times 10. For instance, 20 means 2% bonus. Something with 18% similarity acts as having 20% similarity. Applied when selection_criterion=3.</param>
		<param name="min_full_nucleotide_size" shortName="mfps">Minimum size for CDS to be reported.</param>

		<!-- Docker images: opt-out flag for the pull task and the image versions to use. -->
		<flag name="skip_pull_docker_images" shortName="sdi">Use this flag to skip the pull-docker-images task.</flag>
		<param name="version_seqkit" shortName="vsq" defaultValue="2.1.0" global="true">Version of the pegi3s/seqkit image to use.</param>
		<param name="version_blast" shortName="vb" defaultValue="2.10.0" global="true">Version of the pegi3s/blast image to use.</param>
		<param name="version_emboss" shortName="ve" defaultValue="6.6.0" global="true">Version of the pegi3s/emboss image to use.</param>
		<param name="version_utilities" shortName="vu" defaultValue="0.20.0" global="true">Version of the pegi3s/utilities image to use.</param>

		<!-- Pipeline internals: script location and debugging aid. -->
		<param name="scripts_dir" shortName="sd" defaultValue="/scripts" global="true">Path of the directory containing the pipeline scripts.</param>
		<flag name="keep_temporary_files" shortName="ktf" global="true">Use this flag to keep the temporary, intermediate files generated by the pipeline tasks (useful for debugging).</flag>
	</params>

	<tasks>
		<!--
			Runs ${scripts_dir}/pull_docker_images unless the skip_pull_docker_images flag was given.
			The guard checks that the flag's value is empty. `-z` on the quoted expansion is used
			instead of `-v`, because `-v` expects a variable NAME (not its expansion) and is not
			available in every POSIX shell.
			NOTE(review): assumes the variable is unset or empty when the flag is absent; confirm
			against the Compi documentation on flags.
		-->
		<task id="pull-docker-images" params="skip_pull_docker_images"
			if="[ -z &quot;${skip_pull_docker_images}&quot; ]">
			${scripts_dir}/pull_docker_images
		</task>

		<!-- Runs the initialization script; ordered after pull-docker-images. Described in the metadata as initializing the internal working directory. -->
		<task
			id="initialization"
			after="pull-docker-images">
			${scripts_dir}/initialization
		</task>

		<!-- Runs the split_input script; ordered after initialization. Described in the metadata as splitting the input FASTA into one file per sequence. -->
		<task
			id="split-input"
			after="initialization">
			${scripts_dir}/split_input
		</task>

		<!--
			Runs get_orf once per item produced by the `list_files ${working_dir}` command;
			each item is exposed to the task code as ${file}. Ordered after split-input.
		-->
		<foreach
			id="get-orf"
			after="split-input"
			of="command"
			in="${scripts_dir}/list_files ${working_dir}"
			as="file">
			${scripts_dir}/get_orf \
				${working_dir} \
				${host_working_dir} \
				${file}
		</foreach>

		<!--
			Runs the blast script once per ${file} listed by `list_files ${working_dir}`,
			passing the reference FASTA name and the current file.
			NOTE(review): the `*` prefix in `after` presumably ties each iteration to the matching
			get-orf iteration instead of waiting for the whole loop; confirm in the Compi docs.
		-->
		<foreach
			id="blast"
			after="*get-orf"
			of="command"
			in="${scripts_dir}/list_files ${working_dir}"
			as="file">
			${scripts_dir}/blast \
				${working_dir} \
				${host_working_dir} \
				${reference_fasta} \
				${file}
		</foreach>

		<!--
			Runs sort_fasta once per ${file} listed by `list_files ${working_dir}`.
			The last two arguments are file names derived from ${file}: 02_<file>.ini and
			03_<file>.ini.sorted (presumably the script's input and output; verify in sort_fasta).
			NOTE(review): `*blast` presumably binds each iteration to the matching blast iteration.
		-->
		<foreach
			id="sort"
			after="*blast"
			of="command"
			in="${scripts_dir}/list_files ${working_dir}"
			as="file">
			${scripts_dir}/sort_fasta \
				${working_dir} \
				${host_working_dir} \
				${file} \
				02_${file}.ini \
				03_${file}.ini.sorted
		</foreach>

		<!--
			Runs join_exons once per ${file} listed by `list_files ${working_dir}`, forwarding the
			four tuning parameters requested in `params`.
			The `if` guard requires the sorted file 03_<file>.ini.sorted to exist under
			${working_dir}/cga_working_dir/<file>/; since it references ${file}, it is expected to
			be evaluated per iteration.
			NOTE(review): `*sort` presumably binds each iteration to the matching sort iteration.
		-->
		<foreach
			id="join-exons"
			after="*sort"
			of="command"
			in="${scripts_dir}/list_files ${working_dir}"
			as="file"
			if="[[ -f ${working_dir}/cga_working_dir/${file}/03_${file}.ini.sorted ]]"
			params="max_dist intron_bp selection_criterion selection_correction">
			${scripts_dir}/join_exons \
				${working_dir} \
				${host_working_dir} \
				${file} \
				${reference_fasta} \
				${max_dist} \
				${intron_bp} \
				${selection_criterion} \
				${selection_correction}
		</foreach>
		
		<!--
			Runs the predict script once per ${file} listed by `list_files ${working_dir}`, passing
			the reference FASTA name, the current file and min_full_nucleotide_size.
			The `if` guard requires 04_<file>.join_exons_results to exist under
			${working_dir}/cga_working_dir/<file>/; since it references ${file}, it is expected to
			be evaluated per iteration.
			NOTE(review): `*join-exons` presumably binds each iteration to the matching join-exons iteration.
		-->
		<foreach
			id="predict"
			after="*join-exons"
			of="command"
			in="${scripts_dir}/list_files ${working_dir}"
			as="file"
			if="[[ -f ${working_dir}/cga_working_dir/${file}/04_${file}.join_exons_results ]]"
			params="min_full_nucleotide_size">
			${scripts_dir}/predict \
				${working_dir} \
				${host_working_dir} \
				${reference_fasta} \
				${file} \
				${min_full_nucleotide_size}
		</foreach>
        
		<!-- Runs the collect_results script; ordered after the predict loop (no `*` prefix). Described in the metadata as aggregating the per-sequence results. -->
		<task
			id="collect-results"
			after="predict">
			${scripts_dir}/collect_results
		</task>
	</tasks>
    
	<metadata>
		<!-- Human-readable descriptions, keyed by task/foreach id. -->
		<task-description id="pull-docker-images">Downloads the necessary Docker images for external software (BLAST, seqkit, etc.).</task-description>
		<task-description id="initialization">Initializes the internal working directory.</task-description>
		<task-description id="split-input">Splits the input FASTA file into several FASTA files containing one input sequence each.</task-description>
		<task-description id="get-orf">Obtains all open reading frames (ORFs) between STOP codons that are longer than 30 bp using EMBOSS getorf.</task-description>
		<task-description id="blast">Creates a protein BLAST database for each file from the previous step and performs a blastp using the reference FASTA file.</task-description>
		<task-description id="sort">Sorts files to ensure that exons are ordered according to their relative location in the genome, using the information that the EMBOSS getorf program outputs in the sequence headers.</task-description>
		<task-description id="join-exons">Runs the 6 sub-steps for joining exons iteratively until all exons have been successfully joined. Check the documentation for more details about these 6 sub-steps.</task-description>
		<task-description id="predict">Processes the output of the previous step using a combination of the EMBOSS getorf and transeq programs to obtain the predicted CDS and protein sequences (only those in frame +1 are considered) only if they are longer than the minimum size specified by the user in the min_full_nucleotide_size parameter.</task-description>
		<task-description id="collect-results">Aggregates the results from each input sequence analysed to generate the aggregated output files under the results directory.</task-description>
	</metadata>
</pipeline>