diff --git a/README.rst b/README.rst index 6addd49e..d72b938f 100644 --- a/README.rst +++ b/README.rst @@ -6,16 +6,56 @@ The CGAT pipelines ================== In CGAT_ we have developed a set of ruffus_ based pipelines in comparative genomics -and NGS analysis. The pipelines are generally available and should -be fairly portable. Some documentation of the pipelines is +and NGS analysis. Some documentation of the pipelines is `here `_. We are working on improving the existing documentation and portability of the code to release a set of production pipelines soon so please stay tuned. -Meanwhile, in this repository you may find a set of working pipelines that -are under continuous development and changing rapidly. However, they might -give some ideas or building blocks when developing your own pipelines. +We are currently testing a script to automate the installation with conda_. Feel +free to give it a go:: + + # download installation script: + curl -O https://raw.githubusercontent.com/CGATOxford/CGATPipelines/master/install-CGAT-tools.sh + + # see help: + bash install-CGAT-tools.sh + + # install the development version (recommended, no production version yet): + bash install-CGAT-tools.sh --devel [--location <location>] + + # enable the conda environment as requested by the installation script: + source <location>/conda-install/bin/activate cgat-p + + # and uninstall pika, which we use internally for our dashboard: + conda remove pika + + # finally, please run the cgatflow command-line tool to check the installation: + cgatflow --help + +The installation script will put everything under the specified location. It needs +15 GB of disk space and it takes about 35 minutes to complete. The aim of the +script is to provide a portable installation that does not interfere with the existing +software. As a result, you will have a conda environment working with the CGAT Pipelines +which can be enabled on demand according to your needs. 
+ +On top of the instructions above, please make sure that you configure the following +environment variables:: + + # Access to the DRMAA library: https://en.wikipedia.org/wiki/DRMAA + export DRMAA_LIBRARY_PATH=/<full-path>/libdrmaa.so + + # You can get this value from your configured environment: + env | grep DRMAA_LIBRARY_PATH + + # or just look for the library: + find <search-path> -name "*libdrmaa.so" + + # Also, make sure you have defined temporary folders + # 1. Local to execution hosts with + export TMPDIR=/tmp + # 2. Shared to pipeline working directory + export SHARED_TMPDIR=/<shared-path>/scratch For questions, please open a new issue on `GitHub @@ -23,5 +63,5 @@ For questions, please open a new issue on .. _ruffus: http://www.ruffus.org.uk .. _CGAT: http://www.cgat.org - +.. _conda: https://conda.io diff --git a/conda/environments/pipeline-readqc-template.yml b/conda/environments/pipeline-readqc-template.yml index 09bc610b..bf78ac57 100644 --- a/conda/environments/pipeline-readqc-template.yml +++ b/conda/environments/pipeline-readqc-template.yml @@ -1,5 +1,5 @@ # output generated by /ifs/devel/sebastian/py35-v1/CGATPipelines/scripts/cgat_conda_deps.sh --pipeline readqc -# on Wed Nov 1 11:36:43 GMT 2017 +# on Mon Nov 13 10:45:57 GMT 2017 name: cgat-p @@ -28,12 +28,17 @@ dependencies: - fastq-screen - fastqc - fastx_toolkit +- flash - gmap - hisat2 - kallisto - nomkl +- pandaseq - samtools - shortstack +- sickle-trim - star +- trim-galore +- trimmomatic - zlib diff --git a/conda/environments/pipelines-devel-template.yml b/conda/environments/pipelines-devel-template.yml index 251972e9..4dc979ea 100644 --- a/conda/environments/pipelines-devel-template.yml +++ b/conda/environments/pipelines-devel-template.yml @@ -1,5 +1,5 @@ # output generated by /ifs/devel/sebastian/py35-v1/CGATPipelines/scripts/cgat_conda_deps.sh --all -# on Thu Nov 9 15:30:22 GMT 2017 +# on Mon Nov 13 10:23:04 GMT 2017 name: cgat-p @@ -10,7 +10,7 @@ channels: dependencies: # python dependencies -- python +- python=3.6.3 - 
beautifulsoup4 - biopython - brewer2mpl @@ -24,12 +24,12 @@ dependencies: - numpy - pandas - pep8 -- pika +- pika=0.10.0 - pybedtools - pybigwig -- pysam +- pysam=0.11.1 - python-drmaa -- rpy2 +- rpy2=2.8.5 - ruffus - scikit-learn - scipy @@ -40,7 +40,7 @@ dependencies: - toposort - web.py # R dependencies -- r-base +- r-base=3.4.1 - r-flashclust - r-ggplot2 - r-gmd @@ -78,24 +78,29 @@ dependencies: - fastq-screen - fastqc - fastx_toolkit +- flash - gat - gatk -- gmap +- gmap=2017.05.08 # https://github.com/ContinuumIO/anaconda-issues/issues/7047 - hisat2 - htslib - idr - jupyter - kallisto - nomkl +- pandaseq - peakranger - picard - sailfish - salmon - samtools - shortstack +- sickle-trim - star - stringtie - subread +- trim-galore +- trimmomatic - ucsc-bedgraphtobigwig - ucsc-bedtobigbed - ucsc-gtftogenepred diff --git a/conda/environments/pipelines-devel.yml b/conda/environments/pipelines-devel.yml index e9627246..fd4e3924 100644 --- a/conda/environments/pipelines-devel.yml +++ b/conda/environments/pipelines-devel.yml @@ -33,8 +33,9 @@ dependencies: - fastqc=0.11.5 - fastx_toolkit=0.0.14 - future=0.16.0 +- flash=1.2.11 - gat=1.3.5 -- gatk=3.8 +- gatk=3.7 - gmap=2017.05.08 # https://github.com/ContinuumIO/anaconda-issues/issues/7047 - hisat2=2.1.0 - htslib=1.4.1 @@ -49,6 +50,7 @@ dependencies: - nose=1.3.7 - numpy=1.13.3 - pandas=0.21.0 +- pandaseq=2.11 - peakranger=1.18 - pep8=1.7.1 - picard=2.14.1 @@ -74,7 +76,7 @@ dependencies: - r-wgcna=1.61 - rpy2=2.8.5 - ruffus=2.6.3 -- sailfish=0.9.0 +- sailfish=0.10.1 - salmon=0.8.1 - samtools=1.4.1 - scikit-learn=0.19.1 @@ -83,11 +85,14 @@ dependencies: - setuptools=36.6.0 - shortstack=3.8.3 - six=1.11.0 +- sickle-trim=1.33 - sqlalchemy=1.1.13 - star=2.5.3a - stringtie=1.3.3 - subread=1.5.3 - toposort=1.5 +- trimmomatic=0.36 +- trim-galore=0.4.4 - ucsc-bedgraphtobigwig=332 - ucsc-bedtobigbed=332 - ucsc-gtftogenepred=332 diff --git a/conda/environments/pipelines-extra-template.yml 
b/conda/environments/pipelines-extra-template.yml index 68f52163..79663665 100644 --- a/conda/environments/pipelines-extra-template.yml +++ b/conda/environments/pipelines-extra-template.yml @@ -11,4 +11,6 @@ channels: dependencies: - cutadapt +- blast +- bashlex diff --git a/conda/environments/pipelines-extra.yml b/conda/environments/pipelines-extra.yml new file mode 100644 index 00000000..5c458eea --- /dev/null +++ b/conda/environments/pipelines-extra.yml @@ -0,0 +1,16 @@ + +# This environment will include other dependencies that have not been detected automatically +# but are required to run the pipelines + +name: cgat-p + +channels: +- bioconda +- conda-forge +- defaults + +dependencies: +- cutadapt=1.14 +- blast=2.7.1 +- bashlex=0.12 + diff --git a/conda/environments/pipelines-production-template.yml b/conda/environments/pipelines-production-template.yml index a2053a24..e2e885cd 100644 --- a/conda/environments/pipelines-production-template.yml +++ b/conda/environments/pipelines-production-template.yml @@ -1,5 +1,5 @@ # output generated by /ifs/devel/sebastian/py35-v1/CGATPipelines/scripts/merge_conda_deps.sh -# on Thu Nov 2 17:08:16 GMT 2017 +# on Mon Nov 13 10:51:36 GMT 2017 name: cgat-p @@ -9,6 +9,7 @@ channels: - defaults dependencies: + - bedtools - bioconductor-chipqc - bioconductor-cummerbund @@ -24,6 +25,7 @@ dependencies: - fastq-screen - fastqc - fastx_toolkit +- flash - gat - gmap - hisat2 @@ -35,6 +37,7 @@ dependencies: - nomkl - numpy - pandas +- pandaseq - peakranger - picard - pika @@ -60,12 +63,15 @@ dependencies: - scipy - seaborn - shortstack +- sickle-trim - six - sqlalchemy - star - stringtie - subread - toposort +- trim-galore +- trimmomatic - ucsc-bedgraphtobigwig - ucsc-bedtobigbed - ucsc-gtftogenepred diff --git a/conda/environments/pipelines-production.yml b/conda/environments/pipelines-production.yml index 99b4b3d1..97677afd 100644 --- a/conda/environments/pipelines-production.yml +++ b/conda/environments/pipelines-production.yml @@ 
-53,7 +53,7 @@ dependencies: - r-wasabi=0.2 - rpy2=2.8.5 - ruffus=2.6.3 -- sailfish=0.9.0 +- sailfish=0.10.1 - salmon=0.8.1 - samtools=1.4.1 - scipy=0.19.1 diff --git a/doc/InstallingPipelines.rst b/doc/InstallingPipelines.rst index d66877e5..db0381e6 100644 --- a/doc/InstallingPipelines.rst +++ b/doc/InstallingPipelines.rst @@ -27,13 +27,20 @@ Here are the steps:: # see help: bash install-CGAT-tools.sh - # install set of production scripts (well tested): - bash install-CGAT-tools.sh --production [--location <location>] - - # or go for the latest development version: + # install the development version (recommended, no production version yet): bash install-CGAT-tools.sh --devel [--location <location>] -The installation script will put everything under the specified location. The aim of the + # enable the conda environment as requested by the installation script: + source <location>/conda-install/bin/activate cgat-p + + # and uninstall pika, which we use internally for our dashboard: + conda remove pika + + # finally, please run the cgatflow command-line tool to check the installation: + cgatflow --help + +The installation script will put everything under the specified location. It needs +15 GB of disk space and it takes about 35 minutes to complete. The aim of the script is to provide a portable installation that does not interfere with the existing software. As a result, you will have a conda environment working with the CGAT Pipelines which can be enabled on demand according to your needs. @@ -79,7 +86,24 @@ tasks are sent to the cluster, but for some tasks this is not possible. These might thus run on the :term:`submit host`, so make sure it is fairly powerful. Pipelines expects that the :term:`working directory` is accessible with -the same path both from the submit and the :term:`execution host`. +the same path both from the submit and the :term:`execution host`. 
+ +Also, please make sure that you configure the following environment variables:: + + # Access to the DRMAA library: https://en.wikipedia.org/wiki/DRMAA + export DRMAA_LIBRARY_PATH=/<full-path>/libdrmaa.so + + # You can get this value from your configured environment: + env | grep DRMAA_LIBRARY_PATH + + # or just look for the library: + find <search-path> -name "*libdrmaa.so" + + # Also, make sure you have defined temporary folders + # 1. Local to execution hosts with + export TMPDIR=/tmp + # 2. Shared to pipeline working directory + export SHARED_TMPDIR=/<shared-path>/scratch Software requirements ===================== diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh index 31738331..0472bae3 100755 --- a/install-CGAT-tools.sh +++ b/install-CGAT-tools.sh @@ -226,8 +226,8 @@ wget -O env-scripts.yml https://raw.githubusercontent.com/CGATOxford/cgat/${SCRI wget -O env-pipelines.yml https://raw.githubusercontent.com/CGATOxford/CGATPipelines/${TRAVIS_BRANCH}/conda/environments/${CONDA_INSTALL_TYPE_PIPELINES} -conda env create --quiet --name ${CONDA_INSTALL_ENV} -f env-pipelines.yml -conda env update --quiet --name ${CONDA_INSTALL_ENV} -f env-scripts.yml +conda env create --quiet --name ${CONDA_INSTALL_ENV} --file env-pipelines.yml +conda env update --quiet --name ${CONDA_INSTALL_ENV} --file env-scripts.yml # activate cgat environment source $CONDA_INSTALL_DIR/bin/activate $CONDA_INSTALL_ENV @@ -243,6 +243,10 @@ if [[ "$OS" != "travis" ]] ; then if [[ $INSTALL_DEVEL ]] || [[ $INSTALL_PRODUCTION ]] ; then + # install extra deps + wget -O env-extra.yml https://raw.githubusercontent.com/CGATOxford/CGATPipelines/${TRAVIS_BRANCH}/conda/environments/pipelines-extra.yml + conda env update --quiet --name ${CONDA_INSTALL_ENV} --file env-extra.yml + # make sure you are in the CGAT_HOME folder cd $CGAT_HOME diff --git a/scripts/cgat_check_deps.py b/scripts/cgat_check_deps.py index ae41fe3c..68569671 100644 --- a/scripts/cgat_check_deps.py +++ b/scripts/cgat_check_deps.py @@ -99,6 +99,21 @@ def 
is_cgat_executable_name(node): return result +def is_cgat_cmd(node): + ''' + Auxiliary function to check for cgat statement: + cmd = "command" + ''' + + result = False + result = type(node) is ast.Assign and \ + hasattr(node, 'targets') and \ + hasattr(node.targets[0], 'id') and \ + node.targets[0].id == "cmd" + + return result + + def is_cgat_append(node): ''' Auxiliary function to check for cgat statement: @@ -218,7 +233,8 @@ def checkDepedencies(pipeline): statement = "" if is_cgat_statement(node) or \ is_cgat_executable(node) or \ - is_cgat_executable_name(node): + is_cgat_executable_name(node) or \ + is_cgat_cmd(node): statement = get_cmd_string(node) diff --git a/scripts/cgat_conda_deps.sh b/scripts/cgat_conda_deps.sh index 42838b1f..29dbc9a9 100755 --- a/scripts/cgat_conda_deps.sh +++ b/scripts/cgat_conda_deps.sh @@ -202,6 +202,7 @@ MISC_DEPS[fastqc]="fastqc" MISC_DEPS[fastx_collapser]="fastx_toolkit" MISC_DEPS[fastx_reverse_complement]="fastx_toolkit" MISC_DEPS[featureCounts]="subread" +MISC_DEPS[flash]="flash" MISC_DEPS[flashpca]="ignore" # pipeline_gwas.py is not in production (missing tests) MISC_DEPS[gat-run.py]="gat" MISC_DEPS[gemini]="ignore" # pipeline_exome.py is not in production (missing tests) @@ -231,6 +232,7 @@ MISC_DEPS[meme-chip]="ignore" # pipeline_motifs.py is not in production (missing MISC_DEPS[meme]="meme" MISC_DEPS[mergeBed]="bedtools" MISC_DEPS[metaphlan.py]="ignore" # pipeline_metagenomecommunities.py is not in production +MISC_DEPS[pandaseq]="pandaseq" MISC_DEPS[paste]="coreutils" MISC_DEPS[peakranger]="peakranger" MISC_DEPS[plink2]="ignore" # pipeline_gwas.py is not in production (missing tests) @@ -242,6 +244,7 @@ MISC_DEPS[rmats2sashimiplot]="ignore" # pipeline_splicing.py is not in productio MISC_DEPS[sailfish]="sailfish" MISC_DEPS[salmon]="salmon" MISC_DEPS[samtools]="samtools" +MISC_DEPS[sickle]="sickle-trim" MISC_DEPS[slopBed]="bedtools" MISC_DEPS[snpEff.sh]="ignore" # pipeline_exome.py is not in production (missing tests) 
MISC_DEPS[solid2fastq]="ignore" # SOLiD sequencing technology is no longer in use @@ -257,6 +260,8 @@ MISC_DEPS[tophat2]="tophat" MISC_DEPS[tophat]="tophat" MISC_DEPS[tr]="coreutils" MISC_DEPS[transfac2meme]="ignore" # pipeline_motifs.py is not in production (missing tests) +MISC_DEPS[trim_galore]="trim-galore" +MISC_DEPS[trimmomatic]="trimmomatic" MISC_DEPS[vcf-compare]="vcftools" MISC_DEPS[vcf-isec]="ignore" # pipeline_exome_cancer.py not in production (missing tests) MISC_DEPS[vcf-stats]="vcftools"