
Adds a new example pipeline and edits existing example pipeline #53

Merged
merged 5 commits on Jun 9, 2017
2 changes: 1 addition & 1 deletion .gitignore
@@ -25,4 +25,4 @@ test/files
# example specific files/folders
examples/pipelines/variant-calling-filtered/adapters
examples/pipelines/variant-calling-filtered/bin

.idea/
4 changes: 2 additions & 2 deletions README.md
@@ -76,6 +76,6 @@ const pipeline = join(

## Who is this tool for?

Waterwheel is for **programmers** who desire an efficient and easy-to-write methodology for developing complex and dynamic data pipelines, while handling parallelization as much as possible. Waterwheel is an npm module, and is accessible by anyone willing to learn a little JavaScript. This is in contrast to other tools which develop their own DSL (domain specific language), which is not useful outside the tool. By leveraging the npm ecosystem and JavaScript on the client, Waterwheel can be built upon for inclusion on web apis, modern web applications, as well as native applications through [Electron](http://electron.atom.io/). Look forward to seeing Galaxy-like applications backed by a completely configurable Node API.
Bionode-watermill is for **programmers** who desire an efficient and easy-to-write methodology for developing complex and dynamic data pipelines, while handling parallelization as much as possible. Bionode-watermill is an npm module, and is accessible by anyone willing to learn a little JavaScript. This is in contrast to other tools which develop their own DSL (domain specific language), which is not useful outside the tool. By leveraging the npm ecosystem and JavaScript on the client, Bionode-watermill can be built upon for inclusion on web apis, modern web applications, as well as native applications through [Electron](http://electron.atom.io/). Look forward to seeing Galaxy-like applications backed by a completely configurable Node API.

Waterwheel is for **biologists** who understand it is important to experiment with sample data, parameter values, and tools. Compared to other workflow systems, the ease of swapping around parameters and tools is much improved, allowing you to iteratively compare results and construct more confident inferences. Consider the ability to construct your own [Teaser](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0803-1) for *your data* with a *simple syntax*, and getting utmost performance out of the box.
Bionode-watermill is for **biologists** who understand it is important to experiment with sample data, parameter values, and tools. Compared to other workflow systems, the ease of swapping around parameters and tools is much improved, allowing you to iteratively compare results and construct more confident inferences. Consider the ability to construct your own [Teaser](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0803-1) for *your data* with a *simple syntax*, and getting utmost performance out of the box.
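The composition style the README describes can be pictured with plain promises. The sketch below is an illustration of the idea only, not bionode-watermill's real implementation; the task names are made up:

```javascript
// Simplified sketch: each "task" is a function returning a Promise,
// and join() chains them sequentially while collecting their results.
// (Illustrative only; not bionode-watermill's actual implementation.)
const join = (...tasks) => () =>
  tasks.reduce(
    (chain, task) => chain.then(results => task().then(r => results.concat(r))),
    Promise.resolve([])
  )

// Hypothetical stand-in tasks
const download = () => Promise.resolve('genome.fna.gz')
const align = () => Promise.resolve('reads.sam')

join(download, align)().then(results => console.log(results))
// logs: [ 'genome.fna.gz', 'reads.sam' ]
```

In the real module, each task additionally resolves its input/output globs against the filesystem before passing results downstream.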
127 changes: 127 additions & 0 deletions examples/pipelines/two-mappers/pipeline.js
@@ -0,0 +1,127 @@
'use strict'

// === WATERMILL ===
const {
task,
join,
junction,
fork
} = require('../../..')

// === MODULES ===

const fs = require('fs')
const path = require('path')

const request = require('request')
const ncbi = require('bionode-ncbi')

// === CONFIG ===

const THREADS = parseInt(process.env.WATERMILL_THREADS) || 2

const config = {
name: 'Streptococcus pneumoniae',
sraAccession: 'ERR045788',
referenceURL: 'http://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/045/' +
'GCF_000007045.1_ASM704v1/GCF_000007045.1_ASM704v1_genomic.fna.gz'
}

// === TASKS ===

// first, let's get the reference genome for our mapping
const getReference = task({
params: { url: config.referenceURL },
output: '*_genomic.fna.gz',
name: `Download reference genome for ${config.name}`
}, ({ params, dir }) => {
const { url } = params
const outfile = url.split('/').pop()

// essentially curl -O
return request(url).pipe(fs.createWriteStream(dir + '/' + outfile))
})

// then get the samples to work with
const getSamples = task({
params: {
db: 'sra',
accession: config.sraAccession
},
output: '**/*.sra',
dir: process.cwd(), // Set dir to resolve input/output from
name: `Download SRA ${config.sraAccession}`
}, ({ params }) => `bionode-ncbi download ${params.db} ${params.accession}` )

// extract the samples from fastq.gz
const fastqDump = task({
input: '**/*.sra',
output: [1, 2].map(n => `*_${n}.fastq.gz`),
name: 'fastq-dump **/*.sra'
}, ({ input }) => `fastq-dump --split-files --skip-technical --gzip ${input}` )

// then index using first bwa ...
const IndexReferenceBwa = task({
input: '*_genomic.fna.gz',
output: ['amb', 'ann', 'bwt', 'pac', 'sa'].map(suffix =>
`*_genomic.fna.gz.${suffix}`),
name: 'bwa index *_genomic.fna.gz'
}, ({ input }) => `bwa index ${input}`)

// and bowtie2

const indexReferenceBowtie2 = task({
input: '*_genomic.fna.gz',
output: ['1.bt2', '2.bt2', '3.bt2', '4.bt2', 'rev.1.bt2',
'rev.2.bt2'].map(suffix => `bowtie_index.${suffix}`),
params: { output: 'bowtie_index'},
name: 'bowtie2-build -q uncompressed.fa bowtie_index'
}, ({ params, input }) => `gunzip -c ${input} > uncompressed.fa && bowtie2-build -q uncompressed.fa ${params.output}`
)

// now use mappers with bwa

const bwaMapper = task({
input: {
reference: '*_genomic.fna.gz',
reads: [1, 2].map(n => `*_${n}.fastq.gz`),
indexFiles: ['amb', 'ann', 'bwt', 'pac', 'sa'].map(suffix =>
`*_genomic.fna.gz.${suffix}`) //pass index files to bwa mem
},
output: '*.sam',
params: {output: 'bwa_output.sam'},
name: 'Mapping with bwa...'
}, ({ input, params }) => `bwa mem -t ${THREADS} ${input.reference} ${input.reads[0]} ${input.reads[1]} > ${params.output}`
)

// and with bowtie2

const bowtieMapper = task({
input: {
reference: '*_genomic.fna.gz',
reads: [1, 2].map(n => `*_${n}.fastq.gz`),
indexFiles: ['1.bt2', '2.bt2', '3.bt2', '4.bt2', 'rev.1.bt2',
'rev.2.bt2'].map(suffix => `bowtie_index.${suffix}`) //pass index files to
// bowtie2
},
output: '*.sam',
params: {output: 'bowtie2_output.sam'},
name: 'Mapping with bowtie2...'
}, ({ input, params }) => `bowtie2 -p ${THREADS} -x bowtie_index -1 ${input.reads[0]} -2 ${input.reads[1]} -S ${params.output}`
)

// === PIPELINE ===

const pipeline = join(
junction(
getReference,
join(getSamples, fastqDump)
),
fork(
join(IndexReferenceBwa, bwaMapper),
join(indexReferenceBowtie2, bowtieMapper)
)
)

// actually run the pipeline and log the results
pipeline().then(results => console.log('PIPELINE RESULTS: ', results))
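The pipeline above relies on `junction` (wait for all branches) and `fork` (run independent downstream branches). The parallel part of that control flow can be approximated with `Promise.all`. This is a conceptual sketch with made-up task names, not bionode-watermill's actual implementation:

```javascript
// Conceptual sketch: junction() runs its branches in parallel and
// resolves once all of them finish, much like Promise.all.
// (Illustrative only; not bionode-watermill's actual implementation.)
const junction = (...tasks) => () => Promise.all(tasks.map(task => task()))

// Hypothetical stand-in tasks
const getReference = () => Promise.resolve('reference ready')
const getReads = () => Promise.resolve('reads ready')

junction(getReference, getReads)().then(results => console.log(results))
// logs: [ 'reference ready', 'reads ready' ]
```

This is why the reference download and the SRA download above can proceed concurrently before the fork into the two mappers.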
24 changes: 12 additions & 12 deletions examples/pipelines/variant-calling-filtered/pipeline.js
@@ -6,7 +6,7 @@ const {
join,
junction,
fork
} = require('bionode-watermill')
} = require('../../..')

// === MODULES ===
const fs = require('fs')
@@ -18,17 +18,16 @@ const ncbi = require('bionode-ncbi')
// === CONFIG ===
const THREADS = parseInt(process.env.WATERMILL_THREADS) || 4
const MEMORYGB = parseInt(process.env.WATERMILL_MEMORY) || 4
const TMPDIR = path.resolve(__dirname, 'temp') // Assume this already exists
const TMPDIR = path.resolve(__dirname, 'temp') // Now directory is created before running kmc
const config = {
name: 'Salmonella enterica',
sraAccession: '2492428',
referenceURL: 'http://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000988525.2_ASM98852v2/GCA_000988525.2_ASM98852v2_genomic.fna.gz'
name: 'Streptococcus pneumoniae',
sraAccession: 'ERR045788',
referenceURL: 'http://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/045/GCF_000007045.1_ASM704v1/GCF_000007045.1_ASM704v1_genomic.fna.gz'
}

const KMERSIZE = 20
const MINCOVERAGE = 5
const PLOTXMAX = 60
const PLOTYMAX = 1200000
const PLOTXMAX = 60 // unused
const PLOTYMAX = 1200000 // unused

// === TASKS ===

@@ -61,7 +60,7 @@ const bwaIndex = task({
input: '*_genomic.fna.gz',
output: ['amb', 'ann', 'bwt', 'pac', 'sa'].map(suffix => `*_genomic.fna.gz.${suffix}`),
name: 'bwa index *_genomic.fna.gz'
}, ({ input }) => `bwa index ${input}` )
}, ({ input }) => `bwa index ${input}`)


/**
@@ -79,7 +78,7 @@ const getSamples = task({
output: '**/*.sra',
dir: process.cwd(), // Set dir to resolve input/output from
name: `Download SRA ${config.sraAccession}`
}, ({ params }) => ncbi.download(params.db, params.accession).resume() )
}, ({ params }) => `bionode-ncbi download ${params.db} ${params.accession}` )


/**
@@ -132,7 +131,7 @@ const filterKMC = task({
output: 'reads.trim.pe.filtered.fastq.gz',
params: { kmcFile: 'reads.trim.pe.kmc' },
name: 'Filtering with KMC'
}, ({ input, params }) => `
}, ({ input, params }) => `mkdir -p ${TMPDIR} &&
kmc -k${KMERSIZE} -m${MEMORYGB} -t${THREADS} ${input} ${params.kmcFile} ${TMPDIR} && \
kmc_tools filter ${params.kmcFile} -cx${MINCOVERAGE} ${input} -ci0 -cx0 reads.trim.pe.filtered.fastq.gz
`)
@@ -230,7 +229,7 @@ const pipeline = join(
junction(
join(getReference, bwaIndex),
join(getSamples, fastqDump)
),
),
trim, mergeTrimEnds,
decompressReference, // only b/c mpileup did not like fna.gz
join(
@@ -241,3 +240,4 @@

pipeline().then(results => console.log('PIPELINE RESULTS: ', results))
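Both example pipelines read their resource limits from the `WATERMILL_THREADS` and `WATERMILL_MEMORY` environment variables, falling back to the defaults shown above. A typical invocation might look like this (the values are illustrative, and the external tools such as bwa, bowtie2, sra-tools and kmc are assumed to be on PATH):

```shell
# Run a pipeline with custom thread and memory limits (values are examples)
WATERMILL_THREADS=8 WATERMILL_MEMORY=8 node pipeline.js
```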