Skip to content
Pierre Lindenbaum edited this page Jul 29, 2015 · 1 revision

Motivation

Convert a Makefile into a Snakemake workflow (Snakefile). Snakemake: https://bitbucket.org/johanneskoester/snakemake/wiki/Home

The original Makefile

# all, all_fasta and clean are command names, not files: declare them phony so
# a file with one of these names can never make them look up to date.
.PHONY: all all_fasta clean
# Space-separated list of identifiers; each one is used as the basename of a
# <id>.fa target and as the id= parameter of the download URL in the %.fa rule.
GILIST=52854274 156118490 290782623 209485592 149126991 254749437 269857780 14971105 256041807 269857713

# Pattern rule without prerequisites: build <id>.fa by downloading the protein
# record <id> as FASTA text with wget. $(basename $@) is the target name
# without its .fa suffix, i.e. the identifier itself.
# NOTE(review): $(description target,text) is not a stock GNU Make function;
# presumably it is provided by the patched make (xml-make4.1) used on this
# page. In the XML shown further down, its text ends up in the target's
# description attribute and nothing of it appears in the recipe statement.
%.fa: 
	$(description $@,download gi:$(basename $@) from NCBI as fasta)wget -O "$@"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=$(basename $@)&retmode=text&rettype=fasta"

# Top-level phony goal: bring all_fasta up to date, then print "Done".
all: all_fasta
	echo "Done"

# Phony grouping target with no recipe of its own: it only requires longest.fa.
all_fasta: longest.fa

# Extract the longest sequence of all.fa ($<) into longest.fa ($@).
# Pipeline, stage by stage:
#   1. awk: put each FASTA record on one line, "header<TAB>sequence". A header
#      line (starting with '>') begins a new output line; the sequence lines
#      that follow are appended without separators.
#   2. awk: prefix every record with the length of its sequence (field 2).
#   3. sort on that length numerically and keep the last line (the longest).
#   4. cut: drop the length column again.
#   5. tr: turn the tabs back into newlines to restore header / sequence lines.
# $$ is Make's escape for a literal $, so awk receives $0 and $2.
# The sort -t argument is a literal TAB character between the quotes.
longest.fa : all.fa
	$(description $@,get the longest sequence in $<)awk '/^>/ { printf("%s%s\t",(NR==1?"":"\n"),$$0);next;} { printf("%s",$$0);} END {printf("\n");}' $< |\
	awk -F '\t' '{printf("%d\t%s\n",length($$2),$$0);}' | sort -t '	' -k1,1n | tail -n1 | cut -f 2- |\
	tr "\t" "\n" > $@

# Concatenate every downloaded record into all.fa. The prerequisites are the
# entries of GILIST with .fa appended; $^ expands to that whole list and $@ to
# all.fa.
all.fa : $(addsuffix .fa,${GILIST})
	$(description $@,concatenate everything)cat $^ > $@
	
# Remove every file this Makefile generates: the per-identifier downloads
# (<id>.fa for each entry of GILIST), the concatenated all.fa and the final
# longest.fa. all.fa was previously left behind by clean although it is a
# build product of the all.fa rule.
clean:
	rm -f $(addsuffix .fa,${GILIST}) all.fa longest.fa

Convert the Makefile to XML

$ ../make-4.1/bin/xml-make4.1 --xml test03.xml -f test03.mk all

content of test03.xml:

<?xml version="1.0" encoding="UTF-8"?>
<make shell="/bin/sh" shellflags="-c" path="/home/lindenb/package/jdk1.8.0_40/bin:/home/lindenb/package/eclipse:/home/lindenb/package/jdk1.8.0_40/bin:/home/lindenb/package/eclipse:/home/lindenb/package/firefox:/home/lindenb/bin:/usr/lib/lightdm/lightdm:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/home/lindenb/package/edirect" pwd="/home/lindenb/src/xml-patch-make/tests">
  <target name="52854274.fa" description="download gi:52854274 from NCBI as fasta" id="1" precious="0" phony="0">
    <statements>
      <statement>wget -O "52854274.fa"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&amp;id=52854274&amp;retmode=text&amp;rettype=fasta"</statement>
    </statements>
  </target>
  <target name="156118490.fa" description="download gi:156118490 from NCBI as fasta" id="2" precious="0" phony="0">
    <statements>
      <statement>wget -O "156118490.fa"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&amp;id=156118490&amp;retmode=text&amp;rettype=fasta"</statement>
    </statements>
  </target>
  <target name="290782623.fa" description="download gi:290782623 from NCBI as fasta" id="3" precious="0" phony="0">
    <statements>
      <statement>wget -O "290782623.fa"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&amp;id=290782623&amp;retmode=text&amp;rettype=fasta"</statement>
    </statements>
  </target>
  <target name="209485592.fa" description="download gi:209485592 from NCBI as fasta" id="4" precious="0" phony="0">
    <statements>
      <statement>wget -O "209485592.fa"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&amp;id=209485592&amp;retmode=text&amp;rettype=fasta"</statement>
    </statements>
  </target>
  <target name="149126991.fa" description="download gi:149126991 from NCBI as fasta" id="5" precious="0" phony="0">
    <statements>
      <statement>wget -O "149126991.fa"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&amp;id=149126991&amp;retmode=text&amp;rettype=fasta"</statement>
    </statements>
  </target>
  <target name="254749437.fa" description="download gi:254749437 from NCBI as fasta" id="6" precious="0" phony="0">
    <statements>
      <statement>wget -O "254749437.fa"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&amp;id=254749437&amp;retmode=text&amp;rettype=fasta"</statement>
    </statements>
  </target>
  <target name="269857780.fa" description="download gi:269857780 from NCBI as fasta" id="7" precious="0" phony="0">
    <statements>
      <statement>wget -O "269857780.fa"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&amp;id=269857780&amp;retmode=text&amp;rettype=fasta"</statement>
    </statements>
  </target>
  <target name="14971105.fa" description="download gi:14971105 from NCBI as fasta" id="8" precious="0" phony="0">
    <statements>
      <statement>wget -O "14971105.fa"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&amp;id=14971105&amp;retmode=text&amp;rettype=fasta"</statement>
    </statements>
  </target>
  <target name="256041807.fa" description="download gi:256041807 from NCBI as fasta" id="9" precious="0" phony="0">
    <statements>
      <statement>wget -O "256041807.fa"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&amp;id=256041807&amp;retmode=text&amp;rettype=fasta"</statement>
    </statements>
  </target>
  <target name="269857713.fa" description="download gi:269857713 from NCBI as fasta" id="10" precious="0" phony="0">
    <statements>
      <statement>wget -O "269857713.fa"  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&amp;id=269857713&amp;retmode=text&amp;rettype=fasta"</statement>
    </statements>
  </target>
  <target name="all.fa" description="concatenate everything" id="11" precious="0" phony="0">
    <prerequisites>
      <prerequisite name="52854274.fa" ref="1"/>
      <prerequisite name="156118490.fa" ref="2"/>
      <prerequisite name="290782623.fa" ref="3"/>
      <prerequisite name="209485592.fa" ref="4"/>
      <prerequisite name="149126991.fa" ref="5"/>
      <prerequisite name="254749437.fa" ref="6"/>
      <prerequisite name="269857780.fa" ref="7"/>
      <prerequisite name="14971105.fa" ref="8"/>
      <prerequisite name="256041807.fa" ref="9"/>
      <prerequisite name="269857713.fa" ref="10"/>
    </prerequisites>
    <statements>
      <statement>cat 52854274.fa 156118490.fa 290782623.fa 209485592.fa 149126991.fa 254749437.fa 269857780.fa 14971105.fa 256041807.fa 269857713.fa &gt; all.fa</statement>
      <statement/>
    </statements>
  </target>
  <target name="longest.fa" description="get the longest sequence in all.fa" id="12" precious="0" phony="0">
    <prerequisites>
      <prerequisite name="all.fa" ref="11"/>
    </prerequisites>
    <statements>
      <statement>awk '/^&gt;/ { printf("%s%s\t",(NR==1?"":"\n"),$0);next;} { printf("%s",$0);} END {printf("\n");}' all.fa |\
	awk -F '\t' '{printf("%d\t%s\n",length($2),$0);}' | sort -t '	' -k1,1n | tail -n1 | cut -f 2- |\
	tr "\t" "\n" &gt; longest.fa</statement>
    </statements>
  </target>
  <target name="all_fasta" description="all_fasta" id="13" precious="0" phony="1">
    <prerequisites>
      <prerequisite name="longest.fa" ref="12"/>
    </prerequisites>
  </target>
  <target name="all" description="all" id="14" precious="0" phony="1">
    <prerequisites>
      <prerequisite name="all_fasta" ref="13"/>
    </prerequisites>
    <statements>
      <statement>echo "Done"</statement>
    </statements>
  </target>
</make>

Convert the XML to a Snakefile using XSLT

$ xsltproc ../stylesheets/graph2snake.xsl test03.xml > Snakefile

content of the Snakefile:

shell.executable("/bin/sh")




rule rule14:
	"""all"""

	output: '__14_phony.flag'
	input: '__13_phony.flag'
	
	shell: 
		"touch '__14_phony.flag';" \
		"echo \"Done\""


rule rule13:
	"""all_fasta"""

	output: '__13_phony.flag'
	input: 'longest.fa'
	
	shell: 
		"touch '__13_phony.flag';" \
		"echo '__13_phony.flag'"
			


rule rule12:
	"""get the longest sequence in all.fa"""

	output: 'longest.fa'
	input: 'all.fa'
	
	shell: "awk '/^>/ {{ printf(\"%s%s\\t\",(NR==1?\"\":\"\\n\"),$0);next;}} {{ printf(\"%s\",$0);}} END {{printf(\"\\n\");}}' all.fa | 	awk -F '\\t' '{{printf(\"%d\\t%s\\n\",length($2),$0);}}' | sort -t '	' -k1,1n | tail -n1 | cut -f 2- | 	tr \"\\t\" \"\\n\" > longest.fa"


rule rule11:
	"""concatenate everything"""

	output: 'all.fa'
	input: '52854274.fa' , '156118490.fa' , '290782623.fa' , '209485592.fa' , '149126991.fa' , '254749437.fa' , '269857780.fa' , '14971105.fa' , '256041807.fa' , '269857713.fa'
	
	shell: "cat 52854274.fa 156118490.fa 290782623.fa 209485592.fa 149126991.fa 254749437.fa 269857780.fa 14971105.fa 256041807.fa 269857713.fa > all.fa ; " \
		""


rule rule10:
	"""download gi:269857713 from NCBI as fasta"""

	output: '269857713.fa'
	
	shell: "wget -O \"269857713.fa\"  \"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=269857713&retmode=text&rettype=fasta\""


rule rule9:
	"""download gi:256041807 from NCBI as fasta"""

	output: '256041807.fa'
	
	shell: "wget -O \"256041807.fa\"  \"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=256041807&retmode=text&rettype=fasta\""


rule rule8:
	"""download gi:14971105 from NCBI as fasta"""

	output: '14971105.fa'
	
	shell: "wget -O \"14971105.fa\"  \"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=14971105&retmode=text&rettype=fasta\""


rule rule7:
	"""download gi:269857780 from NCBI as fasta"""

	output: '269857780.fa'
	
	shell: "wget -O \"269857780.fa\"  \"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=269857780&retmode=text&rettype=fasta\""


rule rule6:
	"""download gi:254749437 from NCBI as fasta"""

	output: '254749437.fa'
	
	shell: "wget -O \"254749437.fa\"  \"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=254749437&retmode=text&rettype=fasta\""


rule rule5:
	"""download gi:149126991 from NCBI as fasta"""

	output: '149126991.fa'
	
	shell: "wget -O \"149126991.fa\"  \"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=149126991&retmode=text&rettype=fasta\""


rule rule4:
	"""download gi:209485592 from NCBI as fasta"""

	output: '209485592.fa'
	
	shell: "wget -O \"209485592.fa\"  \"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=209485592&retmode=text&rettype=fasta\""


rule rule3:
	"""download gi:290782623 from NCBI as fasta"""

	output: '290782623.fa'
	
	shell: "wget -O \"290782623.fa\"  \"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=290782623&retmode=text&rettype=fasta\""


rule rule2:
	"""download gi:156118490 from NCBI as fasta"""

	output: '156118490.fa'
	
	shell: "wget -O \"156118490.fa\"  \"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=156118490&retmode=text&rettype=fasta\""


rule rule1:
	"""download gi:52854274 from NCBI as fasta"""

	output: '52854274.fa'
	
	shell: "wget -O \"52854274.fa\"  \"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=52854274&retmode=text&rettype=fasta\""

Invoke snakemake

lindenb@hardyweinberg:~/src/xml-patch-make/tests$ ../snakemake/bin/snakemake
Provided cores: 1
Rules claiming more threads will be scaled down.
Job counts:
	count	jobs
	1	rule1
	1	rule10
	1	rule11
	1	rule12
	1	rule13
	1	rule14
	1	rule2
	1	rule3
	1	rule4
	1	rule5
	1	rule6
	1	rule7
	1	rule8
	1	rule9
	14
rule rule1:
	output: 52854274.fa
--2015-07-29 17:25:01--  http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=52854274&retmode=text&rettype=fasta
Resolving cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)... 193.52.104.20, 2001:660:7220:386:193:52:104:20
Connecting to cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)|193.52.104.20|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: `52854274.fa'

    [ <=>                                                                                                       ] 325         --.-K/s   in 0s      

2015-07-29 17:25:01 (18.3 MB/s) - `52854274.fa' saved [325]

1 of 14 steps (7%) done
rule rule3:
	output: 290782623.fa
--2015-07-29 17:25:01--  http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=290782623&retmode=text&rettype=fasta
Resolving cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)... 193.52.104.20, 2001:660:7220:386:193:52:104:20
Connecting to cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)|193.52.104.20|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: `290782623.fa'

    [ <=>                                                                                                       ] 370         --.-K/s   in 0s      

2015-07-29 17:25:01 (20.1 MB/s) - `290782623.fa' saved [370]

2 of 14 steps (14%) done
rule rule5:
	output: 149126991.fa
--2015-07-29 17:25:01--  http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=149126991&retmode=text&rettype=fasta
Resolving cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)... 193.52.104.20, 2001:660:7220:386:193:52:104:20
Connecting to cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)|193.52.104.20|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: `149126991.fa'

    [ <=>                                                                                                       ] 384         --.-K/s   in 0s      

2015-07-29 17:25:02 (22.7 MB/s) - `149126991.fa' saved [384]

3 of 14 steps (21%) done
rule rule4:
	output: 209485592.fa
--2015-07-29 17:25:02--  http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=209485592&retmode=text&rettype=fasta
Resolving cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)... 193.52.104.20, 2001:660:7220:386:193:52:104:20
Connecting to cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)|193.52.104.20|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: `209485592.fa'

    [ <=>                                                                                                       ] 384         --.-K/s   in 0s      

2015-07-29 17:25:02 (22.9 MB/s) - `209485592.fa' saved [384]

4 of 14 steps (29%) done
rule rule9:
	output: 256041807.fa
--2015-07-29 17:25:02--  http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=256041807&retmode=text&rettype=fasta
Resolving cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)... 193.52.104.20, 2001:660:7220:386:193:52:104:20
Connecting to cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)|193.52.104.20|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: `256041807.fa'

    [ <=>                                                                                                       ] 400         --.-K/s   in 0s      

2015-07-29 17:25:03 (23.1 MB/s) - `256041807.fa' saved [400]

5 of 14 steps (36%) done
rule rule2:
	output: 156118490.fa
--2015-07-29 17:25:03--  http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=156118490&retmode=text&rettype=fasta
Resolving cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)... 193.52.104.20, 2001:660:7220:386:193:52:104:20
Connecting to cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)|193.52.104.20|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: `156118490.fa'

    [ <=>                                                                                                       ] 367         --.-K/s   in 0s      

2015-07-29 17:25:05 (31.5 MB/s) - `156118490.fa' saved [367]

6 of 14 steps (43%) done
rule rule10:
	output: 269857713.fa
--2015-07-29 17:25:05--  http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=269857713&retmode=text&rettype=fasta
Resolving cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)... 193.52.104.20, 2001:660:7220:386:193:52:104:20
Connecting to cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)|193.52.104.20|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: `269857713.fa'

    [ <=>                                                                                                       ] 398         --.-K/s   in 0s      

2015-07-29 17:25:05 (21.9 MB/s) - `269857713.fa' saved [398]

7 of 14 steps (50%) done
rule rule6:
	output: 254749437.fa
--2015-07-29 17:25:05--  http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=254749437&retmode=text&rettype=fasta
Resolving cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)... 193.52.104.20, 2001:660:7220:386:193:52:104:20
Connecting to cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)|193.52.104.20|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: `254749437.fa'

    [ <=>                                                                                                       ] 397         --.-K/s   in 0s      

2015-07-29 17:25:05 (22.4 MB/s) - `254749437.fa' saved [397]

8 of 14 steps (57%) done
rule rule7:
	output: 269857780.fa
--2015-07-29 17:25:05--  http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=269857780&retmode=text&rettype=fasta
Resolving cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)... 193.52.104.20, 2001:660:7220:386:193:52:104:20
Connecting to cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)|193.52.104.20|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: `269857780.fa'

    [ <=>                                                                                                       ] 401         --.-K/s   in 0s      

2015-07-29 17:25:05 (22.5 MB/s) - `269857780.fa' saved [401]

9 of 14 steps (64%) done
rule rule8:
	output: 14971105.fa
--2015-07-29 17:25:05--  http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=14971105&retmode=text&rettype=fasta
Resolving cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)... 193.52.104.20, 2001:660:7220:386:193:52:104:20
Connecting to cache.ha.univ-nantes.fr (cache.ha.univ-nantes.fr)|193.52.104.20|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: `14971105.fa'

    [ <=>                                                                                                       ] 379         --.-K/s   in 0s      

2015-07-29 17:25:05 (21.0 MB/s) - `14971105.fa' saved [379]

10 of 14 steps (71%) done
rule rule11:
	input: 52854274.fa, 156118490.fa, 290782623.fa, 209485592.fa, 149126991.fa, 254749437.fa, 269857780.fa, 14971105.fa, 256041807.fa, 269857713.fa
	output: all.fa
11 of 14 steps (79%) done
rule rule12:
	input: all.fa
	output: longest.fa
12 of 14 steps (86%) done
rule rule13:
	input: longest.fa
	output: __13_phony.flag
__13_phony.flag
13 of 14 steps (93%) done
rule rule14:
	input: __13_phony.flag
	output: __14_phony.flag
Done
14 of 14 steps (100%) done