[MRG] fix genome download rule (#233)

* fix parsing of genome csv
* update zip rule
* update zip info
* add genbank_cache/ to something that's tested by make test
dib-lab · Sep 30, 2022 · d8ef5ef · d8ef5ef
1 parent 1ade842
commit d8ef5ef
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 22 deletions.
diff --git a/doc/quickstart.md b/doc/quickstart.md
@@ -93,6 +93,6 @@ Some key output files under the outputs directory are:
 * `trim/{sample}.trim.fq.gz` - trimmed and preprocessed reads.
 * `sigs/{sample}.trim.sig.zip` - sourmash signature for the preprocessed reads.
 
-Note that `genome-grist run <config.yml> zip` will create a file named `transfer.zip` with the above files in it.
+Note that `genome-grist run <config.yml> zip` will create a file named `<output_dir>.zip` with the above files in it.
 
 Please see [the guide to genome-grist output files](output-guide.md) for more information!
diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile
@@ -468,10 +468,13 @@ rule check:
 @toplevel
 rule zip:
     shell: """
-        rm -f transfer.zip
-        zip -r transfer.zip {outdir}/leftover/*.summary.csv \
+        ZIPFILE=$(basename "{outdir}").zip
+        rm -f $ZIPFILE
+        zip -r $ZIPFILE {outdir}/leftover/*.summary.csv \
                 {outdir}/mapping/*.summary.csv {outdir}/*.yaml \
-                {outdir}/gather/*.csv.gz {outdir}/reports/
+                {outdir}/gather/*.csv.gz {outdir}/gather/*.out \
+                {outdir}/reports/
+        echo "Created $ZIPFILE"
     """
 
 
@@ -1046,24 +1049,22 @@ rule download_matching_genome_wc:
     output:
         genome = f"{GENBANK_CACHE}/{{ident}}_genomic.fna.gz"
     run:
-        with gzip.open(input.csvfile, 'rt') as infp:
-            r = csv.DictReader(infp)
-            rows = list(r)
-            assert len(rows) == 1
-            row = rows[0]
-            ident = row['ident']
-            assert wildcards.ident.startswith(ident)
-            url = row['genome_url']
-            name = row['display_name']
-
-            print(f"downloading genome for ident {ident}/{name} from NCBI...",
-                file=sys.stderr)
-            with open(output.genome, 'wb') as outfp:
-                with urllib.request.urlopen(url) as response:
-                    content = response.read()
-                    outfp.write(content)
-                    print(f"...wrote {len(content)} bytes to {output.genome}",
-                        file=sys.stderr)
+        rows = list(load_csv(input.csvfile))
+        assert len(rows) == 1
+        row = rows[0]
+        ident = row['ident']
+        assert wildcards.ident.startswith(ident)
+        url = row['genome_url']
+        name = row['display_name']
+
+        print(f"downloading genome for ident {ident}/{name} from NCBI...",
+              file=sys.stderr)
+        with open(output.genome, 'wb') as outfp:
+            with urllib.request.urlopen(url) as response:
+                content = response.read()
+                outfp.write(content)
+                print(f"...wrote {len(content)} bytes to {output.genome}",
+                      file=sys.stderr)
 
 # summarize_reads_info
 rule summarize_reads_info_wc:

diff --git a/tests/test-data/SRR5950647.conf b/tests/test-data/SRR5950647.conf
@@ -6,3 +6,4 @@ sourmash_databases:
 taxonomies:
 - ../sourmash/gtdb-rs202.taxonomy.v2.csv
 metagenome_trim_memory: 1e9
+genbank_cache: outputs.test/genbank_cache