diff --git a/docs/exhibit/exhibit_dhsi2020.rst b/docs/exhibit/exhibit_dhsi2020.rst
index b5410da41..5d07c9cb2 100644
--- a/docs/exhibit/exhibit_dhsi2020.rst
+++ b/docs/exhibit/exhibit_dhsi2020.rst
@@ -38,7 +38,7 @@ Check that there are 14 samples to be downloaded.
wc -l morelli2010/sqlite_import/assembly_for_download.txt
-Download the samples and reference found in the `Cui et al. 2013 pulication `_.
+Repeat for the `Cui et al. 2013 publication `_.
**Cui 2013 Dataset**::
@@ -85,73 +85,69 @@ Run the full pipeline, including sample download, aligning to a reference genome
------------
-TimeTree Metadata
+Extract Metadata
-----------------
-Prepare metadata files for timetree/augur.
+Extract metadata from the SQLite database.
-**Morelli 2010 Dataset**::
+**Shell Scripts**::
- mkdir -p morelli2010/nextstrain/
+ project=morelli2010;
+ projectAuthor=Morelli;
+ #project=cui2013;
+ #projectAuthor=Cui;
+ # Extract metadata from sqlite database
+ mkdir -p $project/nextstrain/
scripts/sqlite_NextStrain_tsv.py \
--database results/ncbimeta_db/update/latest/output/database/yersinia_pestis_db.sqlite \
- --query "SELECT BioSampleAccession,AssemblyFTPGenbank,BioSampleStrain,BioSampleCollectionDate,BioSampleGeographicLocation,BioSampleBiovar,BioSampleHost FROM Master WHERE (BioSampleComment LIKE '%Morelli%' AND TRIM(AssemblyFTPGenbank) > '')" \
+ --query "SELECT BioSampleAccession,AssemblyFTPGenbank,BioSampleStrain,BioSampleCollectionDate,BioSampleGeographicLocation,BioSampleBiovar,BioSampleHost FROM Master WHERE (BioSampleComment LIKE '%$projectAuthor%' AND TRIM(AssemblyFTPGenbank) > '' AND BioSampleComment NOT LIKE '%REMOVE%')" \
--no-data-char ? \
- --output morelli2010/nextstrain/metadata_nextstrain.tsv
+ --output $project/nextstrain/metadata_nextstrain.tsv
+ # Add the reference genome metadata as a final line
sqlite3 results/ncbimeta_db/update/latest/output/database/yersinia_pestis_db.sqlite \
"SELECT BioSampleAccession,AssemblyFTPGenbank,BioSampleStrain,BioSampleCollectionDate,BioSampleGeographicLocation,BioSampleBiovar,BioSampleHost FROM Master WHERE BioSampleComment LIKE '%Reference%'" | \
- sed 's/|/\t/g' >> morelli2010/nextstrain/metadata_nextstrain.tsv
+ sed 's/|/\t/g' >> $project/nextstrain/metadata_nextstrain.tsv
- head -n 1 morelli2010/nextstrain/metadata_nextstrain.tsv | \
+ # Write header to a new edited metadata file, add col "strain"
+ head -n 1 $project/nextstrain/metadata_nextstrain.tsv | \
awk -F "\t" '{print "strain\t"$0}' \
- > morelli2010/nextstrain/metadata_nextstrain_edit.tsv
+ > $project/nextstrain/metadata_nextstrain_edit.tsv
- tail -n +2 morelli2010/nextstrain/metadata_nextstrain.tsv | \
+ # Figure out the assembly file names by parsing the FTP url column, save to col "strain"
+ tail -n +2 $project/nextstrain/metadata_nextstrain.tsv | \
awk -F "\t" '{split($2,ftpSplit,"/"); name=ftpSplit[10]"_genomic"; print name"\t"$0}' \
- >> morelli2010/nextstrain/metadata_nextstrain_edit.tsv
-
- sed -i 's/GCA_000009065.1_ASM906v1_genomic/Reference/g' morelli2010/nextstrain/metadata_nextstrain_edit.tsv
+ >> $project/nextstrain/metadata_nextstrain_edit.tsv
-**Cui 2013 Dataset**::
+ # Change reference genome file name to "Reference"
+ sed -i 's/GCA_000009065.1_ASM906v1_genomic/Reference/g' $project/nextstrain/metadata_nextstrain_edit.tsv
+ # Standardize biovar nomenclature
+ sed -i 's/Mediaevalis/Medievalis/g' $project/nextstrain/metadata_nextstrain_edit.tsv
- mkdir -p cui2013/nextstrain/
-
- scripts/sqlite_NextStrain_tsv.py \
- --database results/ncbimeta_db/update/latest/output/database/yersinia_pestis_db.sqlite \
- --query "SELECT BioSampleAccession,AssemblyFTPGenbank,BioSampleStrain,BioSampleCollectionDate,BioSampleGeographicLocation,BioSampleBiovar,BioSampleHost FROM Master WHERE (BioSampleComment LIKE '%Cui%' AND TRIM(AssemblyFTPGenbank) > '')" \
- --no-data-char ? \
- --output cui2013/nextstrain/metadata_nextstrain.tsv;
-
- sqlite3 results/ncbimeta_db/update/latest/output/database/yersinia_pestis_db.sqlite \
- "SELECT BioSampleAccession,AssemblyFTPGenbank,BioSampleStrain,BioSampleCollectionDate,BioSampleGeographicLocation,BioSampleBiovar,BioSampleHost FROM Master WHERE BioSampleComment LIKE '%Reference%'" | \
- sed 's/|/\t/g' >> cui2013/nextstrain/metadata_nextstrain.tsv;
-
- head -n 1 cui2013/nextstrain/metadata_nextstrain.tsv | \
- awk -F "\t" '{print "strain\t"$0}' \
- > cui2013/nextstrain/metadata_nextstrain_edit.tsv;
+------------
- tail -n +2 cui2013/nextstrain/metadata_nextstrain.tsv | \
- awk -F "\t" '{split($2,ftpSplit,"/"); name=ftpSplit[10]"_genomic"; print name"\t"$0}' \
- >> cui2013/nextstrain/metadata_nextstrain_edit.tsv;
+Date Formatting
+---------------
- sed -i 's/GCA_000009065.1_ASM906v1_genomic/Reference/g' cui2013/nextstrain/metadata_nextstrain_edit.tsv;
+Rename the BioSampleCollectionDate column to 'date' and reformat its values as YYYY-XX-XX (e.g. 2000-XX-XX).
+Encode the date uncertainty for the following strains:
+* Pestoides A and Pestoides F to 1950-1984
+* G8786 to be generally in the 1900s.
+* India195 to be 1898-1950.
-Afterwards, change the BioSampleCollectionDate column to 'date', remove uncertainty characters in date (<, >) and change format to 2000-XX-XX.
-Change the uncertainty dates of the following strains:
-India195, Angola, Pestoides A, Pestoides F to 1950-1984
-G8786 to be generally in the 1900s.
+**Shell Script**::
-**Morelli 2010 Dataset**::
+ project=morelli2010;
+ #project=cui2013;
- sed -i 's/BioSampleCollectionDate/date/g' morelli2010/nextstrain/metadata_nextstrain_edit.tsv
+ sed -i 's/BioSampleCollectionDate/date/g' $project/nextstrain/metadata_nextstrain_edit.tsv
awk -F "\t" -v dateCol=5 -v strainCol=4 'BEGIN{OFS=FS}{
if($dateCol != "date" && $dateCol != "?"){
gsub(/>|<|?/,"",$dateCol);
$dateCol=$dateCol"-XX-XX";
}
- if ($strainCol == "Angola" || $strainCol == "Pestoides A" || $strainCol == "Pestoides F"){
+ if ($strainCol == "Pestoides A" || $strainCol == "Pestoides F"){
$dateCol="[1950.00:1983.99]"
}
if ($strainCol == "India195"){
@@ -160,46 +156,21 @@ G8786 to be generally in the 1900s.
if ($strainCol == "G8786"){
$dateCol="[1900.00:1999.99]"
}
- print $0}' morelli2010/nextstrain/metadata_nextstrain_edit.tsv > morelli2010/nextstrain/metadata_nextstrain_dates.tsv
+ print $0}' $project/nextstrain/metadata_nextstrain_edit.tsv > $project/nextstrain/metadata_nextstrain_dates.tsv
+------------
-**Cui 2013 Dataset**::
-
- sed -i 's/BioSampleCollectionDate/date/g' morelli2010/nextstrain/metadata_nextstrain_edit.tsv
- awk -F "\t" -v dateCol=5 -v strainCol=4 'BEGIN{OFS=FS}{
- if($dateCol != "date" && $dateCol != "?"){
- gsub(/>|<|?/,"",$dateCol);
- $dateCol=$dateCol"-XX-XX";
- }
- if ($strainCol == "Angola" || $strainCol == "Pestoides A" || $strainCol == "Pestoides F"){
- $dateCol="[1950.00:1983.99]"
- }
- if ($strainCol == "India195"){
- $dateCol="[1898.99:1950.00]"
- }
- if ($strainCol == "G8786"){
- $dateCol="[1900.00:1999.99]"
- }
- print $0}' cui2013/nextstrain/metadata_nextstrain_edit.tsv > cui2013/nextstrain/metadata_nextstrain_dates.tsv
+Geocoding
+---------------
Edit the BioSampleGeographicLocation column so that location is simply country name. Also change select country names.
+Geocode the GeographicLocation column to get lat lon coordinates.
+Replace the division name 'country' with our column name 'BioSampleGeographicLocation' in the lat lon file.
-**Morelli 2010 Dataset**::
-
- awk -F "\t" -v geoCol=6 'BEGIN{OFS=FS}{
- if($geoCol != "BioSampleGeographicLocation" && $geoCol != "?"){
- geoColLen=split($geoCol,geoColSplit,",");
- $geoCol=geoColSplit[geoColLen];
- gsub(/^ /,"",$geoCol)
- }
- print $0}' morelli2010/nextstrain/metadata_nextstrain_dates.tsv > morelli2010/nextstrain/metadata_nextstrain_country.tsv
-
- sed -i 's/USSR/Russia/g' morelli2010/nextstrain/metadata_nextstrain_country.tsv
- sed -i 's/Kurdistan/Iran/g' morelli2010/nextstrain/metadata_nextstrain_country.tsv
- sed -i 's/USA/United States of America/g' morelli2010/nextstrain/metadata_nextstrain_country.tsv
-
+**Geocoding**::
-**Cui 2013 Dataset**::
+ project=morelli2010;
+ #project=cui2013;
awk -F "\t" -v geoCol=6 'BEGIN{OFS=FS}{
if($geoCol != "BioSampleGeographicLocation" && $geoCol != "?"){
@@ -207,52 +178,20 @@ Edit the BioSampleGeographicLocation column so that location is simply country n
$geoCol=geoColSplit[geoColLen];
gsub(/^ /,"",$geoCol)
}
- print $0}' cui2013/nextstrain/metadata_nextstrain_dates.tsv > cui2013/nextstrain/metadata_nextstrain_country.tsv
-
- sed -i 's/USSR/Russia/g' cui2013/nextstrain/metadata_nextstrain_country.tsv
- sed -i 's/Kurdistan/Iran/g' cui2013/nextstrain/metadata_nextstrain_country.tsv
- sed -i 's/USA/United States of America/g' cui2013/nextstrain/metadata_nextstrain_country.tsv
-
+ print $0}' $project/nextstrain/metadata_nextstrain_dates.tsv > $project/nextstrain/metadata_nextstrain_country.tsv
-Geocode the GeographicLocation column to get lat lon coordinates.
-
-**Morelli 2010 Dataset**::
+ sed -i 's/USSR/Russia/g' $project/nextstrain/metadata_nextstrain_country.tsv
+ sed -i 's/Kurdistan/Iran/g' $project/nextstrain/metadata_nextstrain_country.tsv
+ sed -i 's/USA/United States of America/g' $project/nextstrain/metadata_nextstrain_country.tsv
scripts/geocode_NextStrain.py \
- --in-tsv morelli2010/nextstrain/metadata_nextstrain_country.tsv \
+ --in-tsv $project/nextstrain/metadata_nextstrain_country.tsv \
--loc-col BioSampleGeographicLocation \
- --out-tsv morelli2010/nextstrain/metadata_nextstrain_geocode.tsv \
- --out-lat-lon morelli2010/nextstrain/lat_longs.tsv \
+ --out-tsv $project/nextstrain/metadata_nextstrain_geocode.tsv \
+ --out-lat-lon $project/nextstrain/lat_longs.tsv \
--div country
-**Cui 2013 Dataset**::
-
- scripts/geocode_NextStrain.py \
- --in-tsv cui2013/nextstrain/metadata_nextstrain_country.tsv \
- --loc-col BioSampleGeographicLocation \
- --out-tsv cui2013/nextstrain/metadata_nextstrain_geocode.tsv \
- --out-lat-lon cui2013/nextstrain/lat_longs.tsv \
- --div country
-
-Replace the division name 'country' with our column name 'BioSampleGeographicLocation' in the lat lon file.
-
-**Morelli 2010 Dataset**::
-
- sed -i 's/country/BioSampleGeographicLocation/g' morelli2010/nextstrain/lat_longs.tsv
-
-**Cui 2013 Dataset**::
-
- sed -i 's/country/BioSampleGeographicLocation/g' cui2013/nextstrain/lat_longs.tsv
-
-Last Fixups. Standarize biovar spelling.
-
-**Morelli 2010 Dataset**::
-
- sed -i 's/Mediaevalis/Medievalis/g' morelli2010/nextstrain/metadata_nextstrain_geocode.tsv
-
-**Cui 2013 Dataset**::
-
- sed -i 's/Mediaevalis/Medievalis/g' cui2013/nextstrain/metadata_nextstrain_geocode.tsv
+ sed -i 's/country/BioSampleGeographicLocation/g' $project/nextstrain/lat_longs.tsv
------------