public-database-snapshot

#!/bin/bash

# Append the current directory and ./scripts to the command search path.
PATH="${PATH}:.:scripts"
export PATH

# Stage switches: each top-level section of this script runs only when its
# flag holds the literal string "true".
phenotype_step=true
report_step=true
clean_step=true

# When "true", the phenotype step only processes the first 10 entries of
# scratch/hid.list.
test_run=false

## debug
#phenotype_step=true
#report_step=true
#clean_step=true
#test_run=true

# Abort on the first failing command, including a failure in any stage of a
# pipeline.
set -eo pipefail

# Start from a clean slate when clean_step is enabled by removing the three
# working directories.  The flag is quoted so an empty or unset value simply
# reads as "not true" instead of producing a `[` syntax error.
#
# NOTE(review): this also removes stage/untap.db, which the report step reads;
# running with phenotype_step disabled and clean_step enabled therefore leaves
# the report step without its input database -- confirm this is intended.
if [[ "$clean_step" == true ]]; then
  rm -rf -- stage out-data scratch
fi

# (Re)create the working directories used by the later steps; -p makes this a
# no-op for directories that already exist.
mkdir -p -- stage out-data scratch

# Reads human IDs from stdin, one per line, and for each one runs
# collect_samples_from_profile_data.py on scratch/<id>/<id>.html, passing the
# ID as the second argument.  The script's output goes to this function's
# stdout so the caller decides where the rows land.
#
# Blank lines are skipped, and a final line that lacks a trailing newline is
# still processed.
collect_specimens() {
  local huid
  # -r keeps backslashes literal.  IFS is intentionally left alone so that
  # leading/trailing whitespace around an ID is still trimmed.
  while read -r huid || [[ -n "$huid" ]]; do
    [[ -n "$huid" ]] || continue
    ./scripts/collect_samples_from_profile_data.py \
      "scratch/$huid/$huid.html" "$huid"
  done
}

# Phenotype step: gather survey, user, profile, specimen and enrollment data
# into stage/ and load it into stage/untap.db.
if [[ "$phenotype_step" == true ]]; then

  # collect survey data
  #
  echo "collecting survey data"
  ./scripts/grab_tap_survey.sh
  ./scripts/create_phenotype.py > stage/survey.tsv

  # get user list
  #
  echo "getting user list"
  ./scripts/grab_user.sh

  # scrape Tapestry for phenotype information
  # and file locations
  #
  echo "scraping tapestry for phenotype info"

  # In a test run only the first 10 IDs of scratch/hid.list are handed over
  # (through a process substitution rather than a real file).
  if [[ "$test_run" == true ]]; then
    ./scripts/grab_profile_data.sh <(head -n10 scratch/hid.list)
  else
    ./scripts/grab_profile_data.sh scratch/hid.list
  fi

  #################################
  #################################
  # get specimen list
  #
  # Disabled alternative kept for reference:
  #echo "getting specimens list"
  #./scripts/grab_specimens.sh
  #mv ./scratch/specimens.tsv ./stage/specimens.tsv
  #
  # Write the header row first, then append one batch of rows per human ID.
  printf 'human_id\thref\ttext\tdescription\tlog_date\tlog_text\tlog_description\n' \
    > ./scratch/specimens.tsv
  if [[ "$test_run" == true ]]; then
    collect_specimens < <(head -n10 scratch/hid.list) >> ./scratch/specimens.tsv
  else
    collect_specimens < scratch/hid.list >> ./scratch/specimens.tsv
  fi
  mv ./scratch/specimens.tsv ./stage/specimens.tsv
  #################################
  #################################


  # collect them into tsv files
  #
  echo "collecting into tsv files"
  ./scripts/collect_profile_data.py

  # scrape Tapestry for enrollment dates
  #
  echo "grabbing enrollment dates"
  ./scripts/grab_enrollment_date.sh
  cp ./scratch/hid-enrolldate.list ./stage/enrollment_date.tsv


  # insert into the untap.db.  Use it to
  # generate the list of download urls for
  # the report collection step.
  #
  # Disabled alternative kept for reference:
  #cd stage
  #cat ../db/untap-db.sql | sqlite3 untap.db
  #echo "select id, human_id, report_url from uploaded_data where length(report_url)>0;" | sqlite3 untap.db > huid_report_url.csv
  #cd ..

  echo "inserting into untap.db"
  ./scripts/export_surveys_to_sqlite3.py ./stage/untap.db ./stage

fi


# Report step: list the report URLs recorded in stage/untap.db, hand that list
# to ge-report/generate, pull its results back into stage/, then apply
# db/untap-db-report.sql and gzip the resulting database.
if [[ "$report_step" == true ]]; then
  # Rows of (id, human_id, report_url) for every upload that has a report URL.
  sqlite3 stage/untap.db \
    <<< "select id, human_id, report_url from uploaded_data where length(report_url)>0;" \
    > stage/huid_report_url.csv

  # download, collect and de-duplicate the report data
  # and insert into untap.db
  #
  echo "downloading report data"
  # The trailing slash makes mv fail if ge-report is not a directory, rather
  # than renaming the CSV to a plain file called "ge-report".
  mv stage/huid_report_url.csv ge-report/
  # Run in a subshell so the working directory of this script never changes.
  (
    cd ge-report || exit 1
    ./generate
  )

  # collect results from the report aggregation step.
  #
  echo "collecting results"
  mv ge-report/out-data/* stage/

  # create final database
  #
  echo "creating final database"
  # The SQL file is fed to sqlite3 with stage/ as the working directory, as
  # before, in case it refers to files by relative path.
  (
    cd stage || exit 1
    sqlite3 untap.db < ../db/untap-db-report.sql
    gzip -c untap.db > untap.sqlite3.gz
  )

fi

# Moves everything under stage/ into out-data/ and recreates the
# ./hu-pgp.sqlite3.gz symlink.
#
# The link is created after the move and targets out-data/untap.sqlite3.gz.
# Linking to stage/untap.sqlite3.gz (as was done previously) left a dangling
# symlink, because that file is moved out of stage/ by this same step.
publish_outputs() {
  rm -f ./hu-pgp.sqlite3.gz
  echo "cleaning up into out-data directory"
  mv stage/* out-data/
  ln -s ./out-data/untap.sqlite3.gz ./hu-pgp.sqlite3.gz
}

if [[ "$clean_step" == true ]]; then
  # finally collect everything into the out-data directory
  #
  publish_outputs

fi