diff --git a/.github/workflows/build-sif.yml b/.github/workflows/build-sif.yml new file mode 100644 index 0000000..2ca5621 --- /dev/null +++ b/.github/workflows/build-sif.yml @@ -0,0 +1,31 @@ +name: Build SIF from Def File + +on: + push: + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Check out code + uses: actions/checkout@v2 + + - name: Setup Singularity + uses: eWaterCycle/setup-singularity@v7 + with: + singularity-version: '3.8.3' + + - name: Install debootstrap + run: sudo apt-get update && sudo apt-get install -y debootstrap + + - name: Build SIF from Definition File + run: | + singularity build --fakeroot GThaCk.sif GThaCk.def + + - name: Upload SIF as Artifact + uses: actions/upload-artifact@v2 + with: + name: GThaCk.sif + path: GThaCk.sif \ No newline at end of file diff --git a/GThaCk.def b/GThaCk.def new file mode 100644 index 0000000..22ab4a5 --- /dev/null +++ b/GThaCk.def @@ -0,0 +1,105 @@ +Bootstrap: debootstrap +OSVersion: bionic +MirrorURL: http://us.archive.ubuntu.com/ubuntu/ +Include: apt + +%help + This container contains software for the CCPM biobank pertaining to biobank data qc, freeze generation, and gtc manipulations and validations. See documentation of packages at: https://github.com/tbrunetti/GThaCk and https://github.com/Illumina/GTCtoVCF + +%setup + +%post + echo "Installing all container dependencies!" + apt-get -y update + apt-get install -y software-properties-common + apt-get install -y gpgv gpgsm gnupg-l10n gnupg + apt-get -y install git + add-apt-repository universe + apt-get -y install libpcre2-8-0 libcurl4-openssl-dev + apt-get install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev wget + add-apt-repository ppa:deadsnakes/ppa + apt-get -y update + apt-get -y install autoconf + apt-get -y install automake + apt-get -y install libbz2-dev + apt-get -y install liblzma-dev + apt-get -y install libgsl0-dev + apt-get -y install perl + wget https://github.com/Kitware/CMake/releases/download/v3.15.2/cmake-3.15.2.tar.gz + tar -zxvf cmake-3.15.2.tar.gz + cd cmake-3.15.2 + ./bootstrap + make + make install + apt-get -y install libssl1.1 + apt-get -y install libxml2-dev + apt-get -y install python3 + apt-get -y install python3-pip + apt-get -y install python3-dev + apt-get -y install 2to3 + apt-get -y install python3-lib2to3 + apt-get install python3-toolz + python3 -m pip install --upgrade pip + python3 -m pip install cget + python3 -m pip install numpy + python3 -m pip install pandas + python3 -m pip install matplotlib + python3 -m pip install seaborn + python3 -m pip install pysam + python3 -m pip install pyvcf + python3 -m pip install Cython + which python3 + ln -s /usr/bin/python3 /usr/bin/python + export LC_ALL=C.UTF-8 + export LANG=C.UTF-8 + apt-get -y update + + cd /opt/ + git clone https://github.com/samtools/htslib.git + cd /opt/htslib/ + git submodule update --init --recursive + autoheader + autoconf -Wno-syntax + ./configure + make + make install + + cd /opt/ + git clone https://github.com/samtools/samtools.git + cd /opt/samtools/ + autoheader + autoconf -Wno-syntax + ./configure + make + make install + + cd /opt/ + git clone https://github.com/samtools/bcftools.git + cd bcftools + autoheader && autoconf && ./configure --enable-libgsl + make + make install + cd /opt/ + git clone https://github.com/sgopalan98/GThaCk.git + cd /opt/GThaCk/ + git checkout fixing-bug-manipulate-gtc + cd /opt/GThaCk/modules + mv IlluminaBeadArrayFiles.tar.gz /usr/local/lib/python3.6/dist-packages/ + cd /usr/local/lib/python3.6/dist-packages/ + tar -zxvf IlluminaBeadArrayFiles.tar.gz + cd /opt/ + git clone https://github.com/Illumina/GTCtoVCF.git + cd /opt/GTCtoVCF/scripts + ./download_reference.sh 37 + ./download_reference.sh 38 + +%files + +%environment + export NETCDF_INCLUDE=/usr/include + export PATH="${PATH}:/mnt:/opt/GTCtoVCF:/opt/:/opt/GThaCk/:/usr/bin:/bin:/usr/local/sbin/:/usr/local/bin" + export PYTHONPATH="${PYTHONPATH}:/mnt/:/opt/" + export PYTHONPATH="${PYTHONPATH}:/usr/local/bin/:/usr/local/lib/python3.6/dist-packages/:/usr/local/lib/python3.6/:/usr/bin/:/opt/GThaCk/:/opt/GThaCk/modules" + +%runscript + exec "$@" diff --git a/gtcFuncs.py b/gtcFuncs.py index e4a4741..e60e7ec 100644 --- a/gtcFuncs.py +++ b/gtcFuncs.py @@ -7,9 +7,10 @@ class GtcFunctions: - def __init__(self, bpm, gtcDir, outDir): + def __init__(self, bpm, bpm_csv, gtcDir, outDir): self.logger = logging.getLogger("classObject") self.bpm = bpm + self.bpm_csv = bpm_csv self.gtcDir = gtcDir self.outDir = outDir @@ -133,6 +134,7 @@ def query(): parser = argparse.ArgumentParser(description='Functions and methods for gtc files') parser.add_argument('method', choices=['manipulateGTCs', 'getIntensities', 'sampleInformation', 'createSampleSheet', 'allCombos']) parser.add_argument('--bpm', required=True, type=str, help='Full path to bead pool manifest file (.bpm); must be same one used to generate gtc') + parser.add_argument('--bpm-csv',required=True, type=str, help='Full path to bead pool manifest file in CSV form (.csv); must be same one used to generate gtc') parser.add_argument('--gtcDir', type=str, default=os.getcwd(), help='Full path to location of directory/folder containing gtc files to process (files must end in .gtc) -- will not recursively go into subdirectories') parser.add_argument('--outDir', default=os.getcwd(), type=str,help='Full path to directory or folder to output results. If it path does not exist, program will attempt to create it') parser.add_argument('--updates', default=None, type=str, help='Full path to file containing snps and/or metadata to update') @@ -184,7 +186,7 @@ def query(): if args.method == 'manipulateGTCs': logger.info('method manipulateGTCs selected \n creating new object of class GtcFunctions') - analysisObj = GtcFunctions(args.bpm, args.gtcDir, args.outDir) + analysisObj = GtcFunctions(args.bpm, args.bpm_csv, args.gtcDir, args.outDir) analysisObj.manipulateUpdate(args.updates, args.overrides) elif args.method == 'createSampleSheet': diff --git a/modules/manipulateGTC.py b/modules/manipulateGTC.py index f0fbc0a..48b38eb 100644 --- a/modules/manipulateGTC.py +++ b/modules/manipulateGTC.py @@ -18,6 +18,7 @@ def manipulate_gtc(self): logger.debug('In method manipulate_gtc()') bpm=self.bpm + bpm_csv=self.bpm_csv gtcDir=self.gtcDir outDir=self.outDir snpsToUpdate=self.snpUpdateFile @@ -73,31 +74,64 @@ def snpUpdate(data, line): logger = logging.getLogger('snpUpdate') logger.debug("In sub-method of manipulate_gtc() -- snpUpdate()") + COMPLEMENT_MAP = dict(zip("ABCDGHKMRTVYNID", "TVGHCDMKYABRNID")) + loc = manifest.names.index(line.rstrip().split()[0]) - originalSnp = data[1003][loc] - data[1003][loc] = str(line.rstrip().split()[1]).encode() - if ((str(line.rstrip().split()[1])[0] != str(line.rstrip().split()[1])[1]) and (str(line.rstrip().split()[1])[0] != '-')): + manifest_name = manifest.names[loc] + manifest_name_csv = manifest_csv.names[loc] + assert manifest_name == manifest_name_csv + + + manifestSnpsStr = manifest.snps[loc].decode() + manifestSnps = [manifestSnpsStr[1], manifestSnpsStr[-2]] + if manifest.ref_strands[loc] == RefStrand.Minus: + manifestSnps = [COMPLEMENT_MAP[snp] for snp in manifestSnps] + + + newSnpsStr = str(line.rstrip().split()[1]) + newSnps = [newSnpsStr[0], newSnpsStr[1]] + + # Getting genotype byte + if newSnps[0] != newSnps[1] and newSnps[0] != '-': data[1002][loc] = 2 - elif (str(line.rstrip().split()[1])[0] == '-') and (str(line.rstrip().split()[1])[1] == '-'): + elif newSnps[0] == '-' and newSnps[1] == '-': data[1002][loc] = 0 - elif (str(line.rstrip().split()[1])[0] == str(line.rstrip().split()[1])[1]) and (str(line.rstrip().split()[1])[0] in ['A', 'T', 'G', 'C']) and (str(line.rstrip().split()[1])[1] in ['A', 'T', 'G', 'C']): - if manifest.snps[loc].decode().find(str(line.rstrip().split()[1])[0]) != -1: - data[1002][loc] = manifest.snps[loc].decode().find(str(line.rstrip().split()[1])[0]) + data[1003][loc] = '--'.encode() + elif newSnps[0] == newSnps[1]: + if newSnps[0] in manifestSnps: + if newSnps[0] == manifestSnps[0]: + data[1002][loc] = 1 + else: + data[1002][loc] = 3 else: - logger.warning('WARNING! {} allele possibilities do not match manifest. {}={} and manifest={}. This snp will not be updated.' - .format(line.rstrip().split()[0], - line.rstrip().split()[0], originalSnp, - manifest.snps[loc])) - print( - 'WARNING! {} allele possibilities do not match manifest. {}={} and manifest={}. This snp will not be updated.' - .format(line.rstrip().split()[0], - line.rstrip().split()[0], originalSnp, - manifest.snps[loc])) - sys.stdout.flush() - data[1003][loc] = originalSnp + newSnps = [COMPLEMENT_MAP[snp] for snp in newSnps] + if newSnps[0] == manifestSnps[0]: + data[1002][loc] = 1 + else: + data[1002][loc] = 3 else: - pass + print("NO CONDITION MET; SOMETHING IS WRONG") + # Getting base call byte + if data[1002][loc] == 0: + data[1003][loc] = '--'.encode() + else: + if newSnps[0] in ['I', 'D']: + if data[1002][loc] == 1: + data[1003][loc] = (manifestSnps[0] + manifestSnps[0]).encode() + elif data[1002][loc] == 2: + data[1003][loc] = (manifestSnps[0] + manifestSnps[1]).encode() + else: + data[1003][loc] = (manifestSnps[1] + manifestSnps[1]).encode() + else: + allele_a = manifest_csv.alleles[loc][0] + allele_b = manifest_csv.alleles[loc][1] + if data[1002][loc] == 1: + data[1003][loc] = (allele_a + allele_a).encode() + elif data[1002][loc] == 2: + data[1003][loc] = (allele_a + allele_b).encode() + else: + data[1003][loc] = (allele_b + allele_b).encode() return data @@ -174,6 +208,44 @@ def snpOverride(manifest, overrides): ################################################################################################################### logger.debug('Preparing to read in bpm file...') manifest = BeadPoolManifest(bpm) + def get_manifest_csv(): + import csv + + def read_csv(file_path): + with open(file_path) as f: + reader = csv.reader(f) + for i in range(7): + next(reader) + header = next(reader) + for row in reader: + yield dict(zip(header, row)) + + + file_path = self.bpm_csv + # for row in read_csv(file_path): + # print(row.header) + rows = list(read_csv(file_path)) + class Manifest: + def __init__(self): + self.names = [] + self.alleles = [] + + manifest = Manifest() + for row in rows: + if 'Name' not in row or 'TopGenomicSeq' not in row: + continue + name = row['Name'] + top_genomic_seq = row['TopGenomicSeq'] + # get the string in top_genomic_seq which is in the format [alphabet/alphabet] and which will be in any index in the top_genomic_seq + allele = 'NA' + for i in range(len(top_genomic_seq) - 1): + if top_genomic_seq[i] == '[' and top_genomic_seq[i + 4] == ']': + allele = top_genomic_seq[i + 1], top_genomic_seq[i + 3] + break + manifest.names.append(name) + manifest.alleles.append(allele) + return manifest + manifest_csv = get_manifest_csv() logger.debug('Successfully loaded BPM file') #######################