update

Bill Majoros · Bill Majoros · commit 5cbf2614068b · 2015-10-20T13:05:57.000-04:00
diff --git a/extract-gaa.py b/extract-gaa.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+import sys
+import os
+import gzip
+
+# Extracts a region from a gzipped VCF file: writes the #CHROM header line
+# and every record whose FILTER column is PASS and whose position lies in the
+# closed interval [begin,end] to standard output.  The chromosome column is
+# not consulted.
+
+if(len(sys.argv)!=4):
+   print sys.argv[0]+" <in.vcf.gz> <begin> <end>"
+   sys.exit(0)
+[infile,begin,end]=sys.argv[1:]
+begin=int(begin)
+end=int(end)
+
+f=gzip.open(infile)
+try:
+   for line in f:
+      # rstrip() returns a new string, so the result has to be kept;
+      # otherwise print adds a second newline and every record is followed
+      # by a blank line
+      line=line.rstrip("\n")
+      fields=line.split()
+      if len(fields)<7: continue
+      if fields[0]=="#CHROM":
+         print line
+      elif fields[6]=="PASS":
+         pos=int(fields[1])
+         if begin<=pos<=end: print line
+finally:
+   f.close()
+
+
diff --git a/get-SNP-IDs.py b/get-SNP-IDs.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+import sys
+import os
+import re
+import basic
+
+# Lists the variants of a VCF file whose REF and ALT are both one character
+# long, one per line, as: id <tab> chrom <tab> pos <tab> ref <tab> alt
+# A variant whose ID column does not contain "rs" is named <chrom>at<pos>.
+
+name=sys.argv[0];
+if(len(sys.argv)!=2):
+   print name+" <*.vcf>"
+   sys.exit(0)
+[name,vcfFile]=sys.argv;
+
+# The with-block closes the file even if a line fails to parse
+with open(vcfFile,"r") as IN:
+  for line in IN:
+    # Header and meta lines start with "#"; testing only the first character
+    # keeps records that merely contain a "#" somewhere in a later column
+    if(line.startswith("#")): continue
+    fields=line.split()
+    if(len(fields)<10): continue
+    [chr,pos,id,ref,alt]=fields[:5]
+    if(len(ref)!=1 or len(alt)!=1): continue
+    if(not re.search("rs",id)):
+       id=chr+"at"+pos;
+    print id+"\t"+chr+"\t"+pos+"\t"+ref+"\t"+alt
+
diff --git a/make-fastqc-slurms.py b/make-fastqc-slurms.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+import sys
+import os
+import basic
+import glob
+import re
+
+# Writes one SLURM batch script per sample directory; each script runs
+# FastQC on every *.fastq.gz file found in that sample directory.
+
+# Global variables
+samplesDir="/data/chilab/RNAseq_2015-07"
+slurmDir="/data/chilab/bill/slurm-fastqc"
+outputDir="/data/chilab/bill/fastqc"
+fastqc="/data/chilab/bill/software/FastQC/fastqc"
+
+# Make output directories.  outputDir is needed as well, because every
+# generated script changes into it and passes it to fastqc via -o
+if(not os.path.exists(slurmDir)):
+    os.makedirs(slurmDir)
+if(not os.path.exists(outputDir)):
+    os.makedirs(outputDir)
+
+# Get list of sample directories
+samples=glob.glob(samplesDir+"/Sample_*")
+
+# Process each sample
+jobID=1
+for sample in samples:
+  # The script is named after the "Sample_..." part of the directory path
+  match=re.search(r"(Sample_\S+)",sample); id=match.group(0)
+  outfile=slurmDir+"/"+id+".slurm"
+  header="\n".join(["#!/bin/bash",
+  "#",
+  "#SBATCH -J FASTQC%(jobID)i" % locals(),
+  "#SBATCH -o FASTQC%(jobID)i.output" % locals(),
+  "#SBATCH -e FASTQC%(jobID)i.output" % locals(),
+  "#SBATCH -A FASTQC%(jobID)i" % locals(),
+  "#\n"])
+  # The with-block closes the script even if a write fails
+  with open(outfile,"w") as OUT:
+    print >>OUT, header
+    print >>OUT, "cd "+outputDir
+
+    # Process each file
+    for file in glob.glob(sample+"/*.fastq.gz"):
+      print >>OUT, fastqc+" -o "+outputDir+" "+file
+
+  jobID=jobID+1
diff --git a/make-star-slurms.py b/make-star-slurms.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+import sys
+import os
+import basic
+import glob
+import re
+
+# Writes one SLURM batch script per sample directory; each script runs STAR
+# once for every R1/R2 pair of *.fastq.gz files of that sample.
+
+# Global variables
+jobName="STAR"
+samplesDir="/data/chilab/RNAseq_2015-07"
+slurmDir="/data/chilab/bill/slurm-STAR"
+starIndex="/data/chilab/bill/STAR-index"
+fastqFiles="/data/chilab/bill/STAR"
+outputDir="/data/chilab/bill/sam"
+sjdbOverhang=125
+numThreads=8
+memory=40000
+STAR="/data/reddylab/software/STAR_2.4.2a/STAR-STAR_2.4.2a/bin/Linux_x86_64/STAR";
+
+# Make output directories
+if(not os.path.exists(slurmDir)):
+    os.makedirs(slurmDir)
+if(not os.path.exists(outputDir)):
+    os.makedirs(outputDir);
+
+# Get list of sample directories
+samples=glob.glob(samplesDir+"/Sample_*")
+
+# Process each sample
+jobID=1
+for sample in samples:
+  match=re.search(r"(Sample_\S+)",sample); id=match.group(0)
+  outfile=slurmDir+"/"+id+".slurm"
+  header="\n".join(["#!/bin/bash",
+  "#",
+  "#SBATCH -J %(jobName)s%(jobID)i" % locals(),
+  "#SBATCH -o %(jobName)s%(jobID)i.output" % locals(),
+  "#SBATCH -e %(jobName)s%(jobID)i.output" % locals(),
+  "#SBATCH -A %(jobName)s%(jobID)i" % locals(),
+  "#SBATCH --mem %(memory)i" %locals(),
+  "#SBATCH --cpus-per-task=%(numThreads)s" %locals(),
+  "#"])
+  # The with-block closes the script even when sys.exit() aborts below
+  with open(outfile,"w") as OUT:
+    print >>OUT, header
+    print >>OUT, "cd "+outputDir
+
+    # Process each file
+    for file in glob.glob(sample+"/*.fastq.gz"):
+      fileNoPath=os.path.basename(file)
+      # Split the name around the read number so that the mate's name can
+      # be built; the dots are escaped so that only a literal ".fastq.gz"
+      # is accepted
+      match=re.search(r"(\S+_R)([12])(_\S+\.fastq\.gz)",fileNoPath)
+      if(match is None): sys.exit("can't parse paired file indicator: "+fileNoPath)
+      prefix=match.group(1)
+      R=int(match.group(2))
+      suffix=match.group(3)
+      if(R!=1): continue # each pair is emitted once, from its R1 file
+      # NOTE(review): the reads are looked up under fastqFiles, not under
+      # the sample directory that was globbed -- confirm this is intended
+      firstFile=fastqFiles+"/"+fileNoPath
+      secondFile=fastqFiles+"/"+prefix+"2"+suffix
+      match=re.search(r"(\S+)\.fastq\.gz",fileNoPath)
+      if(match is None): sys.exit("Can't parse filename")
+      filestem=match.group(1)
+      command=STAR+" --genomeLoad LoadAndKeep --genomeDir %(starIndex)s --readFilesIn %(firstFile)s %(secondFile)s --readFilesCommand zcat --outFileNamePrefix %(filestem)s --outSAMstrandField intronMotif --runThreadN %(numThreads)i" % locals()
+      print >>OUT, command
+
+  jobID=jobID+1
+
diff --git a/parse-vcf.py b/parse-vcf.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+import sys
+import os
+import gzip
+
+# Transposes the PASS records of a gzipped VCF file: prints one line of
+# variant labels (id:chr<chrom>:pos:ref:alt), an empty line, and then one
+# line per individual holding that individual's column for each variant.
+
+if(len(sys.argv)!=2):
+   print sys.argv[0]+" <in.vcf.gz>"
+   sys.exit(0)
+[infile]=sys.argv[1:]
+
+# Start out empty so that a file without a #CHROM line yields no genotype
+# rows instead of failing with a NameError
+individuals=[]
+genotype={}
+f=gzip.open(infile)
+try:
+   for line in f:
+      fields=line.split()
+      if len(fields)<7: continue
+      if fields[0]=="#CHROM":
+         individuals=fields[9:]
+         genotype={}
+         for indiv in individuals: genotype[indiv]=[]
+      elif fields[6]=="PASS":
+         [chr,pos,id,ref,alt]=fields[:5]
+         # The trailing comma keeps all variant labels on one output line
+         print id+":chr"+chr+":"+pos+":"+ref+":"+alt+"\t",
+         genotypes=fields[9:]
+         for i,indiv in enumerate(individuals):
+            genotype[indiv].append(genotypes[i])
+finally:
+   f.close()
+print "\n"
+for indiv in individuals:
+   print indiv+"\t"+"\t".join(genotype[indiv])
diff --git a/popstar-step1.py b/popstar-step1.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+import sys
+import os
+import distutils
+from distutils.spawn import find_executable
+import subprocess
+import basic
+
+# First stage of the pipeline: runs the helper scripts that derive the SNP,
+# amplicon and haplotype tables, builds a bowtie2 index of haplotypes.fasta,
+# and runs make-bowtie-slurms.pl.  bowtie2 itself must then be run by hand.
+
+name=sys.argv[0];
+
+if(len(sys.argv)!=5):
+   print "\n"+name+""" <SNPs.vcf> <chrom.fasta> <amplicons.bed> <fastq-dir>
+      SNPs.vcf      = VCF file containing variants
+      chrom.fasta   = FASTA file containing a single chromosome DNA sequence
+      amplicons.bed = BED file containing chromosome coordinates of amplicons
+      fastq-dir     = path to directory containing FASTQ files
+   """
+   sys.exit(0)
+[name,vcfFile,chrFile,ampliconsBed,fastqDir]=sys.argv
+workingDir=os.getcwd()
+alignmentsDir="aligned"
+if(not os.path.exists(alignmentsDir)):
+    os.makedirs(alignmentsDir)
+
+# First, do some sanity checks
+found=distutils.spawn.find_executable("bowtie2");
+if(found is None):
+    print "please install bowtie2\n"
+    # A missing prerequisite is an error, like the missing scripts below
+    sys.exit(1)
+
+# Make sure scripts are executable
+if(not os.path.exists("get-SNP-IDs.pl") or not os.path.exists("fdr.R")):
+   print "please copy *.pl and *.R scripts into the current directory";
+   sys.exit(1)
+os.system("chmod +x *.pl *.R");
+
+def System(cmd):
+   """Echo a shell command, run it, and return its exit status.
+
+   A failing command is reported on stderr; execution continues.
+   """
+   print "Excecuting: "+cmd
+   status=os.system(cmd)
+   if(status!=0):
+      sys.stderr.write("WARNING: command failed (status "+str(status)+"): "+cmd+"\n")
+   return status
+
+System("get-SNP-IDs.pl "+vcfFile+" > SNPs.txt");
+
+System("get-amplicons.pl "+chrFile+" "+ampliconsBed+" > amplicons.txt");
+
+System("make-haplotypes.pl "+vcfFile+" haplotypes.fasta > haplotypes.txt");
+
+System("cat haplotypes.txt | awk '{print $2 \"\t\" $1}' > amp-hap.txt");
+
+System("get-SNP-haplotypes.pl haplotypes.txt > SNP-haplotypes.txt");
+
+System("rm -f hap*bt2");
+
+System("bowtie2-build haplotypes.fasta hap");
+
+System("make-bowtie-slurms.pl "+fastqDir+" "+alignmentsDir+" "+workingDir);
+
+print "Please run bowtie2 now -- see commands in bowtie-slurms directory or bowtie-commands.sh\n";
+
diff --git a/popstar-step2.py b/popstar-step2.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+import sys
+import os
+import distutils
+from distutils.spawn import find_executable
+import subprocess
+import basic
+
+# Second stage of the pipeline: runs the read-counting, filtering and FDR
+# helper scripts in order, redirecting each one's output to a text file in
+# the current directory.
+
+name=sys.argv[0];
+
+# Exactly one argument is required, so sys.argv must hold two entries: the
+# script name and <dna.sam>
+if(len(sys.argv)!=2):
+   print name+" <dna.sam>";
+   sys.exit(0)
+[name,dnaRep]=sys.argv;
+
+alignmentDir="aligned";
+
+def System(cmd):
+   """Echo a shell command, run it, and return its exit status.
+
+   A failing command is reported on stderr; execution continues.
+   """
+   print "Excecuting: "+cmd
+   status=os.system(cmd)
+   if(status!=0):
+      sys.stderr.write("WARNING: command failed (status "+str(status)+"): "+cmd+"\n")
+   return status
+
+System("get-haplo-read-counts.pl "+alignmentDir+" "+dnaRep+" > haplo-read-counts.txt");
+
+System("get-SNP-allele-counts.pl > SNP-read-counts.txt");
+
+System("filter-SNP-read-counts.pl SNP-read-counts.txt > SNP-read-counts-filtered.txt");
+
+System("SNP-fisher.R SNP-read-counts-filtered.txt > SNP-fdr.txt");
+
+System("analyze-haplotypes.pl "+alignmentDir+" "+dnaRep+" > analyze-haplotypes.txt");
+
+System("filter-zeros.pl > nonzero.txt");
+
+System("fdr.R > haplotype-fdr.txt");
+
+System("join2.pl > join.txt");
+
+
+
diff --git a/rank-variants.py b/rank-variants.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+import sys
+import os
+import operator
+
+# Ranks variants by how many input rows carry the genotype "1|1" for them,
+# most frequent first.  The first input line names the variants; in every
+# later line the first column is skipped and column i+1 belongs to the i-th
+# variant of the first line.
+
+if(len(sys.argv)!=2):
+   print sys.argv[0]+" <in.txt>"
+   sys.exit(0)
+[infile]=sys.argv[1:]
+
+counts={}
+with open(infile) as f:
+   variants=f.readline().split()
+   for line in f:
+      fields=line.split()
+      for i in range(1,len(fields)):
+         variant=variants[i-1]
+         if fields[i]=="1|1": counts[variant]=counts.get(variant,0)+1
+ranked=sorted(counts.items(), key=operator.itemgetter(1),
+       reverse=True)
+# Build each line by concatenation: "print a,b,c" would put a space between
+# the variant name and the tab
+for variant,count in ranked: print variant+"\t"+str(count)