Skip to content

Commit 5cbf261

Browse files
author
Bill Majoros
committed
update
0 parents  commit 5cbf261

8 files changed

+322
-0
lines changed

extract-gaa.py

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import os
4+
import gzip
5+
6+
if(len(sys.argv)!=4):
7+
print sys.argv[0]+" <in.vcf.gz> <begin> <end>"
8+
sys.exit(0)
9+
[infile,begin,end]=sys.argv[1:]
10+
begin=int(begin)
11+
end=int(end)
12+
13+
f=gzip.open(infile)
14+
for line in f:
15+
line.rstrip("\n")
16+
fields=line.split()
17+
if len(fields)<7: continue
18+
if fields[0]=="#CHROM":
19+
header=fields
20+
print line
21+
elif fields[6]=="PASS":
22+
[chr,pos,id,ref,alt]=fields[:5]
23+
pos=int(pos)
24+
if pos>=begin and pos<=end: print line
25+
26+

get-SNP-IDs.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import os
4+
import re
5+
import basic
6+
7+
name=sys.argv[0];
8+
if(len(sys.argv)!=2):
9+
print name+" <*.vcf>"
10+
sys.exit(0)
11+
[name,vcfFile]=sys.argv;
12+
13+
IN=open(vcfFile,"r")
14+
while(True):
15+
line=IN.readline()
16+
#if(line is ""):
17+
if not line: break
18+
line.rstrip("\n")
19+
if(re.search("#",line)): continue
20+
fields=line.split()
21+
if(len(fields)<10): continue
22+
[chr,pos,id,ref,alt]=fields[:5]
23+
if(len(ref)!=1 or len(alt)!=1): continue
24+
if(not re.search("rs",id)):
25+
id=chr+"at"+pos;
26+
print id+"\t"+chr+"\t"+pos+"\t"+ref+"\t"+alt
27+
IN.close();
28+

make-fastqc-slurms.py

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import os
4+
import basic
5+
import glob
6+
import re
7+
8+
# Global variables
9+
samplesDir="/data/chilab/RNAseq_2015-07"
10+
slurmDir="/data/chilab/bill/slurm-fastqc"
11+
outputDir="/data/chilab/bill/fastqc"
12+
fastqc="/data/chilab/bill/software/FastQC/fastqc"
13+
14+
# Make output directory
15+
if(not os.path.exists(slurmDir)):
16+
os.makedirs(slurmDir)
17+
18+
# Get list of sample directories
19+
samples=glob.glob(samplesDir+"/Sample_*")
20+
21+
# Process each sample
22+
jobID=1
23+
for sample in samples:
24+
match=re.search("(Sample_\S+)",sample); id=match.group(0)
25+
outfile=slurmDir+"/"+id+".slurm"
26+
OUT=open(outfile,"w")
27+
header="\n".join(["#!/bin/bash",
28+
"#",
29+
"#SBATCH -J FASTQC%(jobID)i" % locals(),
30+
"#SBATCH -o FASTQC%(jobID)i.output" % locals(),
31+
"#SBATCH -e FASTQC%(jobID)i.output" % locals(),
32+
"#SBATCH -A FASTQC%(jobID)i" % locals(),
33+
"#\n"])
34+
print >>OUT, header
35+
print >>OUT, "cd "+outputDir
36+
37+
# Process each file
38+
files=glob.glob(sample+"/*.fastq.gz")
39+
for file in files:
40+
command=fastqc+" -o "+outputDir+" "+file
41+
print >>OUT, command
42+
43+
OUT.close()
44+
jobID=jobID+1

make-star-slurms.py

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import os
4+
import basic
5+
import glob
6+
import re
7+
8+
# Global variables
9+
jobName="STAR"
10+
samplesDir="/data/chilab/RNAseq_2015-07"
11+
slurmDir="/data/chilab/bill/slurm-STAR"
12+
starIndex="/data/chilab/bill/STAR-index"
13+
fastqFiles="/data/chilab/bill/STAR"
14+
outputDir="/data/chilab/bill/sam"
15+
sjdbOverhang=125
16+
numThreads=8
17+
memory=40000
18+
STAR="/data/reddylab/software/STAR_2.4.2a/STAR-STAR_2.4.2a/bin/Linux_x86_64/STAR";
19+
20+
# Make output directories
21+
if(not os.path.exists(slurmDir)):
22+
os.makedirs(slurmDir)
23+
if(not os.path.exists(outputDir)):
24+
os.makedirs(outputDir);
25+
26+
# Get list of sample directories
27+
samples=glob.glob(samplesDir+"/Sample_*")
28+
29+
# Process each sample
30+
jobID=1
31+
for sample in samples:
32+
match=re.search("(Sample_\S+)",sample); id=match.group(0)
33+
outfile=slurmDir+"/"+id+".slurm"
34+
OUT=open(outfile,"w")
35+
header="\n".join(["#!/bin/bash",
36+
"#",
37+
"#SBATCH -J %(jobName)s%(jobID)i" % locals(),
38+
"#SBATCH -o %(jobName)s%(jobID)i.output" % locals(),
39+
"#SBATCH -e %(jobName)s%(jobID)i.output" % locals(),
40+
"#SBATCH -A %(jobName)s%(jobID)i" % locals(),
41+
"#SBATCH --mem %(memory)i" %locals(),
42+
"#SBATCH --cpus-per-task=%(numThreads)s" %locals(),
43+
"#"])
44+
print >>OUT, header
45+
#print >>OUT, "cd "+outputDir
46+
print >>OUT, "cd "+outputDir
47+
48+
# Process each file
49+
files=glob.glob(sample+"/*.fastq.gz")
50+
for file in files:
51+
match=re.search("([^/]+)\s*$",file);
52+
if(match is None): sys.exit("can't parse filename")
53+
fileNoPath=match.group(1)
54+
match=re.search("(\S+_R)([12])(_\S+.fastq.gz)",fileNoPath);
55+
if(match is None): sys.exit("can't parse paired file indicator: "+fileNoPath)
56+
prefix=match.group(1)
57+
R=int(match.group(2))
58+
suffix=match.group(3)
59+
if(R!=1): continue
60+
firstFile=fastqFiles+"/"+fileNoPath
61+
secondFile=fastqFiles+"/"+prefix+"2"+suffix
62+
match=re.search("(\S+).fastq.gz",fileNoPath)
63+
if(match is None): sys.exit("Can't parse filename")
64+
filestem=match.group(1)
65+
command=STAR+" --genomeLoad LoadAndKeep --genomeDir %(starIndex)s --readFilesIn %(firstFile)s %(secondFile)s --readFilesCommand zcat --outFileNamePrefix %(filestem)s --outSAMstrandField intronMotif --runThreadN %(numThreads)i" % locals()
66+
print >>OUT, command
67+
68+
OUT.close()
69+
jobID=jobID+1
70+

parse-vcf.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import os
4+
import gzip
5+
6+
if(len(sys.argv)!=2):
7+
print sys.argv[0]+" <in.vcf.gz>"
8+
sys.exit(0)
9+
[infile]=sys.argv[1:]
10+
11+
for line in gzip.open(infile):
12+
line.rstrip("\n")
13+
fields=line.split()
14+
if len(fields)<7: continue
15+
if fields[0]=="#CHROM":
16+
individuals=fields[9:]
17+
numIndiv=len(individuals)
18+
genotype={}
19+
for id in individuals: genotype[id]=[]
20+
elif fields[6]=="PASS":
21+
[chr,pos,id,ref,alt]=fields[:5]
22+
print id+":chr"+chr+":"+pos+":"+ref+":"+alt+"\t",
23+
genotypes=fields[9:]
24+
for i in range(0,numIndiv):
25+
id=individuals[i]
26+
gt=genotypes[i]
27+
genotype[id].append(gt)
28+
print "\n"
29+
for id in individuals:
30+
gt=genotype[id]
31+
print id+"\t"+"\t".join(gt)

popstar-step1.py

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import os
4+
import distutils
5+
from distutils.spawn import find_executable
6+
import subprocess
7+
import basic
8+
9+
name=sys.argv[0];
10+
11+
if(len(sys.argv)!=5):
12+
print "\n"+name+""" <SNPs.vcf> <chrom.fasta> <amplicons.bed> <fastq-dir>
13+
SNPs.vcf = VCF file containing variants
14+
chrom.fasta = FASTA file containing a single chromosome DNA sequence
15+
amplicons.bed = BED file containing chromosome coordinates of amplicons
16+
fastq-dir = path to directory containing FASTQ files
17+
"""
18+
sys.exit(0)
19+
[name,vcfFile,chrFile,ampliconsBed,fastqDir]=sys.argv
20+
workingDir=os.getcwd()
21+
alignmentsDir="aligned"
22+
if(not os.path.exists(alignmentsDir)):
23+
os.makedirs(alignmentsDir)
24+
25+
# First, do some sanity checks
26+
found=distutils.spawn.find_executable("bowtie2");
27+
if(found is None):
28+
print "please install bowtie2\n"
29+
sys.exit(0)
30+
31+
# Make sure scripts are executable
32+
if(not os.path.exists("get-SNP-IDs.pl") or not os.path.exists("fdr.R")):
33+
print "please copy *.pl and *.R scripts into the current directory";
34+
sys.exit(1)
35+
os.system("chmod +x *.pl *.R");
36+
37+
def System(cmd):
38+
print "Excecuting: "+cmd
39+
os.system(cmd)
40+
41+
System("get-SNP-IDs.pl "+vcfFile+" > SNPs.txt");
42+
43+
System("get-amplicons.pl "+chrFile+" "+ampliconsBed+" > amplicons.txt");
44+
45+
System("make-haplotypes.pl "+vcfFile+" haplotypes.fasta > haplotypes.txt");
46+
47+
System("cat haplotypes.txt | awk '{print $2 \"\t\" $1}' > amp-hap.txt");
48+
49+
System("get-SNP-haplotypes.pl haplotypes.txt > SNP-haplotypes.txt");
50+
51+
System("rm -f hap*bt2");
52+
53+
System("bowtie2-build haplotypes.fasta hap");
54+
55+
System("make-bowtie-slurms.pl "+fastqDir+" "+alignmentsDir+" "+workingDir);
56+
57+
print "Please run bowtie2 now -- see commands in bowtie-slurms directory or bowtie-commands.sh\n";
58+

popstar-step2.py

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import os
4+
import distutils
5+
from distutils.spawn import find_executable
6+
import subprocess
7+
import basic
8+
9+
name=sys.argv[0];
10+
11+
if(len(sys.argv)!=1):
12+
print name+" <dna.sam>";
13+
sys.exit(0)
14+
[name,dnaRep]=sys.argv;
15+
16+
alignmentDir="aligned";
17+
18+
def System(cmd):
19+
print "Excecuting: "+cmd
20+
os.system(cmd)
21+
22+
System("get-haplo-read-counts.pl "+alignmentDir+" "+dnaRep+" > haplo-read-counts.txt");
23+
24+
System("get-SNP-allele-counts.pl > SNP-read-counts.txt");
25+
26+
System("filter-SNP-read-counts.pl SNP-read-counts.txt > SNP-read-counts-filtered.txt");
27+
28+
System("SNP-fisher.R SNP-read-counts-filtered.txt > SNP-fdr.txt");
29+
30+
System("analyze-haplotypes.pl "+alignmentDir+" "+dnaRep+" > analyze-haplotypes.txt");
31+
32+
System("filter-zeros.pl > nonzero.txt");
33+
34+
System("fdr.R > haplotype-fdr.txt");
35+
36+
System("join2.pl > join.txt");
37+
38+
39+

rank-variants.py

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import os
4+
import operator
5+
6+
if(len(sys.argv)!=2):
7+
print sys.argv[0]+" <in.txt>"
8+
sys.exit(0)
9+
[infile]=sys.argv[1:]
10+
11+
f=open(infile)
12+
header=f.readline()
13+
variants=header.split()
14+
counts={}
15+
for line in f:
16+
line.rstrip("\n")
17+
fields=line.split()
18+
n=len(fields)
19+
for i in range(1,n):
20+
id=variants[i-1]
21+
if fields[i]!="1|1": continue
22+
if not id in counts: counts[id]=1
23+
else: counts[id]+=1
24+
ranked=sorted(counts.items(), key=operator.itemgetter(1),
25+
reverse=True)
26+
for variant in ranked: print variant[0],"\t",variant[1]

0 commit comments

Comments
 (0)