Skip to content

Commit 574df0f

Browse files
author
Bill Majoros
committedJan 4, 2019
update
1 parent 0de3f99 commit 574df0f

9 files changed

+379
-9
lines changed
 

‎DataFrame.py

+131
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
#=========================================================================
2+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
3+
# License (GPL) version 3, as described at www.opensource.org.
4+
# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
5+
#=========================================================================
6+
from __future__ import (absolute_import, division, print_function,
7+
unicode_literals, generators, nested_scopes, with_statement)
8+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
9+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
10+
import sys
11+
from DataFrameRow import DataFrameRow
12+
13+
#=========================================================================
14+
# Attributes:
15+
# header
16+
# matrix : array of rows, each of which is an array of data values
17+
# rowHash : dictionary mapping row names to row indices
18+
# colHash : dictionary mapping column names to column indices
19+
# Methods:
20+
# df=DataFrame()
21+
# rowNames=df.getRowNames()
22+
# colNames=df.getColumnNames()
23+
# n=df.nrow()
24+
# n=df.ncol()
25+
# row=df[index]
26+
# elem=df[i][j]
27+
# df.toInt()
28+
# df.toFloat()
29+
# header=df.getHeader()
30+
# df.hashRowNames()
31+
# df.hashColNames()
32+
# row=df.getRow(rowName) # call hashRowNames() first!
33+
# col=df.getColumn(columnName) # call hashColNames() first!
34+
# bool=df.rowExists(rowName) # call hashRowNames() first!
35+
# bool=df.columnExists(colName) # call hashColNames() first!
36+
# Class methods:
37+
# df=DataFrame.readTable(filename,hasHeader=True,hasRowNames=True)
38+
#=========================================================================
39+
40+
class DataFrame:
    """A simple table: a list of labeled rows plus an optional column header.

    Attributes:
        header  : list of column names
        matrix  : list of rows (DataFrameRow objects)
        rowHash : dict mapping row label to row index; None until
                  hashRowNames() has been called
        colHash : dict mapping column name to column index; None until
                  hashColNames() has been called
    """
    def __init__(self):
        self.header=[]
        self.matrix=[]
        self.rowHash=None
        self.colHash=None

    def rowExists(self,rowName):
        """Return True if rowName is a hashed row label.

        Raises Exception if hashRowNames() has not been called.
        """
        if(self.rowHash is None): raise Exception("call hashRowNames() first")
        return rowName in self.rowHash

    def columnExists(self,colName):
        """Return True if colName is a hashed column name.

        Raises Exception if hashColNames() has not been called.
        """
        if(self.colHash is None): raise Exception("call hashColNames() first")
        return colName in self.colHash

    def getRowNames(self):
        """Return the list of row labels, in row order."""
        return [row.label for row in self.matrix]

    def getColumnNames(self):
        """Return the list of column names (the header)."""
        # Fixed: previously returned the undefined global name `header`
        return self.header

    def getRow(self,rowName):
        """Return the row labeled rowName.

        Raises Exception if hashRowNames() has not been called or the
        label is unknown.
        """
        if(self.rowHash is None): raise Exception("call hashRowNames() first")
        rowIndex=self.rowHash.get(rowName,None)
        if(rowIndex is None):
            # str() keeps the message usable for non-string labels
            raise Exception("row not found: "+str(rowName))
        return self.matrix[rowIndex]

    def getColumn(self,colName):
        """Return the column named colName as a DataFrameRow.

        The returned object's label is the column name and its values are
        the entries of that column, one per row, in row order.

        Raises Exception if hashColNames() has not been called or the
        column name is unknown.
        """
        if(self.colHash is None): raise Exception("call hashColNames() first")
        colIndex=self.colHash.get(colName,None)
        if(colIndex is None):
            raise Exception("column not found: "+str(colName))
        column=DataFrameRow()
        column.label=colName
        # Fixed: the original appended to a misspelled name (`colum`) and
        # never returned the column it built
        column.values=[row[colIndex] for row in self.matrix]
        return column

    def hashRowNames(self):
        """(Re)build rowHash from the current rows.

        If several rows share a label, the last one wins.
        """
        self.rowHash={row.label:i for i,row in enumerate(self.matrix)}

    def hashColNames(self):
        """(Re)build colHash from the current header.

        If several columns share a name, the last one wins.
        """
        # Fixed: previously indexed the undefined global name `header`
        self.colHash={name:i for i,name in enumerate(self.header)}

    def getHeader(self):
        """Return the header (list of column names)."""
        return self.header

    def nrow(self):
        """Return the number of rows."""
        return len(self.matrix)

    def ncol(self):
        """Return the number of columns, as given by the header length."""
        return len(self.header)

    def __getitem__(self,i):
        """Return the i-th row (0-based)."""
        return self.matrix[i]

    def toInt(self):
        """Convert every value in every row to int, in place."""
        for row in self.matrix: row.toInt()

    def toFloat(self):
        """Convert every value in every row to float, in place."""
        for row in self.matrix: row.toFloat()

    @classmethod
    def readTable(cls,filename,hasHeader=True,hasRowNames=True):
        """Read a whitespace-delimited text table into a new data frame.

        Parameters:
            filename    : path of the file to read
            hasHeader   : if True, the first line holds the column names
            hasRowNames : if True, the first field of each data line is
                          used as the row label rather than as a value

        Lines with no fields are skipped.  Values are stored as strings;
        call toInt() or toFloat() afterwards to convert them.
        """
        # cls() rather than DataFrame() so subclasses get their own type
        df=cls()
        with open(filename,"rt") as IN:
            if(hasHeader):
                df.header=IN.readline().rstrip().split()
            for line in IN:
                fields=line.rstrip().split()
                if(len(fields)<1): continue
                label=""
                if(hasRowNames):
                    label=fields[0]
                    fields=fields[1:]
                row=DataFrameRow()
                row.label=label
                row.values=fields
                df.matrix.append(row)
        # If the header is longer than the first row, drop its first entry
        # (presumably a name for the row-label column -- TODO confirm)
        if(len(df.matrix)>0 and df.matrix[0].length()<len(df.header)):
            df.header=df.header[1:]
        return df
131+

‎DataFrameRow.py

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#=========================================================================
2+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
3+
# License (GPL) version 3, as described at www.opensource.org.
4+
# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
5+
#=========================================================================
6+
from __future__ import (absolute_import, division, print_function,
7+
unicode_literals, generators, nested_scopes, with_statement)
8+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
9+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
10+
import sys
11+
12+
#=========================================================================
13+
# Attributes:
14+
# label : string
15+
# values : array of values
16+
# Methods:
17+
# row=DataFrameRow()
18+
# elem=row[i] # first element is at 0 (the label is not counted)
19+
# label=row.getLabel()
20+
# row.rename(label)
21+
# n=row.length()
22+
# row.toInt()
23+
# row.toFloat()
24+
#=========================================================================
25+
26+
class DataFrameRow:
    """A labeled row of values; the label is not one of the values.

    Attributes:
        label  : string naming the row
        values : list of data values
    """
    def __init__(self):
        self.values=[]
        self.label=""

    def __getitem__(self,i):
        """Return the i-th value (0-based; the label is not counted)."""
        items=self.values
        return items[i]

    def length(self):
        """Return the number of values in the row."""
        items=self.values
        return len(items)

    def getLabel(self):
        """Return the row's label."""
        name=self.label
        return name

    def rename(self,x):
        """Replace the row's label with x."""
        self.label=x

    def toInt(self):
        """Replace the values with their int() conversions."""
        converted=[]
        for value in self.values:
            converted.append(int(value))
        # Assign only after every value converted, so a failed conversion
        # leaves the row untouched
        self.values=converted

    def toFloat(self):
        """Replace the values with their float() conversions."""
        converted=[]
        for value in self.values:
            converted.append(float(value))
        self.values=converted
48+

‎FastqReader.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#=========================================================================
2+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
3+
# License (GPL) version 3, as described at www.opensource.org.
4+
# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
5+
#=========================================================================
6+
from __future__ import (absolute_import, division, print_function,
7+
unicode_literals, generators, nested_scopes, with_statement)
8+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
9+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
10+
from Rex import Rex
11+
rex=Rex()
12+
import gzip
13+
14+
#=========================================================================
15+
# Attributes:
16+
# fh : file handle
17+
# Instance Methods:
18+
# reader=FastqReader(filename)
19+
# [ID,seq,qual,pair]=reader.nextSequence() # returns None at EOF
20+
# reader.close()
21+
# Class Methods:
22+
#=========================================================================
23+
class FastqReader:
    """Sequential reader for FASTQ files, plain text or gzip-compressed.

    Attributes:
        fh : text-mode file handle, or None if no filename was given
    """
    def __init__(self,filename):
        """Open filename for reading; names ending in .gz are opened via gzip.

        If filename is None no file is opened and fh is left as None.
        """
        # Always define fh so that close() is safe even when nothing was
        # opened (previously the attribute did not exist in that case)
        self.fh=None
        if(filename is not None):
            # Raw string: "\." is an invalid escape in a normal literal
            if(rex.find(r"\.gz$",filename)): self.fh=gzip.open(filename,"rt")
            else: self.fh=open(filename,"r")

    def close(self):
        """Close the underlying file handle, if one was opened."""
        if(self.fh is not None): self.fh.close()

    def nextSequence(self):
        """Read the next four-line record from the file.

        Returns [ID,seq,qual,pair], or None at end of file.  ID is the
        first whitespace-delimited token of the header line; pair is the
        single digit following the first run of whitespace on that line,
        or 1 if there is none.  A header line with no leading
        non-whitespace token is treated like end of file.
        """
        fh=self.fh
        line=fh.readline()
        if(not line): return None
        # NOTE(review): rex.find presumably performs a regex search and
        # rex[1] yields the first captured group -- confirm against Rex
        if(not rex.find(r"^(\S+)",line)): return None
        ID=rex[1]
        pair=1
        if(rex.find(r"\s+(\d)",line)): pair=int(rex[1])
        seq=fh.readline().rstrip()
        fh.readline() # third line of the record is read and discarded
        qual=fh.readline().rstrip()
        return [ID,seq,qual,pair]
48+
49+
50+

‎SamReader.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#=========================================================================
2+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
3+
# License (GPL) version 3, as described at www.opensource.org.
4+
# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
5+
#=========================================================================
6+
from __future__ import (absolute_import, division, print_function,
7+
unicode_literals, generators, nested_scopes, with_statement)
8+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
9+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
10+
from Rex import Rex
11+
rex=Rex()
12+
import gzip
13+
from SamRecord import SamRecord
14+
15+
#=========================================================================
16+
# Attributes:
17+
# fh : file handle
18+
# Instance Methods:
19+
# reader=SamReader(filename)
20+
# rec=reader.nextSequence() # returns a SamRecord, or None at EOF
21+
# reader.close()
22+
# Class Methods:
23+
#=========================================================================
24+
class SamReader:
    """Sequential reader for SAM files, plain text or gzip-compressed.

    Attributes:
        fh : text-mode file handle, or None if no filename was given
    """
    def __init__(self,filename):
        """Open filename for reading; names ending in .gz are opened via gzip.

        If filename is None no file is opened and fh is left as None.
        """
        # Always define fh so that close() is safe even when nothing was
        # opened (previously the attribute did not exist in that case)
        self.fh=None
        if(filename is not None):
            # Raw string: "\." is an invalid escape in a normal literal
            if(rex.find(r"\.gz$",filename)): self.fh=gzip.open(filename,"rt")
            else: self.fh=open(filename,"r")

    def close(self):
        """Close the underlying file handle, if one was opened."""
        if(self.fh is not None): self.fh.close()

    def nextSequence(self):
        """Read the next alignment line and return it as a SamRecord.

        Lines beginning with "@" are skipped.  Returns None at end of
        file.  Raises Exception if a line has fewer than 11
        whitespace-delimited fields.
        """
        fh=self.fh
        line=fh.readline()
        # Skip lines that start with "@"
        while(line and line[0]=="@"):
            line=fh.readline()
        if(not line): return None
        fields=line.rstrip().split()
        if(len(fields)<11): raise Exception("can't parse sam line: "+line)
        # Only the fields SamRecord needs are extracted; positions are
        # 0-based indices into the whitespace-split line
        ID=fields[0]
        refName=fields[2]
        refPos=int(fields[3])
        cigar=fields[5]
        seq=fields[9]
        return SamRecord(ID,refName,refPos,cigar,seq)
48+
49+
# M03884:303:000000000-C4RM6:1:1101:1776:15706 99 chrX:31786371-31797409 6687 44 150M = 6813 271 ATACTATTGCTGCGGTAATAACTGTAACTGCAGTTACTATTTAGTGATTTGTATGTAGATGTAGATGTAGTCTATGTCAGACACTATGCTGAGCATTTTATGGTTGCTATGTACTGATACATACAGAAACAAGAGGTACGTTCTTTTACA BBBBFFFFFFFGGGGGEFGGFGHFHFFFHHHFFHHHFHFHHHGFHEDGGHFHBGFHGBDHFHFFFHHHHFHHHHHGHGFFBGGGHFHFFHHFFFFHHHHGHGFHHGFHGHHHGFHFFHHFHHFFGFFFFGGEHFFEHHFGHHHGHHHHFB AS:i:300 XN:i:0
50+

‎SamRecord.py

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#=========================================================================
2+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
3+
# License (GPL) version 3, as described at www.opensource.org.
4+
# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
5+
#=========================================================================
6+
from __future__ import (absolute_import, division, print_function,
7+
unicode_literals, generators, nested_scopes, with_statement)
8+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
9+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
10+
11+
#=========================================================================
12+
# Attributes:
13+
# ID = read identifier
14+
# refName = name of reference sequence the read aligns to
15+
# refPos = position in reference where alignment begins
16+
# cigar = alignment
17+
# seq = read sequence
18+
# Instance Methods:
19+
# rec=SamRecord(ID,refName,refPos,cigar,seq)
20+
# Class Methods:
21+
#=========================================================================
22+
class SamRecord:
    """Plain record holding selected fields of one SAM alignment.

    Attributes:
        ID      : read identifier
        refName : name of reference sequence the read aligns to
        refPos  : position in reference where alignment begins
        cigar   : alignment
        seq     : read sequence
    """
    def __init__(self,ID,refName,refPos,cigar,seq):
        # Store the constructor arguments unchanged, one attribute each
        (self.ID,self.refName,self.refPos,
         self.cigar,self.seq)=(ID,refName,refPos,cigar,seq)
30+
31+
# M03884:303:000000000-C4RM6:1:1101:1776:15706 99 chrX:31786371-31797409 6687 44 150M = 6813 271 ATACTATTGCTGCGGTAATAACTGTAACTGCAGTTACTATTTAGTGATTTGTATGTAGATGTAGATGTAGTCTATGTCAGACACTATGCTGAGCATTTTATGGTTGCTATGTACTGATACATACAGAAACAAGAGGTACGTTCTTTTACA BBBBFFFFFFFGGGGGEFGGFGHFHFFFHHHFFHHHFHFHHHGFHEDGGHFHBGFHGBDHFHFFFHHHHFHHHHHGHGFFBGGGHFHFFHHFFFFHHHHGHGFHHGFHGHHHGFHFFHHFHHFFGFFFFGGEHFFEHHFGHHHGHHHHFB AS:i:300 XN:i:0
32+

‎Shuffler.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@
1313
# Attributes:
1414
#
1515
# Instance Methods:
16-
# Shuffler()
16+
# shuffler=Shuffler()
1717
# Class Methods:
18-
#
18+
# Shuffler.shuffleArray(array)
19+
# s=Shuffler.shuffleString(s)
1920
#=========================================================================
2021
class Shuffler:
2122
"""Shuffler shuffles arrays and strings"""

‎SlurmWriter.py

+32-6
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818
# threadsValue : number of CPUs requested
1919
# Instance Methods:
2020
# SlurmWriter()
21-
# writer.addCommand(cmd)
22-
# writer.nice() # turns on "nice" (sets it to 100 by default)
23-
# writer.mem(1500)
24-
# writer.threads(16)
25-
# writer.setQueue("new,all")
26-
# writer.writeArrayScript(slurmDir,jobName,maxParallel,
21+
# slurm.addCommand(cmd)
22+
# slurm.nice() # turns on "nice" (sets it to 100 by default)
23+
# slurm.mem(1500)
24+
# slurm.threads(16)
25+
# slurm.setQueue("new,all")
26+
# slurm.writeArrayScript(slurmDir,jobName,maxParallel,
2727
# additional_SBATCH_lines)
2828
#=========================================================================
2929
class SlurmWriter:
@@ -95,6 +95,32 @@ def writeArrayScript(self,slurmDir,jobName,maxParallel,moreSBATCH=""):
9595
queue+moreSBATCH+"#",
9696
slurmDir+"/command${SLURM_ARRAY_TASK_ID}.sh\n"
9797
]))
98+
def writeScript(self,slurmFile,outFile,jobName,command,moreSBATCH=""):
    """Write a single-job batch script to slurmFile.

    Parameters:
        slurmFile  : path of the script file to create (overwritten)
        outFile    : path used for both the -o and -e directives
        jobName    : value used for both the -J and -A directives
        command    : text placed after the directives, as the last line
        moreSBATCH : optional extra directive lines; None is treated as
                     empty, and trailing whitespace is normalized to a
                     single newline

    The nice, mem and cpus-per-task directives are emitted only when the
    corresponding instance value is greater than zero, and the partition
    directive only when self.queue is non-empty.  No newline is written
    after command.
    """
    # NOTE(review): -A is given the job name -- confirm that is intended
    extra="" if moreSBATCH is None else moreSBATCH.rstrip()
    pieces=[]
    if(extra!=""): pieces.append(extra+"\n")
    optional=(("--nice=",self.niceValue),
              ("--mem=",self.memValue),
              ("--cpus-per-task=",self.threadsValue))
    for (flag,value) in optional:
        if(value>0): pieces.append("#SBATCH "+flag+str(value)+"\n")
    directives="".join(pieces)
    partition="#SBATCH -p "+self.queue+"\n" if len(self.queue)>0 else ""
    scriptLines=[
        "#!/bin/sh",
        "#",
        "#SBATCH --get-user-env",
        "#SBATCH -J "+jobName,
        "#SBATCH -A "+jobName,
        "#SBATCH -o "+outFile,
        "#SBATCH -e "+outFile,
        partition+directives+"#",
        command
    ]
    text="\n".join(scriptLines)
    with open(slurmFile,"w") as OUT:
        OUT.write(text)
98124

99125

100126

‎essex-pretty-print.py

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/usr/bin/env python
2+
#=========================================================================
3+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
4+
# License (GPL) version 3, as described at www.opensource.org.
5+
# Author: William H. Majoros (bmajoros@alumni.duke.edu)
6+
#=========================================================================
7+
from __future__ import (absolute_import, division, print_function,
8+
unicode_literals, generators, nested_scopes, with_statement)
9+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
10+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
11+
# The above imports should allow this program to run in both Python 2 and
12+
# Python 3. You might need to update your version of module "future".
13+
import sys
14+
import ProgramName
15+
from EssexParser import EssexParser
16+
17+
#=========================================================================
18+
# main()
19+
#=========================================================================
20+
# Expect exactly one command-line argument: the input file.
# sys.exit() is used instead of the bare exit(), which is injected by the
# site module for interactive use and is not guaranteed to exist.
if(len(sys.argv)!=2):
    sys.exit(ProgramName.get()+" <in.essex>\n")
infile=sys.argv[1]

# Read elements one at a time and print each to stdout until the parser
# returns None
parser=EssexParser(infile)
while(True):
    tree=parser.nextElem()
    if(tree is None): break
    tree.print(sys.stdout)
29+
30+
31+
32+

‎template.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#=========================================================================
33
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
44
# License (GPL) version 3, as described at www.opensource.org.
5-
# Copyright (C)2017 William H. Majoros (martiandna@gmail.com).
5+
# Author: William H. Majoros (bmajoros@alumni.duke.edu)
66
#=========================================================================
77
from __future__ import (absolute_import, division, print_function,
88
unicode_literals, generators, nested_scopes, with_statement)

0 commit comments

Comments
 (0)
Please sign in to comment.