|
| 1 | +#========================================================================= |
| 2 | +# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public |
| 3 | +# License (GPL) version 3, as described at www.opensource.org. |
| 4 | +# 2018 William H. Majoros (bmajoros@alumni.duke.edu) |
| 5 | +#========================================================================= |
| 6 | +from __future__ import (absolute_import, division, print_function, |
| 7 | + unicode_literals, generators, nested_scopes, with_statement) |
| 8 | +from builtins import (bytes, dict, int, list, object, range, str, ascii, |
| 9 | + chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) |
| 10 | +import sys |
| 11 | +from DataFrameRow import DataFrameRow |
| 12 | + |
| 13 | +#========================================================================= |
| 14 | +# Attributes: |
| 15 | +# header : array of column names |
| 16 | +# matrix : array of rows, each of which is an array of data values |
| 17 | +# rowHash : dictionary mapping row names to row indices |
| 18 | +# colHash : dictionary mapping column names to column indices |
| 19 | +# Methods: |
| 20 | +# df=DataFrame() |
| 21 | +# rowNames=df.getRowNames() |
| 22 | +# colNames=df.getColumnNames() |
| 23 | +# n=df.nrow() |
| 24 | +# n=df.ncol() |
| 25 | +# row=df[index] |
| 26 | +# elem=df[i][j] |
| 27 | +# df.toInt() |
| 28 | +# df.toFloat() |
| 29 | +# header=df.getHeader() |
| 30 | +# df.hashRowNames() |
| 31 | +# df.hashColNames() |
| 32 | +# row=df.getRow(rowName) # call hashRowNames() first! |
| 33 | +# col=df.getColumn(columnName) # call hashColNames() first! |
| 34 | +# bool=df.rowExists(rowName) # call hashRowNames() first! |
| 35 | +# bool=df.columnExists(colName) # call hashColNames() first! |
| 36 | +# Class methods: |
| 37 | +# df=DataFrame.readTable(filename,hasHeader=True,hasRowNames=True) |
| 38 | +#========================================================================= |
| 39 | + |
class DataFrame:
    """A simple table of labeled rows with an optional header of column names.

    Attributes:
        header  : list of column names
        matrix  : list of rows; each row is a DataFrameRow-like object
                  exposing `label`, `values` and integer indexing
        rowHash : dict mapping row names to row indices, or None until
                  hashRowNames() has been called
        colHash : dict mapping column names to column indices, or None
                  until hashColNames() has been called
    """

    def __init__(self):
        self.header = []
        self.matrix = []
        self.rowHash = None
        self.colHash = None

    def rowExists(self, rowName):
        """Return True if a row named rowName was indexed by hashRowNames().

        Raises Exception if hashRowNames() has not been called yet.
        """
        if self.rowHash is None:
            raise Exception("call hashRowNames() first")
        return rowName in self.rowHash

    def columnExists(self, colName):
        """Return True if a column named colName was indexed by hashColNames().

        Raises Exception if hashColNames() has not been called yet.
        """
        if self.colHash is None:
            raise Exception("call hashColNames() first")
        return colName in self.colHash

    def getRowNames(self):
        """Return a list of the labels of all rows, in row order."""
        return [row.label for row in self.matrix]

    def getColumnNames(self):
        """Return the list of column names (the header)."""
        # The original returned the undefined global `header` (NameError).
        return self.header

    def getRow(self, rowName):
        """Return the row whose label is rowName.

        Raises Exception if hashRowNames() has not been called, or if no
        row with that name was indexed.
        """
        if self.rowHash is None:
            raise Exception("call hashRowNames() first")
        rowIndex = self.rowHash.get(rowName, None)
        if rowIndex is None:
            raise Exception("row not found: " + rowName)
        return self.matrix[rowIndex]

    def getColumn(self, colName):
        """Return the column named colName as a new DataFrameRow.

        The returned object's label is colName and its values are the
        entries of that column, one per row, in row order.

        Raises Exception if hashColNames() has not been called, or if no
        column with that name was indexed.
        """
        if self.colHash is None:
            raise Exception("call hashColNames() first")
        colIndex = self.colHash.get(colName, None)
        if colIndex is None:
            raise Exception("column not found: " + colName)
        column = DataFrameRow()
        column.label = colName
        # The original appended to a misspelled name (`colum`) and never
        # returned the column it built.
        column.values = [row[colIndex] for row in self.matrix]
        return column

    def hashRowNames(self):
        """(Re)build rowHash, mapping each row label to its row index.

        If several rows share a label, the last one wins.
        """
        self.rowHash = {}
        for i, row in enumerate(self.matrix):
            self.rowHash[row.label] = i

    def hashColNames(self):
        """(Re)build colHash, mapping each column name to its column index.

        If several columns share a name, the last one wins.
        """
        # The original indexed the undefined global `header` (NameError).
        self.colHash = {}
        for i, name in enumerate(self.header):
            self.colHash[name] = i

    def getHeader(self):
        """Return the list of column names."""
        return self.header

    def nrow(self):
        """Return the number of rows."""
        return len(self.matrix)

    def ncol(self):
        """Return the number of columns, i.e. the length of the header."""
        return len(self.header)

    def __getitem__(self, i):
        """Return the i-th row, so that df[i][j] addresses one element."""
        return self.matrix[i]

    def toInt(self):
        """Call toInt() on every row (converts values in place)."""
        for row in self.matrix:
            row.toInt()

    def toFloat(self):
        """Call toFloat() on every row (converts values in place)."""
        for row in self.matrix:
            row.toFloat()

    @classmethod
    def readTable(cls, filename, hasHeader=True, hasRowNames=True):
        """Read a whitespace-delimited text table from filename.

        Parameters:
            filename    : path of the file to read
            hasHeader   : if True, the first line holds the column names
            hasRowNames : if True, the first field of each data line is
                          the row label rather than a data value

        Blank lines are skipped.  Values are stored as strings; use
        toInt() / toFloat() to convert them afterwards.

        Returns a new data frame populated from the file.
        """
        # cls() rather than DataFrame() so subclasses get their own type.
        df = cls()
        with open(filename, "rt") as fh:
            if hasHeader:
                df.header = fh.readline().rstrip().split()
            for line in fh:
                fields = line.rstrip().split()
                if len(fields) < 1:
                    continue
                label = ""
                if hasRowNames:
                    label = fields[0]
                    fields = fields[1:]
                row = DataFrameRow()
                row.label = label
                row.values = fields
                df.matrix.append(row)
        # If the header has more names than the first row has values, the
        # header's first entry names the row-label column: drop it.
        if len(df.matrix) > 0 and df.matrix[0].length() < len(df.header):
            df.header = df.header[1:]
        return df
| 131 | + |
0 commit comments