-
Notifications
You must be signed in to change notification settings - Fork 8
/
covariate_correct.py
executable file
·166 lines (138 loc) · 4.6 KB
/
covariate_correct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python
import argparse, sys
import gzip
from argparse import RawTextHelpFormatter
from scipy import stats
import numpy as np
import statsmodels.api as sm
# from sklearn import datasets, linear_model
__author__ = "Colby Chiang (cchiang@genome.wustl.edu)"
__version__ = "$Revision: 0.0.1 $"
__date__ = "$Date: 2016-03-30 18:18 $"
# --------------------------------------
# define functions
def get_args():
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="\
covariate_correct.py\n\
author: " + __author__ + "\n\
version: " + __version__ + "\n\
description: correct a matrix with residualization by linear regression of covariates")
parser.add_argument('-i', '--input',
metavar='FILE', dest='input_path',
type=str, default=None, required=False,
help='matrix of uncorrected values')
parser.add_argument('-c', '--covar',
metavar='FILE', dest='covar_path',
required=True,
type=str, default=None,
help='covariates')
parser.add_argument('-C', '--skip_cols',
metavar='INT', dest='skip_cols',
required=False,
type=int, default=0,
help='number of leading columns to skip [0]')
parser.add_argument('-z', '--z_norm',
dest='z_norm',
action='store_true',
help = 'report row-normalized z-scores')
# parser.add_argument('-R', '--skip_rows',
# metavar='INT', dest='skip_rows',
# required=False,
# type=int, default=0,
# help='number of leading rows to skip [0]')
# parse the arguments
args = parser.parse_args()
# if no input file, check if part of pipe and if so, read stdin.
if args.input_path == None:
if sys.stdin.isatty():
parser.print_help()
exit(1)
# send back the user input
return args
# open file (either plaintext or zip)
def get_file(filename):
if filename.endswith('.gz'):
data = gzip.open(filename, 'rb')
else:
data = open(filename, 'r')
return data
# primary function
def resid_lin_reg(
matrix_file,
covar_file,
skip_cols,
z_norm
):
# read through the covariates
in_header = True
cov_header = []
cov_list = []
cov_list_sorted = []
for line in covar_file:
v = line.rstrip().split('\t')
if in_header:
cov_header = v[1:]
in_header = False
continue
cov_list.append(map(float, v[1:]))
# read through the matrix
in_header = True
mat_header = []
for line in matrix_file:
v = line.rstrip().split('\t')
if in_header:
print line.rstrip()
mat_header = v[skip_cols:]
in_header = False
# rearrange the covariate table so it matches the matrix
reorder = [cov_header.index(x) for x in mat_header]
for row in cov_list:
cov_list_sorted.append([row[x] for x in reorder])
# X is numpy array of sorted covariates
X = np.transpose(np.array(cov_list_sorted))
# print X
continue
# expression
y = np.array(map(float, v[skip_cols:]))
results = sm.OLS(y, X).fit()
# Inspect the results
# print results.summary()
# store residuals of matrix values
resid = results.resid.tolist()
# normalize by z-score if requested
if z_norm:
corrected = stats.zscore(resid)
else:
corrected = resid
# print results
print '\t'.join(v[:skip_cols] + map(str, corrected))
return
# --------------------------------------
# main function
def main():
# parse the command line args
args = get_args()
# if no input file, check if part of pipe and if so, read stdin.
if args.input_path == None:
input_file = sys.stdin
else:
input_file = get_file(args.input_path)
# get permutation data
covar_file = get_file(args.covar_path)
# call primary function
resid_lin_reg(
input_file,
covar_file,
args.skip_cols,
args.z_norm
)
# close the files
input_file.close()
covar_file.close()
# initialize the script
if __name__ == '__main__':
try:
sys.exit(main())
except IOError, e:
if e.errno != 32: # ignore SIGPIPE
raise