-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathZ_score_21.py
executable file
·76 lines (56 loc) · 2.73 KB
/
Z_score_21.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python
# imports
import csv
import sys
import pandas as pd
from scipy import stats
import os
import numpy as np
# turn off numpy warnings for dividing by 0 and for invalid floating-point
# operations (applies to the whole script)
np.seterr(divide='ignore', invalid='ignore')
# turn off the pandas chained-assignment warning (applies to the whole script)
pd.options.mode.chained_assignment = None
# Z-score test for 21-nt sRNAs mapping to one EVE.
#
# Usage: Z_score_21.py SENSE_FILE ANTISENSE_FILE OUTFILE [STANDARD_FILE]
#
# SENSE_FILE / ANTISENSE_FILE are two-column, space-delimited files holding the
# length distribution (length, number of reads) of all sRNAs mapping to the
# positive / negative strand of the EVE. One pair of files for each EVE.
# One row is appended to OUTFILE: the sense file name plus both z-scores when
# both are >= 1.96, otherwise 0,0,0.
#Anneliek ter Horst, 2018

# Default location of the standard frame (lengths 18-36, one per line).
# Can be overridden with the optional fourth command-line argument.
STANDARD_FILE = '/directory_containing_standard.txt/standard.txt'
# Only lengths up to and including this value take part in the z-score.
MAX_SIRNA_LENGTH = 24
# Length whose z-score is reported.
TARGET_LENGTH = 21
# Both strands must reach this z-score to be called significant.
SIGNIFICANCE_THRESHOLD = 1.96


def _read_length_table(path, count_column='number'):
    """Read a two-column, space-delimited length table into a DataFrame.

    The path is handed straight to pandas so the file is opened and closed
    by read_csv instead of leaving an open handle behind.
    """
    return pd.read_csv(path, sep=' ', names=['length', count_column])


def _zscore_at_length(df_standard, df_counts, length=TARGET_LENGTH,
                      max_length=MAX_SIRNA_LENGTH):
    """Return the z-score of the read count at `length`.

    The counts are aligned on the lengths listed in `df_standard` (lengths
    without reads count as 0), restricted to lengths <= `max_length`, and
    z-scored with scipy.stats.zscore. The result is NaN when all counts in
    the window are equal.

    Raises:
        ValueError: if `length` is not present in the standard frame window.
    """
    # Left-merge onto the standard lengths so missing lengths become 0 reads.
    merged = pd.merge(df_standard[['length']], df_counts,
                      on='length', how='left').fillna(0)
    window = merged[merged['length'] <= max_length]
    # Select the row by its length value rather than by a fixed row index, so
    # the result does not depend on how the standard file is ordered.
    at_target = (window['length'] == length).values
    if not at_target.any():
        raise ValueError(
            'length {} is not present in the standard frame'.format(length))
    zscores = stats.zscore(window['number'].values)
    # Plain Python float, so the value written to the CSV does not depend on
    # how numpy formats its scalar types.
    return float(zscores[at_target][0])


def main(argv=None):
    """Compute both strand z-scores and append one result row to the outfile.

    Args:
        argv: argument list shaped like sys.argv (program name first);
            defaults to sys.argv. argv[1] is the sense file, argv[2] the
            antisense file, argv[3] the output CSV and the optional argv[4]
            the standard-frame file.

    Returns:
        The row that was appended: [file name, z sense, z antisense] when
        both z-scores reach the threshold, otherwise [0, 0, 0].
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.exit('usage: {} SENSE_FILE ANTISENSE_FILE OUTFILE '
                 '[STANDARD_FILE]'.format(os.path.basename(argv[0])
                                          if argv else 'Z_score_21.py'))
    sense_path, antisense_path, outfile = argv[1:4]
    standard_path = argv[4] if len(argv) > 4 else STANDARD_FILE

    # get filename of opened file
    eve_number = os.path.basename(sense_path)

    # Open standard frame with values 18-36
    df_standard = _read_length_table(standard_path, 'number_x')
    z_score_sense = _zscore_at_length(
        df_standard, _read_length_table(sense_path))
    z_score_antisense = _zscore_at_length(
        df_standard, _read_length_table(antisense_path))

    # A NaN z-score compares False here and so falls through to the 0,0,0 row.
    if (z_score_sense >= SIGNIFICANCE_THRESHOLD
            and z_score_antisense >= SIGNIFICANCE_THRESHOLD):
        z_score_list = [eve_number, z_score_sense, z_score_antisense]
        print('significant')
    else:
        z_score_list = [0, 0, 0]

    # Append, so that runs for many EVEs accumulate in the same output file.
    with open(outfile, 'a') as f:
        csv.writer(f).writerow(z_score_list)
    return z_score_list


if __name__ == '__main__':
    main()