-
Notifications
You must be signed in to change notification settings - Fork 0
/
KnockdownFeatures_class.py
192 lines (149 loc) · 6.72 KB
/
KnockdownFeatures_class.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 7 11:19:25 2019
@author: max
"""
import os
import re
import functools
import pandas as pd
import numpy as np
class KnockdownFeatures:
    '''
    class that holds all the features of one knockdown

    Usage: Initialize with the path to the experiment and the name of the
    knockdown folder you want to search for features.
    Either use find_csv() and then load_feature(feature) to load an
    individual feature, or use load_all() to load all features.
    '''

    def __init__(self, kd_path, KD):
        '''
        kd_path: path that is walked for csv files; it must contain a
                 component matching 'SiRNA_[0-9]+', which is used as the
                 experiment identifier
        KD: name of the knockdown folder; only folders whose path contains
            '<KD><os.sep><digits>' are searched

        raises ValueError if kd_path contains no 'SiRNA_[0-9]+' part
        '''
        self.kd_path = kd_path
        self.KD = KD
        #KD and the separator are escaped so that they are matched literally
        #(an unescaped '\\' separator would otherwise break the pattern)
        self.KD_pattern = re.compile(
            '({}{}[0-9]+)'.format(re.escape(str(self.KD)), re.escape(os.sep)))
        #alternative for a uniform path architecture with different group
        #namings: self.experiment_identifier=kd_path.split(os.sep)[-2]
        #the pattern search below also copes with different path architectures
        self.Experiment_pattern = re.compile('SiRNA_[0-9]+')
        experiment_match = self.Experiment_pattern.search(self.kd_path)
        if experiment_match is None:
            raise ValueError(
                'kd_path {!r} does not contain an experiment folder matching '
                '{!r}'.format(self.kd_path, self.Experiment_pattern.pattern))
        self.experiment_identifier = experiment_match.group()

    def info(self):
        '''
        prints info about this instance of the class
        '''
        print('experiment folder: \n', self.kd_path)
        print('experiment name: \n', self.experiment_identifier)
        print('Knockdown: \n', self.KD)

    @staticmethod
    def _feature_name(csv_path):
        '''
        returns the feature name of a csv path: the file name without
        directories and without '.csv'
        '''
        return os.path.basename(csv_path).replace('.csv', '')

    def find_csv(self):
        '''
        returns a list with all csv files in the KD folder

        A file is collected if its name ends with '.csv' and its folder path
        matches the KD pattern and contains 'GCAFeatureExtraction'.
        The result is also stored in self.i_dirs.
        '''
        #name of the directory that holds the feature files
        find_dir = 'GCAFeatureExtraction'
        self.i_dirs = []
        for root, dirs, files in os.walk(self.kd_path):
            #sorting in place makes os.walk visit the folders in a
            #reproducible order, independent of the file system
            dirs.sort()
            if not (self.KD_pattern.search(root) and find_dir in root):
                continue
            #each csv file is joined with its folder and added to the list
            self.i_dirs.extend(
                os.path.join(root, name)
                for name in sorted(files) if name.endswith('.csv'))
        return self.i_dirs

    def get_features(self):
        '''
        creates a list of all the feature names
        calls find_csv to get i_dirs variable
        for interactive feature selection, change here!
        '''
        self.i_dirs = self.find_csv()
        #dict.fromkeys removes duplicates while keeping the first-seen order
        self.features = list(
            dict.fromkeys(self._feature_name(path) for path in self.i_dirs))
        return self.features

    def load_feature(self, feature):
        '''
        loads all csvs of a single feature

        feature: feature name as returned by get_features()

        Uses the files in self.i_dirs; find_csv() is called first if it has
        not been run yet. Files that cannot be read are skipped.

        returns a long format DataFrame with the columns
        meltid, variable, value, experiment, KD, item, timepoint.
        Values that are NA or cannot be converted to float are dropped.
        '''
        if not hasattr(self, 'i_dirs'):
            self.find_csv()
        GC_list = []
        for file in self.i_dirs:
            #exact comparison, so that e.g. 'len' does not also load
            #'lenExtra.csv' or match a folder name in the path
            if self._feature_name(file) != feature:
                continue
            identifier = self.KD_pattern.search(file).group()
            try:
                temp = pd.read_csv(file, header=None)
            except (OSError, ValueError):
                #pandas parser and empty-file errors are ValueErrors;
                #an unreadable file must not stop the remaining files
                continue
            #renames the columns with an identifier per column
            temp.columns = [
                self.experiment_identifier+'/'+identifier+'n'+str(i)
                for i in range(temp.shape[1])]
            GC_list.append(temp)
        if not GC_list:
            #nothing could be loaded: empty frame with the usual columns
            return pd.DataFrame(columns=['meltid', 'variable', 'value',
                                         'experiment', 'KD', 'item',
                                         'timepoint'])
        #concatenating the list to a dataframe
        full_feature = pd.concat(GC_list, axis=1, sort=True)
        #creating an index for melting
        full_feature['meltid'] = range(0, full_feature.shape[0])
        #melting to long format and dropping NAs
        long_feature = pd.melt(full_feature, id_vars='meltid')
        long_feature = long_feature.dropna().reset_index(drop=True)
        #dropping all values that cannot be converted to a float;
        #done in one pass instead of dropping row by row
        keep = []
        converted = []
        for raw in long_feature['value']:
            try:
                converted.append(float(raw))
            except ValueError:
                keep.append(False)
            else:
                keep.append(True)
        long_feature = long_feature.loc[np.array(keep, dtype=bool)]
        long_feature = long_feature.reset_index(drop=True)
        long_feature['value'] = np.array(converted, dtype=float)
        long_feature['experiment'] = self.experiment_identifier
        long_feature['KD'] = self.KD
        long_feature['item'] = long_feature['variable'].str.extract(
            self.KD_pattern.pattern, expand=False)
        #the column number is the trailing 'n<digits>' of the label; anchoring
        #to the end avoids matching an 'n<digit>' inside the KD name
        long_feature['timepoint'] = long_feature['variable'].str.extract(
            r'(n[0-9]+)$', expand=False)
        return long_feature

    def load_all(self):
        '''
        loops over load_feature for each feature
        calls get_features to get the features and i_dirs

        returns a dictionary with the feature name as key and the long format
        DataFrame of that feature as value (also stored in self.all_features)
        '''
        self.get_features()
        self.all_features = {
            name: self.load_feature(name) for name in self.features}
        return self.all_features