-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluate-incucyte-raw-data.py
127 lines (94 loc) · 4.44 KB
/
evaluate-incucyte-raw-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#evaluate-incucate-raw-data.py
import numpy
import pandas
FILENAME_OF_RAW_DATA = "incucyte_raw_data/cv001.csv"
SHEETNAME_OF_COMPOUND_MAP = "compound map Incucyte1"
FILENAME_OF_META_DATA = "20210920 Overview CG plates and compounds.xlsx"
SEPARATOR_CHARACTER = ";"
DECIMAL_SEPARATOR = ","
## quality control
with open(FILENAME_OF_RAW_DATA) as f:
assert f.readline().replace(SEPARATOR_CHARACTER,"") != "\n"
assert f.readline().replace(SEPARATOR_CHARACTER,"") == "\n"
assert f.readline().split(":")[0] == "Vessel Name"
assert f.readline().split(":")[0] == "Metric"
assert f.readline().split(":")[0] == "Cell Type"
assert f.readline().split(":")[0] == "Passage"
assert f.readline().split(":")[0] == "Notes"
assert f.readline().split(":")[0] == "Analysis"
assert f.readline().replace(SEPARATOR_CHARACTER,"") == "\n"
assert f.readline().replace(SEPARATOR_CHARACTER,"").startswith("Date TimeElapsed")
remaining_lines = f.readlines()
## read
name_of_file = pandas.read_csv(FILENAME_OF_RAW_DATA, sep=SEPARATOR_CHARACTER, decimal=DECIMAL_SEPARATOR, nrows=1, names=["name"], usecols=[0])
metadata = pandas.read_csv(FILENAME_OF_RAW_DATA, sep=SEPARATOR_CHARACTER, decimal=DECIMAL_SEPARATOR, skiprows=2, nrows=6, names=["meta"], usecols=[0])
df = pandas.read_csv(FILENAME_OF_RAW_DATA, sep=SEPARATOR_CHARACTER, decimal=DECIMAL_SEPARATOR, skiprows=9)
well_ids_with_data = list([col for col in df.columns if col != "Date Time" and col != "Elapsed"])
number_of_timepoints = len(df)
timepoint_columns = list([x for x in range(number_of_timepoints)])
#print(well_ids_with_data)
## read
df_compound_map = pandas.read_excel(FILENAME_OF_META_DATA, sheet_name=SHEETNAME_OF_COMPOUND_MAP)
#print(df_compound_map)
## treat
## "second" measurement is actual t0; the point before was a "blank"
df["Elapsed"] = df["Elapsed"] - df["Elapsed"].iloc[1]
## transpose for better access
df = df.transpose()
df_timings = df.loc[["Date Time", "Elapsed"],:]
df = df.drop(index = ["Date Time", "Elapsed"])
df["well name"] = df.index
## deeper quality control
# catch value == 0, because this would yield invalid log2 transforms...
problem = df[timepoint_columns].applymap(lambda x: True if x == 0 else False)
if problem.any(axis="columns").any():
print("---------------------------------")
print("THERE IS A PROBLEM ('datapoint==0') IN YOUR DATA: ")
print(df[problem.any(axis="columns")])
print("---------------------------------")
## map compounds
df = df.merge(df_compound_map, on="well name", how="left")
## create control = base line values
selector = df["compound name"] == "DMSO"
assert (df[selector]["concentration"] == 10.).all()
control = df[selector][timepoint_columns].mean()
control_std = df[selector][timepoint_columns].std()
## calculate growth via Amelie's formula
df_vs_blank = df[timepoint_columns].div(df[0], axis="index")
df_vs_blank_log2 = df_vs_blank.applymap(numpy.log2)
#print(df_vs_blank_log2)
control_vs_blank = control.div(control[0])
control_vs_blank_log2 = control_vs_blank.apply(numpy.log2)
#print(control_vs_blank_log2)
df_vs_blank_log2_div_control_vs_blank_log2 = df_vs_blank_log2.div(control_vs_blank_log2)
df_vs_blank_log2_div_control_vs_blank_log2_derived = df_vs_blank_log2_div_control_vs_blank_log2.applymap(lambda x: 2**x-1)
#print(df_vs_blank_log2_div_control_vs_blank_log2_derived)
final = df_vs_blank_log2_div_control_vs_blank_log2_derived.copy()
final[ [c for c in df.columns if not c in timepoint_columns] ] = df[ [c for c in df.columns if not c in timepoint_columns] ]
print(final)
final.to_excel("incucyte-output.xlsx")
## groupby
#for groupname, groupseries in final.groupby(["eubopen ID", "concentration"]):
# print(groupname)
## look into negative control performance (DMSO)
# print(final[ final["compound name"]=="DMSO" ])
# import matplotlib.pyplot as plt
# for i, s in final[ final["compound name"]=="DMSO" ].iterrows():
# s[timepoint_columns].plot(label=s["well name"])
# plt.legend()
# plt.show()
## look into a specific compound by EUbOPEN ID
# eubopen_id = "EUB0000502a"
# print(final[ final["eubopen ID"]==eubopen_id ])
# import matplotlib.pyplot as plt
# for i, s in final[ final["eubopen ID"]==eubopen_id ].iterrows():
# s[timepoint_columns].plot(label=s["well name"])
# plt.legend()
# plt.show()
## look into all compounds
# import matplotlib.pyplot as plt
# for i, s in final.iterrows():
# s[timepoint_columns].plot() #label=s["well name"])
# ax = plt.gca()
# ax.set_ylim(0,2)
# plt.show()