-
Notifications
You must be signed in to change notification settings - Fork 1
/
DMLS Chapter 5 Feature Engineering.mm
240 lines (239 loc) · 18.2 KB
/
DMLS Chapter 5 Feature Engineering.mm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
<map version="freeplane 1.9.13">
<!--To view this file, download free mind mapping software Freeplane from https://www.freeplane.org -->
<node TEXT="DMLS Chapter 5: Feature Engineering" LOCALIZED_STYLE_REF="AutomaticLayout.level.root" FOLDED="false" ID="ID_1090958577" CREATED="1409300609620" MODIFIED="1665677025850" VGAP_QUANTITY="2 pt"><hook NAME="MapStyle" background="#003333" zoom="0.841">
<properties show_icon_for_attributes="true" edgeColorConfiguration="#808080ff,#ff0000ff,#0000ffff,#00ff00ff,#ff00ffff,#00ffffff,#7c0000ff,#00007cff,#007c00ff,#7c007cff,#007c7cff,#7c7c00ff" show_note_icons="true" associatedTemplateLocation="file:/E:/MindMaps/DMLS/DMLS%20Chapter%201%20Overview.mm" fit_to_viewport="false"/>
<map_styles>
<stylenode LOCALIZED_TEXT="styles.root_node" STYLE="oval" UNIFORM_SHAPE="true" VGAP_QUANTITY="24 pt">
<font SIZE="24"/>
<stylenode LOCALIZED_TEXT="styles.predefined" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="default" ID="ID_671184412" ICON_SIZE="12 pt" FORMAT_AS_HYPERLINK="false" COLOR="#484747" BACKGROUND_COLOR="#eceff4" STYLE="bubble" SHAPE_HORIZONTAL_MARGIN="8 pt" SHAPE_VERTICAL_MARGIN="5 pt" NUMBERED="false" FORMAT="markdownPatternFormat" TEXT_ALIGN="DEFAULT" BORDER_WIDTH_LIKE_EDGE="false" BORDER_WIDTH="1.9 px" BORDER_COLOR_LIKE_EDGE="true" BORDER_COLOR="#f0f0f0" BORDER_DASH_LIKE_EDGE="true" BORDER_DASH="SOLID" MAX_WIDTH="10 cm" MIN_WIDTH="0 cm" VGAP_QUANTITY="2 pt">
<arrowlink SHAPE="CUBIC_CURVE" COLOR="#88c0d0" WIDTH="2" TRANSPARENCY="255" DASH="" FONT_SIZE="9" FONT_FAMILY="SansSerif" DESTINATION="ID_671184412" STARTARROW="NONE" ENDARROW="DEFAULT"/>
<font NAME="SansSerif" SIZE="11" BOLD="false" STRIKETHROUGH="false" ITALIC="false"/>
<edge STYLE="bezier" COLOR="#81a1c1" WIDTH="3" DASH="SOLID"/>
<richcontent CONTENT-TYPE="plain/auto" TYPE="DETAILS"/>
<richcontent TYPE="NOTE" CONTENT-TYPE="plain/auto"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.details" BORDER_WIDTH="1.9 px">
<edge STYLE="bezier" COLOR="#81a1c1" WIDTH="3"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.attributes">
<font SIZE="10"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.note" COLOR="#000000" BACKGROUND_COLOR="#ebcb8b">
<icon BUILTIN="clock2"/>
<font SIZE="10"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.floating" COLOR="#484747">
<edge STYLE="hide_edge"/>
<cloud COLOR="#f0f0f0" SHAPE="ROUND_RECT"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.selection" COLOR="#e5e9f0" BACKGROUND_COLOR="#5e81ac" BORDER_COLOR_LIKE_EDGE="false" BORDER_COLOR="#5e81ac"/>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.user-defined" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="styles.important" ID="ID_779275544" BORDER_COLOR_LIKE_EDGE="false" BORDER_COLOR="#bf616a">
<icon BUILTIN="yes"/>
<arrowlink COLOR="#bf616a" TRANSPARENCY="255" DESTINATION="ID_779275544"/>
<font SIZE="14"/>
</stylenode>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.AutomaticLayout" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="AutomaticLayout.level.root" COLOR="#ffffff" BACKGROUND_COLOR="#484747" STYLE="bubble" SHAPE_HORIZONTAL_MARGIN="10 pt" SHAPE_VERTICAL_MARGIN="10 pt">
<font NAME="Ubuntu" SIZE="18"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,1" COLOR="#eceff4" BACKGROUND_COLOR="#d08770" STYLE="bubble" SHAPE_HORIZONTAL_MARGIN="8 pt" SHAPE_VERTICAL_MARGIN="5 pt">
<font NAME="Ubuntu" SIZE="16"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,2" COLOR="#3b4252" BACKGROUND_COLOR="#ebcb8b">
<font SIZE="14"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,3" COLOR="#2e3440" BACKGROUND_COLOR="#a3be8c">
<font SIZE="12"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,4" COLOR="#2e3440" BACKGROUND_COLOR="#b48ead">
<font SIZE="11"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,5" BACKGROUND_COLOR="#81a1c1">
<font SIZE="10"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,6" BACKGROUND_COLOR="#88c0d0">
<font SIZE="10"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,7" BACKGROUND_COLOR="#8fbcbb">
<font SIZE="10"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,8" BACKGROUND_COLOR="#d8dee9">
<font SIZE="10"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,9" BACKGROUND_COLOR="#e5e9f0">
<font SIZE="9"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,10" BACKGROUND_COLOR="#eceff4">
<font SIZE="9"/>
</stylenode>
</stylenode>
</stylenode>
</map_styles>
</hook>
<hook NAME="accessories/plugins/AutomaticLayout.properties" VALUE="ALL"/>
<font BOLD="true"/>
<node TEXT="Key" LOCALIZED_STYLE_REF="defaultstyle.floating" POSITION="right" ID="ID_1507003841" CREATED="1665493125752" MODIFIED="1665677025850" VGAP_QUANTITY="2 pt" HGAP_QUANTITY="-45.75 pt" VSHIFT_QUANTITY="-222.74999 pt">
<hook NAME="FreeNode"/>
<node TEXT="examples" ID="ID_412345450" CREATED="1665493155008" MODIFIED="1666023942687" STYLE="bubble">
<node TEXT="Oval shapes" ID="ID_551535462" CREATED="1665493168850" MODIFIED="1665493222966" STYLE="oval"/>
</node>
<node TEXT="tools" ID="ID_1508415031" CREATED="1665493161711" MODIFIED="1666023942685" STYLE="bubble" HGAP_QUANTITY="26 pt" VSHIFT_QUANTITY="3 pt">
<node TEXT="Hexagonal Shapes" ID="ID_340131492" CREATED="1665493176259" MODIFIED="1665493201185" STYLE="wide_hexagon"/>
</node>
</node>
<node TEXT="Learned Features vs Engineered Features" POSITION="right" ID="ID_612723436" CREATED="1665669959730" MODIFIED="1665669971475">
<node TEXT="Deep Learning is also called feature learning" ID="ID_1981879729" CREATED="1665673251816" MODIFIED="1665673262741"/>
<node TEXT="Feature Engineering can be very iterative" ID="ID_204389983" CREATED="1665673305814" MODIFIED="1665673315369">
<node TEXT="what feature" ID="ID_823839218" CREATED="1665673458223" MODIFIED="1665673464473"/>
<node TEXT="how many" ID="ID_1352863209" CREATED="1665673424680" MODIFIED="1665673430560"/>
<node TEXT="requires domain expertise" ID="ID_589681462" CREATED="1665673460997" MODIFIED="1665673473192"/>
</node>
</node>
<node TEXT="Common Feature Engineering Operations" POSITION="right" ID="ID_885328154" CREATED="1665669973494" MODIFIED="1665669981250">
<node TEXT="Handling missing values" ID="ID_1949599162" CREATED="1665673505226" MODIFIED="1665673509078">
<node TEXT="types" ID="ID_1251951860" CREATED="1665673865537" MODIFIED="1665673868075">
<node TEXT="Missing not at random" ID="ID_676556025" CREATED="1665673557635" MODIFIED="1665673561780">
<node TEXT="the values missing are missing because of the value itself; _e.g._ undisclosed high salary" ID="ID_399155449" CREATED="1665673565247" MODIFIED="1665673719831"/>
<node TEXT="The fact that values are missing **is** information" ID="ID_1836593754" CREATED="1665673595407" MODIFIED="1665673628134"/>
</node>
<node TEXT="Missing at random" ID="ID_1015626833" CREATED="1665673727116" MODIFIED="1665673733646">
<node TEXT="The value is missing because of another factor; _e.g._ people of gender A don't like to disclose their age" ID="ID_853586528" CREATED="1665673733981" MODIFIED="1665673777186"/>
</node>
<node TEXT="Missing completely at random" ID="ID_278872618" CREATED="1665673782168" MODIFIED="1665673789454">
<node TEXT="There's simply no pattern to be found among samples with missing values" ID="ID_294611800" CREATED="1665673792906" MODIFIED="1665673817756"/>
<node TEXT="This is pretty rare, so this should be **investigated when encountered**" ID="ID_860889463" CREATED="1665673826151" MODIFIED="1665673844618"/>
</node>
</node>
<node TEXT="handling" ID="ID_75665593" CREATED="1665673890740" MODIFIED="1665673901422">
<node TEXT="deletion" ID="ID_563598383" CREATED="1665673902604" MODIFIED="1665673911300">
<node TEXT="easy" ID="ID_751154653" CREATED="1665674034382" MODIFIED="1665674036861"/>
<node TEXT="columns" ID="ID_1335958526" CREATED="1665673912704" MODIFIED="1665673915656">
<node TEXT="if the number is high" ID="ID_902077253" CREATED="1665673946753" MODIFIED="1665673952536">
<node TEXT="may remove important information" ID="ID_1542633339" CREATED="1665673970229" MODIFIED="1665673986311">
<icon BUILTIN="messagebox_warning"/>
</node>
</node>
</node>
<node TEXT="rows" ID="ID_1240335889" CREATED="1665673929663" MODIFIED="1665673934513">
<node TEXT="if the number is low" ID="ID_801952749" CREATED="1665673953984" MODIFIED="1665673958087">
<node TEXT="may remove important corner cases" ID="ID_40148950" CREATED="1665673978060" MODIFIED="1665673986310">
<icon BUILTIN="messagebox_warning"/>
</node>
</node>
</node>
<node TEXT="can create biases" ID="ID_938047319" CREATED="1665674004425" MODIFIED="1665674010992">
<icon BUILTIN="messagebox_warning"/>
</node>
</node>
<node TEXT="imputation" ID="ID_554606209" CREATED="1665674025294" MODIFIED="1665674027275">
<node TEXT="common choices" ID="ID_502279506" CREATED="1665674061100" MODIFIED="1665674063902">
<node TEXT="empty string" ID="ID_744418203" CREATED="1665674064666" MODIFIED="1665674067621"/>
<node TEXT="mean, median, mode" ID="ID_698279611" CREATED="1665674068407" MODIFIED="1665674075047"/>
</node>
<node TEXT="risks adding noise, biases, or data leakage with aggregations" ID="ID_1440053115" CREATED="1665674112274" MODIFIED="1665674127010">
<icon BUILTIN="messagebox_warning"/>
</node>
</node>
</node>
</node>
<node TEXT="Scaling" ID="ID_508853903" CREATED="1665674149380" MODIFIED="1665674154112">
<node TEXT="bring all your features to similar ranges, _e.g._ [0, 1]" ID="ID_1603122279" CREATED="1665674181831" MODIFIED="1665674222722"/>
<node TEXT="boost ML algos performance" ID="ID_958137065" CREATED="1665674224348" MODIFIED="1665674231478"/>
<node TEXT="if your distribution is skewed, you can apply a **log-transformation** to try and make it normally distributed" ID="ID_807988921" CREATED="1665674369873" MODIFIED="1665674409070"/>
</node>
<node TEXT="discretization" ID="ID_979070363" CREATED="1665674427341" MODIFIED="1665674431552">
<node TEXT="setting thresholds to define intervals that turn continuous data into discrete data" ID="ID_1366442129" CREATED="1665674440399" MODIFIED="1665674467866">
<node TEXT="can be used to bucket discrete values too" ID="ID_470600852" CREATED="1665674510413" MODIFIED="1665674522452"/>
<node TEXT="introduces discontinuities, _e.g._ 34.9!=35" ID="ID_1306106033" CREATED="1665674535614" MODIFIED="1665674558065"/>
<node TEXT="subject matter expertise is required" ID="ID_1249237549" CREATED="1665674563876" MODIFIED="1665674570144"/>
</node>
</node>
<node TEXT="Encoding categorical features" ID="ID_1894687224" CREATED="1665674591403" MODIFIED="1665674596068">
<node TEXT="considered categories may be static or dynamic" ID="ID_920346425" CREATED="1665674618806" MODIFIED="1665674654550">
<node TEXT="brands on Amazon" ID="ID_1127369111" CREATED="1665674635799" MODIFIED="1665674647473" STYLE="oval"/>
</node>
<node TEXT="handling dynamic" ID="ID_1227102343" CREATED="1665674670116" MODIFIED="1665674673179">
<node TEXT="add **unknown** category" ID="ID_935943364" CREATED="1665674673378" MODIFIED="1665674683431">
<node TEXT="yields very bad estimates for this category" ID="ID_78619873" CREATED="1665674697857" MODIFIED="1665674708279"/>
</node>
<node TEXT="The hashing trick" ID="ID_1739886199" CREATED="1665674763453" MODIFIED="1665674768257">
<node TEXT="randomly hash categories into a predefined range of indices" ID="ID_94423724" CREATED="1665674810583" MODIFIED="1665674827439">
<node TEXT="According to Booking.com, for 50% colliding categories, performance loss is only 0.5%." ID="ID_880634967" CREATED="1665674851582" MODIFIED="1665674882476"/>
</node>
<node TEXT="Vowpal Wabbit" ID="ID_286278921" CREATED="1665674946680" MODIFIED="1665674980605" STYLE="wide_hexagon"/>
<node TEXT="sklearn" ID="ID_897098008" CREATED="1665674959931" MODIFIED="1665674980605" STYLE="wide_hexagon"/>
<node TEXT="Tensorflow" ID="ID_797359415" CREATED="1665674963840" MODIFIED="1665674980605" STYLE="wide_hexagon"/>
<node TEXT="gensim" ID="ID_1234576288" CREATED="1665674971289" MODIFIED="1665674980603" STYLE="wide_hexagon"/>
</node>
</node>
</node>
<node TEXT="feature crossing" ID="ID_1108740022" CREATED="1665675012885" MODIFIED="1665675015918">
<node TEXT="for features with non-linear relations" ID="ID_284980671" CREATED="1665675028720" MODIFIED="1665675061118"/>
<node TEXT="make a cartesian product of features" ID="ID_830587430" CREATED="1665675066535" MODIFIED="1665675074244"/>
<node TEXT="good when using linear models" ID="ID_1790581784" CREATED="1665675089651" MODIFIED="1665675096230"/>
<node TEXT="occasionally helps NNs learn non-linear relationships faster" ID="ID_10814912" CREATED="1665675107252" MODIFIED="1665675122477"/>
<node TEXT="DeepFM and xDeepFM" ID="ID_1107604298" CREATED="1665675139642" MODIFIED="1665675151150" STYLE="wide_hexagon"/>
<node TEXT="Yields very large feature spaces" ID="ID_426731480" CREATED="1665675164245" MODIFIED="1665675172914">
<icon BUILTIN="messagebox_warning"/>
<node TEXT="can cause overfitting" ID="ID_287459334" CREATED="1665675181549" MODIFIED="1665675186170"/>
</node>
</node>
<node TEXT="discrete and continuous positional embeddings" ID="ID_1991059253" CREATED="1665675190279" MODIFIED="1665675199081">
<node TEXT="In sequences, add position information to each sequence element as an embedded feature" ID="ID_129321553" CREATED="1665675288713" MODIFIED="1665675416601">
<node TEXT="use multidimensional sinusoidal functions of the position as is done by Vaswani et al., 2017" ID="ID_1477784369" CREATED="1665675345522" MODIFIED="1665675481010"/>
<node TEXT="one embedding per position, as in recent HuggingFace Transformer implementations" ID="ID_1912207323" CREATED="1665675369359" MODIFIED="1665675388471"/>
</node>
<node TEXT="Add this information through addition with the original feature vector" ID="ID_1580172484" CREATED="1665675417787" MODIFIED="1665675433567"/>
<node TEXT="generalizes to multidimensional sequential inputs (images, 3D structures, ...)" ID="ID_1062719895" CREATED="1665675511161" MODIFIED="1665675530199"/>
</node>
</node>
<node TEXT="Data Leakage" POSITION="left" ID="ID_541004283" CREATED="1665669985888" MODIFIED="1665669989821">
<node TEXT="refers to a phenomenon where labels **leak** into the set of features in a manner that does not happen during inference" ID="ID_681898247" CREATED="1665675644286" MODIFIED="1665675703652">
<node TEXT="e.g. a COVID-infested hospital had patients lie down during tests. This made lying down a feature that's highly correlated with the &quot;covid positive&quot; label." ID="ID_378928779" CREATED="1665675690244" MODIFIED="1665675759371"/>
</node>
<node TEXT="common causes" ID="ID_1514012442" CREATED="1665675799984" MODIFIED="1665675802689">
<node TEXT="data splits: splitting time-correlated data randomly instead of training on past and testing on future" ID="ID_1707878678" CREATED="1665675803095" MODIFIED="1665675839135"/>
<node TEXT="using test data for scaling" ID="ID_445918735" CREATED="1665675872781" MODIFIED="1665675877614"/>
<node TEXT="filling in missing data with statistics computed from the test set" ID="ID_741214371" CREATED="1665675887791" MODIFIED="1665675899034"/>
<node TEXT="duplicates left and split between train and test" ID="ID_434014897" CREATED="1665675902336" MODIFIED="1665675929703"/>
<node TEXT="Group leakage: a group has highly correlated labels, but is split between train and test (2 CT scans of the same patient)" ID="ID_1635272005" CREATED="1665675938055" MODIFIED="1665675981560"/>
<node TEXT="Data generation process; _e.g._ hardware used to generate data" ID="ID_1845062319" CREATED="1665676001958" MODIFIED="1665676025936"/>
</node>
<node TEXT="detecting data leakage" ID="ID_520247934" CREATED="1665676077860" MODIFIED="1665676082365">
<node TEXT="measure the predictive power of each feature" ID="ID_926129027" CREATED="1665676090405" MODIFIED="1665676106696"/>
<node TEXT="data leakage can be in a tuple of features instead of just a single feature" ID="ID_225021133" CREATED="1665676119112" MODIFIED="1665676140609"/>
<node TEXT="Only measure performance on the test set in the end. Do not use it to make design choices." ID="ID_180490280" CREATED="1665676213352" MODIFIED="1665676261720"/>
</node>
</node>
<node TEXT="Engineering Good Features" POSITION="left" ID="ID_1515069594" CREATED="1665669996741" MODIFIED="1665670004719">
<node TEXT="generally, adding more features leads to better model performance" ID="ID_1508301488" CREATED="1665676273968" MODIFIED="1665676301309"/>
<node TEXT="However too many features could" ID="ID_206952298" CREATED="1665676319565" MODIFIED="1665676329271">
<node TEXT="overfit" ID="ID_486232894" CREATED="1665676329537" MODIFIED="1665676331550"/>
<node TEXT="cause data leakage" ID="ID_1203982991" CREATED="1665676336507" MODIFIED="1665676340844"/>
<node TEXT="increase memory requirements" ID="ID_15692724" CREATED="1665676349369" MODIFIED="1665676360927"/>
<node TEXT="increase feature calculation overhead" ID="ID_932200582" CREATED="1665676361879" MODIFIED="1665676373181"/>
<node TEXT="useless features become technical debt" ID="ID_762055571" CREATED="1665676390713" MODIFIED="1665676397102">
<node TEXT="L1 regularization can help detect them" ID="ID_789676850" CREATED="1665676411401" MODIFIED="1665676423106"/>
</node>
</node>
<node TEXT="Feature importance" ID="ID_247898419" CREATED="1665676454384" MODIFIED="1665676458540">
<node TEXT="can be measured with built-in XGBoost functions" ID="ID_858200294" CREATED="1665676459863" MODIFIED="1665676481932"/>
<node TEXT="SHAP: measure feature importance for the entire model, but also for each prediction" ID="ID_1967365734" CREATED="1665676495735" MODIFIED="1665676512401"/>
</node>
<node TEXT="Feature generalization" ID="ID_421488870" CREATED="1665676588610" MODIFIED="1665676594209">
<node TEXT="feature coverage: percentage of samples that have this feature" ID="ID_994602568" CREATED="1665676617507" MODIFIED="1665676634679">
<node TEXT="rule of thumb: if it's not in a lot of samples, the feature's not going to generalize well" ID="ID_1308558732" CREATED="1665676661235" MODIFIED="1665676702041"/>
</node>
<node TEXT="distribution of feature values" ID="ID_153171148" CREATED="1665676634787" MODIFIED="1665676642319">
<node TEXT="Does this feature have the same distribution in seen and unseen data? If not, it might not generalize well" ID="ID_1465573792" CREATED="1665676855616" MODIFIED="1665676881282">
<node TEXT="this can be mitigated through discretization of continuous values to diminish distribution discrepancy" ID="ID_750605033" CREATED="1665676911115" MODIFIED="1665676948489"/>
</node>
</node>
</node>
</node>
</node>
</map>