forked from andkret/Cookbook
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data Engineering Cookbook.aux
359 lines (359 loc) · 31.2 KB
/
Data Engineering Cookbook.aux
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
\relax
\providecommand\hyper@newdestlabel[2]{}
\providecommand*\new@tpo@label[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\@writefile{toc}{\contentsline {part}{\numberline {I}Introduction}{9}{part.1}}
\@writefile{toc}{\contentsline {chapter}{\numberline {1}How To Use This Cookbook}{10}{chapter.1}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {chapter}{\numberline {2}Data Engineer vs Data Scientists}{11}{chapter.2}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {2.1}Data Scientist}{11}{section.2.1}}
\@writefile{lof}{\contentsline {figure}{\numberline {2.1}{\ignorespaces The Machine Learning Pipeline}}{12}{figure.2.1}}
\newlabel{fig:Bild1}{{2.1}{12}{The Machine Learning Pipeline}{figure.2.1}{}}
\@writefile{toc}{\contentsline {section}{\numberline {2.2}Data Engineer}{12}{section.2.2}}
\@writefile{toc}{\contentsline {section}{\numberline {2.3}Who Companies Need}{13}{section.2.3}}
\@writefile{toc}{\contentsline {part}{\numberline {II}Basic Data Engineering Skills}{14}{part.2}}
\@writefile{toc}{\contentsline {chapter}{\numberline {3}Learn To Code}{15}{chapter.3}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {chapter}{\numberline {4}Get Familiar With Github}{16}{chapter.4}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {chapter}{\numberline {5}Agile Development -- available}{17}{chapter.5}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {5.1}Why is agile so important?}{17}{section.5.1}}
\@writefile{toc}{\contentsline {section}{\numberline {5.2}Agile rules I learned over the years -- available}{18}{section.5.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.1}Is the method making a difference?}{18}{subsection.5.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.2}The problem with outsourcing}{18}{subsection.5.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.3}Knowledge is king: A lesson from Elon Musk}{19}{subsection.5.2.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.4}How you really can be agile}{19}{subsection.5.2.4}}
\@writefile{toc}{\contentsline {section}{\numberline {5.3}Agile Frameworks}{20}{section.5.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.1}Scrum}{20}{subsection.5.3.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.2}OKR}{20}{subsection.5.3.2}}
\@writefile{toc}{\contentsline {chapter}{\numberline {6}Learn how a Computer Works}{21}{chapter.6}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {6.1}CPU,RAM,GPU,HDD}{21}{section.6.1}}
\@writefile{toc}{\contentsline {section}{\numberline {6.2}Differences between PCs and Servers}{21}{section.6.2}}
\@writefile{toc}{\contentsline {chapter}{\numberline {7}Computer Networking - Data Transmission}{22}{chapter.7}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {7.1}OSI Model}{22}{section.7.1}}
\@writefile{toc}{\contentsline {paragraph}{\nonumberline Which protocol lives on which layer?}{22}{section*.2}}
\@writefile{toc}{\contentsline {section}{\numberline {7.2}IP Subnetting}{22}{section.7.2}}
\@writefile{toc}{\contentsline {section}{\numberline {7.3}Switch, Level 3 Switch}{23}{section.7.3}}
\@writefile{toc}{\contentsline {section}{\numberline {7.4}Router}{23}{section.7.4}}
\@writefile{toc}{\contentsline {section}{\numberline {7.5}Firewalls}{23}{section.7.5}}
\@writefile{toc}{\contentsline {chapter}{\numberline {8}Security and Privacy}{24}{chapter.8}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {8.1}SSL Public \& Private Key Certificates}{24}{section.8.1}}
\@writefile{toc}{\contentsline {section}{\numberline {8.2}What is a certificate authority}{24}{section.8.2}}
\@writefile{toc}{\contentsline {section}{\numberline {8.3}JSON Web Tokens}{24}{section.8.3}}
\@writefile{toc}{\contentsline {section}{\numberline {8.4}GDPR regulations}{24}{section.8.4}}
\@writefile{toc}{\contentsline {section}{\numberline {8.5}Privacy by design}{24}{section.8.5}}
\@writefile{toc}{\contentsline {chapter}{\numberline {9}Linux}{25}{chapter.9}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {9.1}OS Basics}{25}{section.9.1}}
\@writefile{toc}{\contentsline {section}{\numberline {9.2}Shell scripting}{25}{section.9.2}}
\@writefile{toc}{\contentsline {section}{\numberline {9.3}Cron jobs}{25}{section.9.3}}
\@writefile{toc}{\contentsline {section}{\numberline {9.4}Packet management}{25}{section.9.4}}
\@writefile{toc}{\contentsline {chapter}{\numberline {10}The Cloud}{26}{chapter.10}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {10.1}IaaS vs PaaS vs SaaS}{26}{section.10.1}}
\@writefile{toc}{\contentsline {section}{\numberline {10.2}AWS,Azure, IBM, Google Cloud basics}{26}{section.10.2}}
\@writefile{toc}{\contentsline {section}{\numberline {10.3}cloud vs on premise}{26}{section.10.3}}
\@writefile{toc}{\contentsline {section}{\numberline {10.4}up \& downsides}{26}{section.10.4}}
\@writefile{toc}{\contentsline {section}{\numberline {10.5}Security}{26}{section.10.5}}
\@writefile{toc}{\contentsline {chapter}{\numberline {11}Security Zone Design}{27}{chapter.11}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {11.1}How to secure a multi layered application}{27}{section.11.1}}
\@writefile{toc}{\contentsline {section}{\numberline {11.2}Cluster security with Kerberos}{27}{section.11.2}}
\@writefile{toc}{\contentsline {section}{\numberline {11.3}Kerberos Tickets}{27}{section.11.3}}
\@writefile{toc}{\contentsline {chapter}{\numberline {12}Stream Processing}{28}{chapter.12}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {12.1}Three methods of streaming | available}{28}{section.12.1}}
\@writefile{toc}{\contentsline {section}{\numberline {12.2}At Least Once}{28}{section.12.2}}
\@writefile{toc}{\contentsline {section}{\numberline {12.3}At Most Once}{29}{section.12.3}}
\@writefile{toc}{\contentsline {section}{\numberline {12.4}Exactly Once}{29}{section.12.4}}
\@writefile{toc}{\contentsline {section}{\numberline {12.5}Check The Tools!}{29}{section.12.5}}
\@writefile{toc}{\contentsline {chapter}{\numberline {13}Big Data}{30}{chapter.13}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {13.1}What is big data and where is the difference to data science and data analytics?}{30}{section.13.1}}
\@writefile{toc}{\contentsline {section}{\numberline {13.2}The 4Vs of Big Data | available}{30}{section.13.2}}
\@writefile{toc}{\contentsline {section}{\numberline {13.3}Why Big Data? | available}{31}{section.13.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {13.3.1}Planning is Everything}{32}{subsection.13.3.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {13.3.2}The Problem With ETL}{32}{subsection.13.3.2}}
\@writefile{lof}{\contentsline {figure}{\numberline {13.1}{\ignorespaces Common SQL Platform Architecture}}{32}{figure.13.1}}
\newlabel{fig:Bild1}{{13.1}{32}{Common SQL Platform Architecture}{figure.13.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {13.3.3}Scaling Up}{33}{subsection.13.3.3}}
\@writefile{lof}{\contentsline {figure}{\numberline {13.2}{\ignorespaces Scaling up a SQL Database}}{33}{figure.13.2}}
\newlabel{fig:Bild1}{{13.2}{33}{Scaling up a SQL Database}{figure.13.2}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {13.3.4}Scaling Out}{34}{subsection.13.3.4}}
\@writefile{lof}{\contentsline {figure}{\numberline {13.3}{\ignorespaces Scaling out a SQL Database}}{34}{figure.13.3}}
\newlabel{fig:Bild1}{{13.3}{34}{Scaling out a SQL Database}{figure.13.3}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {13.3.5}Please Don\IeC {\textquoteright }t go Big Data}{35}{subsection.13.3.5}}
\@writefile{toc}{\contentsline {chapter}{\numberline {14}Data Warehouse vs Data Lake}{36}{chapter.14}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {chapter}{\numberline {15}Hadoop Platforms | available}{37}{chapter.15}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {15.1}What is Hadoop}{37}{section.15.1}}
\@writefile{toc}{\contentsline {section}{\numberline {15.2}What makes Hadoop so popular? | available}{37}{section.15.2}}
\@writefile{lof}{\contentsline {figure}{\numberline {15.1}{\ignorespaces Hadoop Ecosystem Components}}{38}{figure.15.1}}
\newlabel{fig:Bild1}{{15.1}{38}{Hadoop Ecosystem Components}{figure.15.1}{}}
\@writefile{toc}{\contentsline {section}{\numberline {15.3}Hadoop Ecosystem Components}{38}{section.15.3}}
\@writefile{lof}{\contentsline {figure}{\numberline {15.2}{\ignorespaces Connections between tools}}{39}{figure.15.2}}
\newlabel{fig:Bild1}{{15.2}{39}{Connections between tools}{figure.15.2}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {15.3}{\ignorespaces Flume Integration}}{40}{figure.15.3}}
\newlabel{fig:Bild1}{{15.3}{40}{Flume Integration}{figure.15.3}{}}
\@writefile{toc}{\contentsline {section}{\numberline {15.4}Hadoop Is Everywhere?}{40}{section.15.4}}
\@writefile{toc}{\contentsline {section}{\numberline {15.5}Should you learn Hadoop?}{41}{section.15.5}}
\@writefile{toc}{\contentsline {subsubsection}{\nonumberline How does a Hadoop System architecture look like}{41}{section*.3}}
\@writefile{toc}{\contentsline {subsubsection}{\nonumberline What tools are usually in a with Hadoop Cluster}{41}{section*.4}}
\@writefile{toc}{\contentsline {section}{\numberline {15.6}How to select Hadoop Cluster Hardware}{41}{section.15.6}}
\@writefile{toc}{\contentsline {chapter}{\numberline {16}Is ETL still relevant for Analytics?}{42}{chapter.16}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {chapter}{\numberline {17}Docker}{43}{chapter.17}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {17.1}What is docker and what do you use it for | available}{43}{section.17.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {17.1.1}Don\IeC {\textquoteright }t Mess Up Your System}{43}{subsection.17.1.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {17.1.2}Preconfigured Images}{43}{subsection.17.1.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {17.1.3}Take It With You}{44}{subsection.17.1.3}}
\@writefile{toc}{\contentsline {section}{\numberline {17.2}Kubernetes Container Deployment}{44}{section.17.2}}
\@writefile{toc}{\contentsline {section}{\numberline {17.3}How to create, start,stop a Container}{45}{section.17.3}}
\@writefile{toc}{\contentsline {section}{\numberline {17.4}Docker micro services?}{45}{section.17.4}}
\@writefile{toc}{\contentsline {section}{\numberline {17.5}Kubernetes}{45}{section.17.5}}
\@writefile{toc}{\contentsline {section}{\numberline {17.6}Why and how to do Docker container orchestration}{45}{section.17.6}}
\@writefile{toc}{\contentsline {chapter}{\numberline {18}REST APIs}{46}{chapter.18}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {18.1}HTTP Post/Get}{46}{section.18.1}}
\@writefile{toc}{\contentsline {section}{\numberline {18.2}API Design}{46}{section.18.2}}
\@writefile{toc}{\contentsline {section}{\numberline {18.3}Implementation}{46}{section.18.3}}
\@writefile{toc}{\contentsline {section}{\numberline {18.4}OAuth security}{46}{section.18.4}}
\@writefile{toc}{\contentsline {chapter}{\numberline {19}Databases}{47}{chapter.19}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {19.1}SQL Databases}{47}{section.19.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.1.1}Database Design}{47}{subsection.19.1.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.1.2}SQL Queries}{47}{subsection.19.1.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.1.3}Stored Procedures}{47}{subsection.19.1.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.1.4}ODBC/JDBC Server Connections}{47}{subsection.19.1.4}}
\@writefile{toc}{\contentsline {section}{\numberline {19.2}NoSQL Stores}{47}{section.19.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.2.1}KeyValue Stores (HBase)}{47}{subsection.19.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.2.2}Document Store HDFS | available}{47}{subsection.19.2.2}}
\@writefile{lof}{\contentsline {figure}{\numberline {19.1}{\ignorespaces HDFS Master and Data Nodes}}{48}{figure.19.1}}
\newlabel{fig:Bild1}{{19.1}{48}{HDFS Master and Data Nodes}{figure.19.1}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {19.2}{\ignorespaces Distribution of Blocks for a 512MB File}}{49}{figure.19.2}}
\newlabel{fig:Bild1}{{19.2}{49}{Distribution of Blocks for a 512MB File}{figure.19.2}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.2.3}Document Store MongoDB}{49}{subsection.19.2.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.2.4}Hive Warehouse}{49}{subsection.19.2.4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.2.5}Impala}{49}{subsection.19.2.5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.2.6}Kudu}{49}{subsection.19.2.6}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.2.7}Time Series Databases}{49}{subsection.19.2.7}}
\@writefile{toc}{\contentsline {subsection}{\numberline {19.2.8}MPP Databases (Greenplum)}{49}{subsection.19.2.8}}
\@writefile{toc}{\contentsline {chapter}{\numberline {20}Data Processing / Analytics - Frameworks}{50}{chapter.20}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {20.1}MapReduce}{50}{section.20.1}}
\@writefile{lof}{\contentsline {figure}{\numberline {20.1}{\ignorespaces Mapping of input files and reducing of mapped records}}{51}{figure.20.1}}
\newlabel{fig:Bild1}{{20.1}{51}{Mapping of input files and reducing of mapped records}{figure.20.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.1.1}How does MapReduce work -- available}{52}{subsection.20.1.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.1.2}Example}{52}{subsection.20.1.2}}
\@writefile{lof}{\contentsline {figure}{\numberline {20.2}{\ignorespaces MapReduce Example of Time Series Data}}{53}{figure.20.2}}
\newlabel{fig:Bild1}{{20.2}{53}{MapReduce Example of Time Series Data}{figure.20.2}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.1.3}What is the limitation of MapReduce? -- available}{54}{subsection.20.1.3}}
\@writefile{lof}{\contentsline {figure}{\numberline {20.3}{\ignorespaces The Map Reduce Process}}{54}{figure.20.3}}
\newlabel{fig:Bild1}{{20.3}{54}{The Map Reduce Process}{figure.20.3}{}}
\@writefile{toc}{\contentsline {section}{\numberline {20.2}Apache Spark}{54}{section.20.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.2.1}What is the difference to MapReduce? -- available}{55}{subsection.20.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.2.2}How does Spark fit to Hadoop? -- available}{55}{subsection.20.2.2}}
\@writefile{lof}{\contentsline {figure}{\numberline {20.4}{\ignorespaces Hadoop vs Spark capabilities}}{55}{figure.20.4}}
\newlabel{fig:Bild1}{{20.4}{55}{Hadoop vs Spark capabilities}{figure.20.4}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.2.3}Where's the difference?}{55}{subsection.20.2.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.2.4}Spark and Hadoop is a perfect fit}{56}{subsection.20.2.4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.2.5}Spark on YARN:}{56}{subsection.20.2.5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.2.6}My simple rule of thumb:}{57}{subsection.20.2.6}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.2.7}Available Languages -- available}{57}{subsection.20.2.7}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.2.8}How to do stream processing}{57}{subsection.20.2.8}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.2.9}How to do batch processing}{57}{subsection.20.2.9}}
\@writefile{toc}{\contentsline {subsection}{\numberline {20.2.10}How does Spark use data from Hadoop -- available}{57}{subsection.20.2.10}}
\@writefile{lof}{\contentsline {figure}{\numberline {20.5}{\ignorespaces Spark Using Hadoop Data Locality}}{58}{figure.20.5}}
\newlabel{fig:Bild1}{{20.5}{58}{Spark Using Hadoop Data Locality}{figure.20.5}{}}
\@writefile{toc}{\contentsline {section}{\numberline {20.3}What is a RDD and what is a DataFrame?}{59}{section.20.3}}
\@writefile{toc}{\contentsline {section}{\numberline {20.4}Spark coding with Scala}{59}{section.20.4}}
\@writefile{toc}{\contentsline {section}{\numberline {20.5}Spark coding with Python}{59}{section.20.5}}
\@writefile{toc}{\contentsline {section}{\numberline {20.6}How and why to use SparkSQL?}{59}{section.20.6}}
\@writefile{toc}{\contentsline {section}{\numberline {20.7}Machine Learning on Spark? (Tensor Flow)}{59}{section.20.7}}
\@writefile{toc}{\contentsline {section}{\numberline {20.8}MLlib:}{59}{section.20.8}}
\@writefile{toc}{\contentsline {section}{\numberline {20.9}Spark Setup -- available}{59}{section.20.9}}
\@writefile{toc}{\contentsline {section}{\numberline {20.10}Spark Resource Management -- available}{60}{section.20.10}}
\@writefile{lof}{\contentsline {figure}{\numberline {20.6}{\ignorespaces Spark Resource Management With YARN}}{60}{figure.20.6}}
\newlabel{fig:Bild1}{{20.6}{60}{Spark Resource Management With YARN}{figure.20.6}{}}
\@writefile{toc}{\contentsline {chapter}{\numberline {21}Apache Kafka}{61}{chapter.21}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {21.1}Why a message queue tool?}{61}{section.21.1}}
\@writefile{toc}{\contentsline {section}{\numberline {21.2}Kakfa architecture}{61}{section.21.2}}
\@writefile{toc}{\contentsline {section}{\numberline {21.3}What are topics}{61}{section.21.3}}
\@writefile{toc}{\contentsline {section}{\numberline {21.4}What does Zookeeper have to do with Kafka}{61}{section.21.4}}
\@writefile{toc}{\contentsline {section}{\numberline {21.5}How to produce and consume messages}{61}{section.21.5}}
\@writefile{toc}{\contentsline {chapter}{\numberline {22}Machine Learning}{62}{chapter.22}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {22.1}Training and Applying models}{62}{section.22.1}}
\@writefile{toc}{\contentsline {section}{\numberline {22.2}What is deep learning}{62}{section.22.2}}
\@writefile{toc}{\contentsline {section}{\numberline {22.3}How to do Machine Learning in production | available}{62}{section.22.3}}
\@writefile{toc}{\contentsline {section}{\numberline {22.4}Why machine learning in production is harder then you think -- available}{63}{section.22.4}}
\@writefile{toc}{\contentsline {section}{\numberline {22.5}Models Do Not Work Forever}{63}{section.22.5}}
\@writefile{toc}{\contentsline {section}{\numberline {22.6}Where The Platforms That Support This?}{63}{section.22.6}}
\@writefile{toc}{\contentsline {section}{\numberline {22.7}Training Parameter Management}{64}{section.22.7}}
\@writefile{toc}{\contentsline {section}{\numberline {22.8}What\IeC {\textquoteright }s Your Solution?}{64}{section.22.8}}
\@writefile{toc}{\contentsline {section}{\numberline {22.9}How to convince people machine learning works | available}{64}{section.22.9}}
\@writefile{toc}{\contentsline {section}{\numberline {22.10}No Rules, No Physical Models}{65}{section.22.10}}
\@writefile{toc}{\contentsline {section}{\numberline {22.11}You Have The Data. USE IT!}{65}{section.22.11}}
\@writefile{toc}{\contentsline {section}{\numberline {22.12}Data is Stronger Than Opinions}{66}{section.22.12}}
\@writefile{toc}{\contentsline {chapter}{\numberline {23}Data Visualization}{67}{chapter.23}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {23.1}Android \& IOS}{67}{section.23.1}}
\@writefile{toc}{\contentsline {section}{\numberline {23.2}How to design APIs for mobile apps}{67}{section.23.2}}
\@writefile{toc}{\contentsline {section}{\numberline {23.3}How to use Webservers to display content}{67}{section.23.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {23.3.1}Tomcat}{68}{subsection.23.3.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {23.3.2}Jetty}{68}{subsection.23.3.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {23.3.3}NodeRED}{68}{subsection.23.3.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {23.3.4}React}{68}{subsection.23.3.4}}
\@writefile{toc}{\contentsline {section}{\numberline {23.4}Business Intelligence Tools}{68}{section.23.4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {23.4.1}Tableau}{68}{subsection.23.4.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {23.4.2}PowerBI}{68}{subsection.23.4.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {23.4.3}Quliksense}{68}{subsection.23.4.3}}
\@writefile{toc}{\contentsline {section}{\numberline {23.5}Identity \& Device Management}{68}{section.23.5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {23.5.1}What is a digital twin?}{68}{subsection.23.5.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {23.5.2}Active Directory}{68}{subsection.23.5.2}}
\@writefile{toc}{\contentsline {part}{\numberline {III}Building A Data Platform Example}{69}{part.3}}
\@writefile{toc}{\contentsline {chapter}{\numberline {24}My Big Data Platform Blueprint}{70}{chapter.24}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {paragraph}{\nonumberline THE BLUEPRINT}{70}{section*.5}}
\@writefile{lof}{\contentsline {figure}{\numberline {24.1}{\ignorespaces Platfrom Blueprint}}{70}{figure.24.1}}
\newlabel{fig:Bild1}{{24.1}{70}{Platfrom Blueprint}{figure.24.1}{}}
\@writefile{toc}{\contentsline {section}{\numberline {24.1}Ingest}{71}{section.24.1}}
\@writefile{toc}{\contentsline {section}{\numberline {24.2}Analyse / Process}{71}{section.24.2}}
\@writefile{toc}{\contentsline {section}{\numberline {24.3}Store}{72}{section.24.3}}
\@writefile{toc}{\contentsline {section}{\numberline {24.4}Display}{73}{section.24.4}}
\@writefile{toc}{\contentsline {chapter}{\numberline {25}Lambda Architecture}{74}{chapter.25}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {25.1}Batch Processing}{74}{section.25.1}}
\@writefile{lof}{\contentsline {figure}{\numberline {25.1}{\ignorespaces Batch Processing Pipeline}}{74}{figure.25.1}}
\newlabel{fig:Bild1}{{25.1}{74}{Batch Processing Pipeline}{figure.25.1}{}}
\@writefile{toc}{\contentsline {section}{\numberline {25.2}Stream Processing}{75}{section.25.2}}
\@writefile{lof}{\contentsline {figure}{\numberline {25.2}{\ignorespaces Stream Processing Pipeline}}{75}{figure.25.2}}
\newlabel{fig:Bild1}{{25.2}{75}{Stream Processing Pipeline}{figure.25.2}{}}
\@writefile{toc}{\contentsline {section}{\numberline {25.3}Should you do stream or batch processing?}{75}{section.25.3}}
\@writefile{toc}{\contentsline {section}{\numberline {25.4}Lambda Architecture Alternative}{76}{section.25.4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {25.4.1}Kappa Architecture}{76}{subsection.25.4.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {25.4.2}Kappa Architecture with Kudu}{76}{subsection.25.4.2}}
\@writefile{toc}{\contentsline {chapter}{\numberline {26}(}{77}{chapter.26}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {chapter}{\numberline {27}Thoughts On Choosing The Target Environment}{78}{chapter.27}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {27.1}Cloud vs On-Premise}{78}{section.27.1}}
\@writefile{toc}{\contentsline {section}{\numberline {27.2}Cloud Native or Independent Vendors}{78}{section.27.2}}
\@writefile{toc}{\contentsline {chapter}{\numberline {28}Thoughts On Choosing A Development Environment}{79}{chapter.28}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {28.1}Cloud As Dev Environment}{79}{section.28.1}}
\@writefile{toc}{\contentsline {section}{\numberline {28.2}Local Dev Environment}{79}{section.28.2}}
\@writefile{toc}{\contentsline {section}{\numberline {28.3}Data Architecture}{79}{section.28.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {28.3.1}Source Data}{79}{subsection.28.3.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {28.3.2}Analytics Requirements For Streaming}{79}{subsection.28.3.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {28.3.3}Analytics Requirements For Batch Processing}{79}{subsection.28.3.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {28.3.4}Data Visualization}{79}{subsection.28.3.4}}
\@writefile{toc}{\contentsline {section}{\numberline {28.4}Milestone 1 | Tool Decisions}{79}{section.28.4}}
\@writefile{toc}{\contentsline {part}{\numberline {IV}Case Studies}{80}{part.4}}
\@writefile{toc}{\contentsline {chapter}{\numberline {29}How I do Case Studies}{81}{chapter.29}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {29.1}Data Science @Airbnb}{81}{section.29.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {29.1.1}Data Science @Amazon}{81}{subsection.29.1.1}}
\@writefile{toc}{\contentsline {section}{\numberline {29.2}Data Science @Baidu}{81}{section.29.2}}
\@writefile{toc}{\contentsline {section}{\numberline {29.3}Data Science @Blackrock}{81}{section.29.3}}
\@writefile{toc}{\contentsline {section}{\numberline {29.4}Data Science @BMW}{82}{section.29.4}}
\@writefile{toc}{\contentsline {section}{\numberline {29.5}Data Science @Booking.com}{82}{section.29.5}}
\@writefile{toc}{\contentsline {section}{\numberline {29.6}Data Science @CERN}{82}{section.29.6}}
\@writefile{toc}{\contentsline {section}{\numberline {29.7}Data Science @Disney}{83}{section.29.7}}
\@writefile{toc}{\contentsline {section}{\numberline {29.8}Data Science @Drivetribe}{83}{section.29.8}}
\@writefile{toc}{\contentsline {section}{\numberline {29.9}Data Science @Dropbox}{83}{section.29.9}}
\@writefile{toc}{\contentsline {section}{\numberline {29.10}Data Science @Ebay}{83}{section.29.10}}
\@writefile{toc}{\contentsline {section}{\numberline {29.11}Data Science @Expedia}{83}{section.29.11}}
\@writefile{toc}{\contentsline {section}{\numberline {29.12}Data Science @Facebook}{83}{section.29.12}}
\@writefile{toc}{\contentsline {subsection}{\numberline {29.12.1}Data Science @Google}{84}{subsection.29.12.1}}
\@writefile{toc}{\contentsline {section}{\numberline {29.13}Data Science @@Grammarly}{84}{section.29.13}}
\@writefile{toc}{\contentsline {section}{\numberline {29.14}Data Science @ING Fraud}{84}{section.29.14}}
\@writefile{toc}{\contentsline {section}{\numberline {29.15}Data Science @Instagram}{84}{section.29.15}}
\@writefile{toc}{\contentsline {section}{\numberline {29.16}Data Science @LinkedIn}{84}{section.29.16}}
\@writefile{toc}{\contentsline {section}{\numberline {29.17}Data Science @Lyft}{84}{section.29.17}}
\@writefile{toc}{\contentsline {section}{\numberline {29.18}Data Science @NASA}{84}{section.29.18}}
\@writefile{toc}{\contentsline {section}{\numberline {29.19}Data Science @Netflix -- available}{85}{section.29.19}}
\@writefile{toc}{\contentsline {paragraph}{\nonumberline The Netflix batch processing pipeline}{85}{section*.6}}
\@writefile{lof}{\contentsline {figure}{\numberline {29.1}{\ignorespaces Old Netflix Batch Processing Pipeline}}{86}{figure.29.1}}
\newlabel{fig:Bild1}{{29.1}{86}{Old Netflix Batch Processing Pipeline}{figure.29.1}{}}
\@writefile{toc}{\contentsline {paragraph}{\nonumberline Know what customers want:}{86}{section*.7}}
\@writefile{toc}{\contentsline {paragraph}{\nonumberline Batch processing is not enough}{86}{section*.8}}
\@writefile{toc}{\contentsline {paragraph}{\nonumberline The trending now feature}{87}{section*.9}}
\@writefile{lof}{\contentsline {figure}{\numberline {29.2}{\ignorespaces Netflix Trending Now Feature}}{87}{figure.29.2}}
\newlabel{fig:Bild1}{{29.2}{87}{Netflix Trending Now Feature}{figure.29.2}{}}
\@writefile{toc}{\contentsline {paragraph}{\nonumberline Netflix real-time streaming architecture}{87}{section*.10}}
\@writefile{lof}{\contentsline {figure}{\numberline {29.3}{\ignorespaces Netflix Streaming Pipeline}}{88}{figure.29.3}}
\newlabel{fig:Bild1}{{29.3}{88}{Netflix Streaming Pipeline}{figure.29.3}{}}
\@writefile{toc}{\contentsline {section}{\numberline {29.20}Data Science @OLX}{88}{section.29.20}}
\@writefile{toc}{\contentsline {section}{\numberline {29.21}Data Science @OTTO}{89}{section.29.21}}
\@writefile{toc}{\contentsline {section}{\numberline {29.22}Data Science @Paypal}{89}{section.29.22}}
\@writefile{toc}{\contentsline {section}{\numberline {29.23}Data Science @Pinterest}{89}{section.29.23}}
\@writefile{toc}{\contentsline {section}{\numberline {29.24}Data Science @Salesforce}{90}{section.29.24}}
\@writefile{toc}{\contentsline {section}{\numberline {29.25}Data Science @Slack}{90}{section.29.25}}
\@writefile{toc}{\contentsline {section}{\numberline {29.26}Data Science @Spotify}{90}{section.29.26}}
\@writefile{toc}{\contentsline {section}{\numberline {29.27}Data Science @Symantec}{90}{section.29.27}}
\@writefile{toc}{\contentsline {section}{\numberline {29.28}Data Science @Tinder}{90}{section.29.28}}
\@writefile{toc}{\contentsline {section}{\numberline {29.29}Data Science @Twitter}{90}{section.29.29}}
\@writefile{toc}{\contentsline {section}{\numberline {29.30}Data Science @Uber}{90}{section.29.30}}
\@writefile{toc}{\contentsline {section}{\numberline {29.31}Data Science @Upwork}{91}{section.29.31}}
\@writefile{toc}{\contentsline {section}{\numberline {29.32}Data Science @Woot}{91}{section.29.32}}
\@writefile{toc}{\contentsline {section}{\numberline {29.33}Data Science @Zalando}{91}{section.29.33}}
\bibcite{einstein}{1}
\global\csname @altsecnumformattrue\endcsname
\global\@namedef{scr@dte@part@lastmaxnumwidth}{25.4799pt}
\global\@namedef{scr@dte@chapter@lastmaxnumwidth}{18.89992pt}
\global\@namedef{scr@dte@section@lastmaxnumwidth}{31.46349pt}
\global\@namedef{scr@dte@subsection@lastmaxnumwidth}{40.60228pt}
\global\@namedef{scr@dte@figure@lastmaxnumwidth}{25.58855pt}