\documentclass{beamer}
\usepackage{graphicx}
\usepackage{caption}
%\usepackage{subcaption}
\usepackage{color}
\usepackage{amssymb}
\usepackage{xcolor}
\usepackage{wrapfig}
\usepackage{amsmath}
\usepackage[
backend=bibtex,
sorting=none,
style=authoryear
]{biblatex}
\addbibresource{bibliography.bib}
\setbeamertemplate{navigation symbols}{}
\usetheme{CambridgeUS}
\usecolortheme{beaver}
\beamersetuncovermixins{\opaqueness<1>{25}}{\opaqueness<2->{15}}
%\pgfdeclareimage[height=2cm]{i4}{pic/acat.pdf}
%\author[Dzmitry Makatun]{\textbf{Dzmitry~Makatun} \inst{1} $^{,}$ \inst{2}}
\author[Dzmitry Makatun]{\textbf{Dzmitry~Makatun} \inst{1} \inst{3} \and J\'er\^ome~Lauret\inst{2}\\ \and Michal~\v{S}umbera \inst{1} \and Hana~Rudov\'a \inst{4}}
\title[FJFI, \v{C}VUT]{Distributed Data Processing for High Energy Physics}
\institute [KM, FJFI]
{
\inst{1}%
Nuclear Physics Institute, Academy of Sciences, Czech Republic
\pgfdeclareimage[height=2cm]{i1}{pic/ujf.pdf}
\and
\inst{2}%
Brookhaven National Laboratory, USA
\pgfdeclareimage[height=2cm]{i2}{pic/STAR.pdf}
\and
\inst{3}
Czech Technical University in Prague, Czech Republic
\pgfdeclareimage[height=2cm]{i3}{pic/ctu.pdf}
\and
\inst{4}
Faculty of Informatics, Masaryk University, Czech Republic
\pgfdeclareimage[height=2cm]{i6}{pic/fjfi.pdf}
\pgfdeclareimage[height=2cm]{i8}{pic/doe.pdf}
\vspace{-4mm}
\pgfuseimage{i1}
\pgfuseimage{i2}
\pgfuseimage{i3}
%\pgfuseimage{i4}
%\pgfuseimage{i5}
%\pgfuseimage{i6}
%\pgfuseimage{i8}
\vspace{-7mm}
\begin{center}
d.i.makatun@gmail.com
\end{center}
\vspace{-5mm}
}
\begin{document}
\date{\today}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}\frametitle{Outline}\tableofcontents
\end{frame}
\section{Introduction}
\begin{frame}\frametitle{Computations in HEP: what do we compute?}
\begin{figure}
\begin{center}
%\vspace{-1 cm}
\includegraphics[ height=0.4\textheight]{pic/rhic.jpg}
\hspace{0.5cm}
\includegraphics[ height=0.4\textheight]{pic/star.jpg}
\end{center}
\end{figure}
\vspace{-1 cm}
\begin{block}{}
\begin{itemize}
\item Brookhaven National Laboratory (\textbf{BNL}), Long Island, NY, USA
\item Relativistic Heavy Ion Collider (\textbf{RHIC}). In gold--gold ion collisions a quark-gluon plasma is created to study the primordial form of matter that existed in the universe shortly after the Big Bang.
\item Solenoid Tracker at RHIC (\textbf{STAR}). Collisions occur millions of times per second. Events of size 200~MB are processed at input rates of up to 100~Hz. The output data rate is $\sim$\textcolor{red}{30~MB/s}.
\end{itemize}
\end{block}
\end{frame}
\begin{frame}\frametitle{Computations in HEP: how do we compute?}
\begin{block}{}
\textbf{Data Production:} The raw data is processed to reconstruct events ($\sim$once per file).\\
\textbf{User Analysis:} The reconstructed events are then analyzed by scientists to discover new physics (each file is read many times).\\
All data (raw, reconstructed and analysis output) are stored.\\
31~PB of data are stored on tape; $\sim$12\,000 jobs run simultaneously (at the RCF alone).\\
\end{block}
\begin{figure}
\begin{center}
\includegraphics[ height=0.35\textheight]{pic/grid-net.png}
\hspace{0.1cm}
\includegraphics[ height=0.35\textheight]{pic/tiers.jpg}
\hspace{0.1cm}
\includegraphics[ height=0.35\textheight]{pic/cern-tier-chart.jpg}
\end{center}
\end{figure}
\end{frame}
\subsection{Motivation}
\begin{frame}\frametitle{Motivation}
\begin{footnotesize}
\begin{block}{[\cite{Zerola}]}
Efficient and controlled movement of replicated datasets within the Grid to satisfy multiple requests in the shortest time.
\begin{itemize}
% add picture
\item Select between several data sources.
\item Create optimal transfer paths, merge shared transfer paths of the same file.
\item Schedule transfers on links.
\end{itemize}
It was shown that global planning of \textbf{data transfers} over the Grid can outperform well-known heuristics (e.g., P2P, Xrootd reasoning).
\end{block}
\begin{block}{\textbf{New goal: Problem extension}}
Global planning for \textbf{jobs coupled with transfers} in a distributed environment.
\end{block}
\vspace{-2mm}
\begin{block}{Example of decisions}
\begin{itemize}
\item[?] Send a job to a site with a slow connection \textbf{or} wait for a free slot at the local site?
\item[?] Access data remotely \textbf{or} transfer it before the job starts?
\end{itemize}
\vspace{-2mm}
The heuristic in use \texttt{[pull a job when a CPU slot is free]} does not answer these questions.
\end{block}
\end{footnotesize}
\end{frame}
\subsection{Problem analysis}
\begin{frame}\frametitle{Case 1: Data production. Planning remote site usage. }
\begin{columns}[c] % the "c" option specifies center vertical alignment
\column{.6\textwidth} % column designated by a command
\begin{footnotesize}
\vspace{-11mm}
\begin{block}{}
\begin{itemize}
\item RAW data is located at BNL.
\item Computational resources are available at BNL and several remote sites.
\item 1 job per file.
\item 1 CPU per job.
\item Input size $\approx$ Output size
\item Output file has to be transferred back to BNL.
\item \textbf{How should we distribute a given set of files between sites to complete the processing faster?}
\end{itemize}
\end{block}
\begin{block}{}
Currently the number of remote jobs is adjusted manually to match the network throughput, \textbf{but} what if:
\begin{itemize}
\item[-] More sites
\item[-] Changing network load
\end{itemize}
This should be automated.
\end{block}
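\begin{block}{Illustrative bound (notation ours, not from the production setup)}
With link bandwidth $B$, job duration $t_{job}$ and file size $s$ (input $\approx$ output, so $\sim 2s$ crosses the link per job), remote CPUs can be kept busy only while
\begin{equation*}
n_{remote} \lesssim \frac{B \, t_{job}}{2s}.
\end{equation*}
\end{block}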
\end{footnotesize}
\column{.4\textwidth}
\vspace{-5mm}
\begin{figure}
\includegraphics[trim = 25mm 10mm 0mm 30mm, clip, width=1.2\textwidth]{pic/Data_production_schema2.pdf}
\end{figure}
\end{columns}
\end{frame}
\begin{frame}\frametitle{Case 2: Data production. Optimization. }
\begin{columns}[c] % the "c" option specifies center vertical alignment
\column{.5\textwidth} % column designated by a command
\begin{footnotesize}
\vspace{-20mm}
\begin{block}{New dimensions of the problem}
\begin{itemize}
\item Several possible data sources.
\item Real network topology: shared links, links between remote sites.
\item Limited storage at sites.
\item \textbf{Which file source to select?}
\item \textbf{What is the optimal transfer path?}
\end{itemize}
\end{block}
\begin{block}{Example: data-production at ANL [\cite{Balewski}]}
\begin{itemize}
\item ANL: many CPUs, but a slow connection and little disk space.
\item NERSC: fast connection, large disk.
\item Optimization: feed ANL from both the BNL and NERSC sites.
\end{itemize}
\end{block}
\end{footnotesize}
\column{.45\textwidth}
\begin{figure}
\begin{center}
\vspace{-15mm}
\includegraphics [width=1.2\textwidth]{pic/Data_production_schema_ANL2.pdf}
\end{center}
\end{figure}
\end{columns}
\end{frame}
\begin{frame}\frametitle{Case 3: User analysis. }
\begin{columns}[c] % the "c" option specifies center vertical alignment
\column{.5\textwidth} % column designated by a command
\begin{small}
\vspace{-10mm}
\begin{block}{New dimensions of the problem}
\begin{itemize}
\item Many copies of files exist.
\item Each file can be requested by multiple jobs.
\item The size of the analysis output is negligible compared to the input size.
\item The processing time estimates are imprecise.
\item \textbf{When and where to replicate the data?}
\end{itemize}
\end{block}
\end{small}
\column{.45\textwidth}
\begin{figure}
\begin{center}
\vspace{-15mm}
\includegraphics[ width=1.15\textwidth]{pic/user_analysis.pdf}
\end{center}
\end{figure}
\end{columns}
\end{frame}
\subsection{Existing solutions}
\begin{frame}\frametitle{STAR: setup for data production at a remote site (KISTI)}
\begin{figure}
\begin{center}
\includegraphics[ width=\textwidth]{pic/kisti_production.jpg}
\end{center}
\end{figure}
\begin{block}{}
\begin{itemize}
\item For better efficiency an ad-hoc setup is used [\cite{KISTI}].
\end{itemize}
\end{block}
\end{frame}
\begin{frame}\frametitle{Existing solutions (used in HENP)}
\begin{block}{Batch System + Distributed Data Management System (Independent)}
\begin{itemize}
\item PBS, Condor. [Pull a job from the global queue]
\item Xrootd, DPM. [The site which replies first is selected as the source]
\end{itemize}
\end{block}
\begin{block}{Data Trains (For user analysis)}
\begin{itemize}
\item Group jobs by input data $\longrightarrow$ pre-place data $\longrightarrow$ start jobs simultaneously $\longrightarrow$ kill the last x\% of jobs.
\item The train runs periodically ($\sim$once per day).
\item Controlled by train operators.
\end{itemize}
\end{block}
\begin{block}{Globus (Decoupling jobs and transfers)}
\begin{itemize}
\item Sends jobs to data.
\item Replicates the most ``popular'' files; relies on usage history.
\item Where to replicate? When to replicate? No transfer planning.
\end{itemize}
\end{block}
\end{frame}
\begin{frame}\frametitle{Related work (not in use in HENP)}
\begin{footnotesize}
\begin{block}{Bandwidth-Centric Allocation of Independent Tasks on Heterogeneous Platforms [\cite{Trees}]}
\begin{itemize}
\item Exact solution for maximum steady-state throughput.
\item The Grid network is modeled as a tree (no alternative paths). Single source/destination. Input path $=$ output path. Equal sizes of jobs/files.
\end{itemize}
\end{block}
\begin{block}{XSufferage [\cite{casanova2000heuristics}]
}
\begin{itemize}
\item Considers I/O transfer latency. Assigns jobs to hosts based on $\mathit{Sufferage} = \mathit{SecondBestEstimatedMakespan} - \mathit{BestEstimatedMakespan}$.
\item No path/source selection or transfer planning. Simplified network model. No storage model.
\end{itemize}
\end{block}
\begin{block}{Storage Affinity [\cite{santos2005exploiting}]}
\begin{itemize}
\item XSufferage + job replication: Executes copies of the same job at several clusters concurrently.
\item Simplified network model. CPU waste $\sim 25$--$60$\%.
\end{itemize}
\end{block}
\end{footnotesize}
\end{frame}
\begin{frame}\frametitle{Distributed resource management system. Architecture (Globus example) [\cite{Globus_scheduler}]}
\begin{block}{}
\begin{itemize}
\item The resource broker acts as a middle tier
between the user and the resources, performing resource
matching and job submission on the user's behalf.
\end{itemize}
\end{block}
\begin{figure}
\begin{center}
\includegraphics[ width=0.8\textwidth]{pic/resource-broker.jpg}
\end{center}
\end{figure}
\end{frame}
\section{Constraint Programming approach}
\subsection{Model}
\begin{frame}\frametitle{Data production problem}
\begin{block}{}
Create a global scheduler for the Grid which reasons about:\\
\hspace{1cm}1.~data~transfers, \hspace{1cm}2.~CPU~allocation, \hspace{1cm}3.~data~storage.
\end{block}
\begin{block}{Optimization}
\begin{itemize}
\item No resource (network links, data storage, CPUs) is oversaturated at any moment of time.
\item The jobs are executed where the data is pre-placed.
\item No excessive transfers or data replication.
\item Minimal overall makespan for a given set of tasks.
\end{itemize}
\end{block}
\begin{block}{}
\textbf{To solve the problem we apply Constraint Programming (CP), with its mature techniques for scheduling, planning and optimization.}
\end{block}
\end{frame}
\begin{frame}\frametitle{Data-production problem: Input.}
\vspace{-1cm}
\begin{center}
\includegraphics[trim= 30mm 0mm 10mm 0mm,clip,angle =-90, width=0.7\textwidth]{pic/network.pdf}
\end{center}
\vspace{-12mm}
\begin{block}{Assumptions}
In previous work [\cite{Zerola}] it was shown that:
\begin{itemize}
\item It is advantageous to plan and schedule jobs in chunks (splitting the whole set into portions).
\begin{itemize}
\item[+]More adaptability to changing environment.
\item[+]Faster plan creation.
\end{itemize}
\item The network links can be treated as unary resources: one file transfer at a time per link.
\end{itemize}
\end{block}
\end{frame}
\begin{frame}\frametitle{Solving procedure overview.}
\begin{enumerate}
\item \textbf{Initialization Stage}. Estimate the $\mathit{TimeLimit}$.
\item \textbf{Planning Stage}. Instantiate a subset of the domain variables using simplified constraints.
\begin{enumerate}[a.]
\item Assign jobs to computational nodes.
\item Select transfer paths for input and output files.
\item Additional constraints: load balance, etc.
\item Find a solution for the sub-problem.
\end{enumerate}
\item \textbf{Scheduling Stage}: define the start time of each operation.
\begin{enumerate} [a.]
\item Constraints on order of operations.
\item Cumulative constraints.
\item Minimize target function: (e.g. makespan).
\end{enumerate}
\end{enumerate}
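\begin{footnotesize}
\begin{block}{Scheduling-stage sketch (notation ours, not from the implementation)}
With start time $s_i$ and duration $d_i$ of operation $i$, the scheduler minimizes the makespan
\begin{equation*}
\min C_{\max}, \qquad C_{\max} \geq s_i + d_i \quad \forall i,
\end{equation*}
subject to cumulative constraints, e.g.\ at most $c_n$ jobs running at site $n$ at any time $t$:
\begin{equation*}
\big|\{\, i : s_i \leq t < s_i + d_i ,\ \mathit{site}(i) = n \,\}\big| \;\leq\; c_n .
\end{equation*}
\end{block}
\end{footnotesize}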
\end{frame}
\subsection{Testing simulations}
\begin{frame}\frametitle{Simulations based on log data: problem setup.}
\begin{columns}[c] % the "c" option specifies center vertical alignment
\column{.6\textwidth} % column designated by a command
\begin{small}
\vspace{-5mm}
\begin{block}{Input for simulations}
\begin{itemize}
\item Parameters of 2000 jobs taken from log files of data production for STAR at KISTI.
\item Jobs scheduled by chunks of 200.
\item The link to the remote site is slowed down in proportion to the \textbf{slowdown factor}.
\item The makespan is compared to that of local-only processing.
\end{itemize}
\end{block}
\begin{block}{Tested algorithms}
\begin{itemize}
\item Equal CPU load.\\ Processed by input order.
\item Data transferred by job. \\ Processed by input order.
\item Optimized.\\ Planner: minimize estimated makespan. Scheduler: minimize makespan.
\end{itemize}
\end{block}
\end{small}
\column{.4\textwidth}
\begin{figure}
\begin{center}
\vspace{-20mm}
\includegraphics [width=1.2\textwidth]{pic/test_environment2.pdf}
\end{center}
\end{figure}
\vspace{-1cm}
\begin{tiny}
\begin{center}
Constraints for storage capacity are omitted.
\end{center}
\end{tiny}
\end{columns}
\end{frame}
\begin{frame}\frametitle{Simulations based on log data: results.}
%\vspace{-1cm}
\begin{center}
\includegraphics[trim =5mm 5mm 5mm 10mm ,clip,width=0.8\textwidth]{pic/makespan_vs_slowdown4.pdf}
\end{center}
\end{frame}
\begin{frame}\frametitle{Results of simulations}
\begin{footnotesize}
\begin{itemize}
\item In a simulated environment, where the remote site has the same number of CPUs as the local site but the data transfer overhead is comparable to the job duration, compared to \textbf{local-only} processing:
\begin{itemize}
\item Maintaining \textbf{equal CPU load} at the local and remote sites increases the makespan more than twice;
\item Scheduling with \textbf{consideration of transfer overhead} can reduce the makespan by 15\%. \end{itemize}
\item The simulations based on log files have shown that the proposed approach systematically provides a smaller makespan and adapts to increasing transfer overheads better than the other simulated heuristics.
\item The proposed approach provides \textbf{optimization} and \textbf{automatic adaptation} to fluctuating resources, with no need for manual adjustment of the workflow at each site or tuning of heuristics.
\end{itemize}
\end{footnotesize}
\end{frame}
\section{Network flow model}
\begin{frame}\frametitle{Motivation}
\begin{block}{Drawbacks of CP model}
\begin{itemize}
\item Slow performance (NP-hard problem).
\item Unnecessary calculations:
\begin{itemize}
\item Path for each job.
\item Node selection for a particular job.
\item Order of jobs.
\item Makespan of a particular job.
\end{itemize}
\end{itemize}
\end{block}
\begin{block}{Network flow model}
\begin{itemize}
\item Idea: plan only the aggregate resource load, then distribute individual jobs accordingly. Planning time interval $\Delta T$; flow $=$ amount of data; link capacity $=$ maximum bandwidth.
\item The network flow maximization problem can be solved in polynomial time.
\end{itemize}
\end{block}
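\begin{footnotesize}
\begin{block}{Flow formulation sketch (notation ours)}
With $f_e$ the data volume planned for edge $e$ during $\Delta T$ and $b_e$ the link bandwidth, the planner solves a standard maximum-flow problem from source $s$ to sink $t$:
\begin{equation*}
\max \sum_{e \in \mathit{in}(t)} f_e
\quad \text{s.t.} \quad
0 \leq f_e \leq b_e \, \Delta T, \qquad
\sum_{e \in \mathit{in}(v)} f_e = \sum_{e \in \mathit{out}(v)} f_e \;\; \forall v \notin \{s, t\}.
\end{equation*}
\end{block}
\end{footnotesize}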
\end{frame}
\begin{frame}\frametitle{Input transfer planning: How much data can be transferred during the next planning interval $\Delta T$?}
Dummy edges encode the constraints on storage and CPUs at each site.
\begin{figure}[h]
\begin{center}
\includegraphics [trim= 30mm 30mm 30mm 30mm , clip, angle =-90, width=0.7\textwidth]{pic/real_network.pdf}
\end{center}
\label{real_network}
\end{figure}
\vspace{-3mm}
The output flow problem can be formulated similarly.
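\begin{footnotesize}
For illustration (notation ours): the CPU dummy edge of a site with $n_{cpu}$ free slots, mean job duration $t_{job}$ and mean input file size $s_{file}$ could be assigned the capacity
\begin{equation*}
c_{cpu} \approx n_{cpu} \cdot \frac{\Delta T}{t_{job}} \cdot s_{file},
\end{equation*}
i.e.\ the volume of input data the site can consume during one planning interval; the storage dummy edge is capped by the free disk space.
\end{footnotesize}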
\end{frame}
\begin{frame}\frametitle{Solving procedure}
\begin{block}{}
The input and output transfer problems can be solved independently under the following assumptions:
\begin{itemize}
\item Full-duplex links,
\item In a steady state, at each node $\textit{processed input data} = \beta \cdot \textit{created output data}$, where $\beta \leq 1$ is a constant.
\end{itemize}
\end{block}
\begin{block}{Solving procedure}
\begin{enumerate}
\item Calculate capacities of dummy edges using monitoring data.
\item Solve the problem for output data flows.
\item Recalculate remaining network capacity.
\item Solve the problem for input transfers.
\end{enumerate}
\end{block}
\end{frame}
\begin{frame}\frametitle{Plan execution}
\begin{columns}[c] % the "c" option specifies center vertical alignment
\column{.6\textwidth} % column designated by a command
\begin{block}{}
\begin{itemize}
\item A local handler at each node receives the plan:
\begin{itemize}
\footnotesize
\item Flow of each outgoing edge: how much data of each type should be sent over that link.
\item Flow of the dummy edge: how much data should be processed at this node.
\end{itemize}
\item When a new file arrives, the handler decides according to the plan and the current state (see the sketch on the next slide):
\begin{itemize}
\footnotesize
\item process the file (a transfer over the dummy edge),
\item \textbf{or} forward it over one of the links.
\end{itemize}
\item After each transfer, the remaining flow of the used edge is decreased by the file size.
\end{itemize}
\end{block}
\column{.4\textwidth}
\begin{figure}[h]
\includegraphics [trim= 30mm 30mm 30mm 30mm , clip, angle =-90, width=\textwidth]{pic/handler_process.pdf}
\end{figure}
\begin{figure}[h]
\includegraphics [trim= 30mm 30mm 30mm 30mm , clip, angle =-90, width=\textwidth]{pic/handler_forwarddia.pdf}
\end{figure}
\end{columns}
\end{frame}
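\begin{frame}[fragile]\frametitle{Plan execution: handler sketch}
\begin{footnotesize}
A minimal sketch of the handler decision from the previous slide, assuming the plan arrives as remaining per-edge data volumes (all names are hypothetical, not the actual implementation):
\end{footnotesize}
\begin{footnotesize}
\begin{verbatim}
# plan: remaining planned flow (bytes) per outgoing edge;
# the key "cpu" stands for the dummy (processing) edge.
def handle_file(plan, file_size):
    # One possible policy: pick the edge with the largest
    # remaining planned flow (the plan fixes only the totals).
    edge = max(plan, key=plan.get)
    if plan[edge] < file_size:
        return None  # plan exhausted: wait for the next interval
    plan[edge] -= file_size  # decrease the remaining flow
    return edge  # "cpu" = process locally, else forward over link
\end{verbatim}
\end{footnotesize}
\end{frame}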
\begin{frame}\frametitle{Network flow model: summary}
\begin{small}
\begin{block}{}
\begin{itemize}
\item This procedure is expected to compute feasible data transfers such that
the CPUs in the Grid stay busy with computational jobs while local disk capacities are not exceeded.
\item The model can provide scalability and adaptability to a changing environment.
\item A planner using the model has been implemented and tested.
\item Simulations using real log data of the STAR experiment are being implemented in GridSim (a modeling tool widely used in the Grid community).
\item Next steps: extend the model to a broader scope of requirements (several data sources, real network topology, heterogeneous resources and load); improve the planner performance to enable online scheduling in a real environment; deploy to the data production system of the STAR experiment, test, and collect statistics.
\end{itemize}
\end{block}
\end{small}
\end{frame}
\section{Enabling Caching}
\begin{frame}\frametitle{Enabling Caching}
\begin{block}{}
\vspace{-5mm}
\begin{itemize}
\item The performance of cache algorithms implementing the watermarking concept was simulated over a wide range of cache sizes and low marks (the clean-up loop is sketched after the algorithm list). Three access patterns from two different experiments were used as simulation input; 27 algorithms were tested in 90 simulation setups.
\end{itemize}
\end{block}
\vspace{-2mm}
\begin{block}{Motivation}
\begin{itemize}
\item Cache of data-transfer tools.
\item Management of local data replicas.
\end{itemize}
\end{block}
\begin{footnotesize}
\vspace{-2mm}
\begin{block}{Access patterns used for simulation}
\begin{itemize}
\item[]\textbf{\textcolor{red}{STAR1:}} RCF@BNL, \textbf{Tier-0} of the STAR experiment, Xrootd log, user analysis, 3-month period (June--August 2012).
\item[]\textbf{\textcolor{green}{STAR2:}} RCF@BNL, \textbf{Tier-0} of the STAR experiment, Xrootd log, user analysis, 7-month period (August 2012 -- February 2013).
\item[]\textbf{\textcolor{blue}{GOLIAS:}} FZU Prague, part of a \textbf{Tier-2} of ATLAS; ATLAS and AUGER experiments, DPM log, user analysis + production, 3-month period (November 2012 -- February 2013). AUGER accounts for less than 1\% of total requests.
\end{itemize}
\end{block}
\end{footnotesize}
\end{frame}
\begin{frame}\frametitle{Selected caching algorithms}
\begin{footnotesize}
\begin{itemize}
\item \textbf{First-In-First-Out (FIFO)}: evicts files in the same order they entered the cache.
\item\textbf{Least-Recently-Used (LRU)}: evicts the set of files which were not used for the longest period of time.
\item \textbf{Least-Frequently-Used (LFU)}: evicts the set of files which were requested the fewest times since they entered the cache.
\item \textbf{Most Size (MS)}: evicts the set of files which have the largest size.
\item \textbf{A}daptive \textbf{R}eplacement \textbf{C}ache (\textbf{ARC}). Two lists: L1 holds files with access count $=1$, L2 files with access count $>1$. LRU is applied to both lists. The length of each list depends on $p = \text{cache hits in L1} / \text{cache hits in L2}$.
\item \textbf{L}east \textbf{V}alue based on \textbf{C}aching \textbf{T}ime (\textbf{LVCT}).
Deletes files according to the value of the Utility Function.
\begin{equation}
\mathit{UtilityFunction} = \frac{1}{\mathit{CachingTime} \times \mathit{FileSize}}
\end{equation}
where the \textbf{Caching Time} of a file $F$ is the sum of the sizes of all files accessed since the last request for $F$.
\end{itemize}
\end{footnotesize}
\end{frame}
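\begin{frame}[fragile]\frametitle{Watermarking clean-up: sketch}
\begin{footnotesize}
A minimal sketch of the watermarking clean-up common to all simulated algorithms, assuming a cache object with the listed methods (names hypothetical): once usage exceeds the high mark, files are evicted in the algorithm-specific order until usage falls to the low mark.
\end{footnotesize}
\begin{footnotesize}
\begin{verbatim}
# used/capacity in bytes; evict_order() yields files in the
# algorithm-specific order (FIFO, LRU, LFU, MS, ARC, LVCT, ...).
def clean_up(cache, low=0.70, high=0.95):
    if cache.used <= high * cache.capacity:
        return  # still below the high mark: nothing to do
    for f in cache.evict_order():
        cache.remove(f)  # frees f.size bytes
        if cache.used <= low * cache.capacity:
            break  # stop once usage reaches the low mark
\end{verbatim}
\end{footnotesize}
\end{frame}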
\begin{frame}\frametitle{What caching algorithm is the best? }
\vspace{-5mm}
\begin{block}{Average improvement over FIFO}
\begin{table}\begin{tabular}{|l|c|c|}
\hline
Algorithm & cache hits & cache data hits\\ \hline
MS & \textcolor{green}{116} \% & \textcolor{red}{-20} \% \\ \hline
LRU & 8 \% & 5 \% \\ \hline
LFU & -27 \% & -18 \% \\ \hline
ARC & 13\% & \textcolor{green}{11}\%\\ \hline
LVCT & \textcolor{green}{86} \%& 2 \%\\ \hline
\end{tabular}
\end{table}
\end{block}
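\begin{footnotesize}
Our reading of the two metrics (definitions assumed, not quoted from the study):
\begin{equation*}
\text{cache hits} = \frac{\#\,\text{requests served from cache}}{\#\,\text{all requests}}, \qquad
\text{cache data hits} = \frac{\text{bytes served from cache}}{\text{bytes requested}}.
\end{equation*}
\end{footnotesize}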
\begin{footnotesize}
\begin{block}{For studied access patterns}
\begin{itemize}
\item Regardless of cache size, Tier level and the specifics of the experiment, LVCT and ARC appear to be the most efficient caching algorithms.
\item If the goal is to minimize the makespan caused by transfer startup overhead, the LVCT algorithm should be selected.
\item If the goal is to minimize the network load, the ARC algorithm is the better option.
\end{itemize}
\end{block}
\end{footnotesize}
\end{frame}
\begin{frame}\frametitle{Dependence of cache performance on low mark}
\vspace{-0.4cm}
\begin{tiny} cache size / storage size $= 0.1$, high mark $= 0.95$ \end{tiny}
\begin{figure}
\vspace{-0.4cm}
\begin{center}
\centering
\includegraphics[width=\textwidth]{pic/low-basic_color.pdf}
%\includegraphics[trim = 0mm 0mm 145mm 0mm,clip,height = 60mm]{pic/legendReady.pdf}
\end{center}
\end{figure}
\vspace{-0.5cm}
$\bullet$ The difference between Tier-2 and Tier-0 access patterns leads to distinct cache performance.\\
$\bullet$ With a higher low mark, the number of clean-ups increases, and so does the overall performance.\\
\end{frame}
\section{Conclusion}
\begin{frame}\frametitle{Conclusion}
\begin{block}{}
\begin{itemize}
\item Automated and scalable planning and optimization of distributed computations is in high demand in data-intensive fields such as High Energy and Nuclear Physics. Recent work has revealed the potential of global planning for this task.
\item A CSP for scheduling of data production on the Grid was formulated.
\item The simulations based on log data have shown that the proposed approach systematically provides a smaller makespan and adapts to increasing transfer overheads better than the other simulated heuristics.
\item A network flow model has been developed to address the limitations of the CP model; its further development and testing are ongoing.
\item The performance of a wide range of caching algorithms was simulated using access patterns derived from log files of HENP experiments; the most suitable ones were selected for data management in HENP computations.
\end{itemize}
\end{block}
\end{frame}
\section{Aims of the PhD research}
\begin{frame}\frametitle{Aims of the PhD research}
\begin{footnotesize}
To develop and implement a general methodology for \textbf{planning of data-intensive computations in a distributed environment}, with primary focus on \textbf{data production} in HENP.
\begin{block}{Goals}
\begin{itemize}
\item \textbf{Mathematical model.} General mathematical model for data production planning. Suitable level of abstraction, extensible.
\item \textbf{Scalable planner.} Petabytes of data, hundreds of computational sites, real-time planning.
\item \textbf{Adaptability to dynamic changes:} Background network traffic, cluster load by other tasks, downtime, errors, failures.
\item \textbf{Extended versions of the problem.} Several data sources, real network topology, several experiments (users), user analysis case, resource heterogeneity.
\item \textbf{Evaluation using real log data.} STAR data production, other HENP experiments, scalability tests.
\item \textbf{Deployment.} Integration into distributed RMS. Deployment to the data production system of the STAR experiment. Testing, statistics collection.
\end{itemize}
\end{block}
\end{footnotesize}
\end{frame}
\begin{tiny}
\renewcommand*{\bibfont}{\tiny}
\printbibliography
\end{tiny}
\end{document}