-
Notifications
You must be signed in to change notification settings - Fork 0
/
FYPReport.tex
2166 lines (1948 loc) · 141 KB
/
FYPReport.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
\documentclass[a4paper,oneside,12pt]{book}
\usepackage[utf8]{inputenc} % Use UTF-8 encoding
\usepackage{textgreek} % For Greek letters
\newcommand{\thesistitle}{Multi-Pivots QuickSort} % Your thesis title, this is used in the title and abstract
\newcommand{\degree}{B.A. Integrated Computer Science} % Replace with your degree name, this is used in the title page and abstract
\newcommand{\typeofthesis}{Final Year Project} % dissertation, Final Year Project, report, etc.
\newcommand{\authorname}{Shi Su} % Your name, this is used in the title page and PDF stuff
%% Do not put your Student ID in the document, as TCD will not publish
%% documents that contain both your name and your Student ID.
\newcommand{\supervisor}{Dr. David Gregg} % replace with the name of your supervisor
\newcommand{\keywords}{Sort, Pivot, Multi-Pivots, Rust, Perf, Cachegrind} % Replace with keywords for your thesis
\newcommand{\school}{\href{https://www.tcd.ie/scss/}{School of Computer Science and Statistics}} % Your school's name and URL, this is used in the title page
%Edited by HS for engineering
%% Comment out the next line if you don't want a department to appear
%\newcommand{\department}{\href{http://researchgroup.university.com}{Department Name}} % Your research group's name and URL, this is used in the title page
%% Language and font encodings
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
%% Bibliographical stuff
\usepackage[]{cite}
%% Document size
% include showframe as an option if you want to see the boxes
\usepackage[a4paper,top=2.54cm,bottom=2.54cm,left=2.54cm,right=2.54cm,headheight=16pt]{geometry}
\setlength{\marginparwidth}{2cm}
%% Useful packages
\usepackage{amsmath}
\usepackage[autostyle=true]{csquotes} % Required to generate language-dependent quotes in the bibliography
\usepackage[pdftex]{graphicx}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage[colorlinks=true, allcolors=black]{hyperref}
\usepackage{hyperxmp}
\usepackage{caption} % if no caption, no colon
% \usepackage{sfmath} %use sans-serif in the maths sections too, but this line would cause error TODO:
\usepackage[parfill]{parskip} % Begin paragraphs with an empty line rather than an indent
\usepackage{setspace} % to permit one-and-a-half or double spacing
\usepackage{enumerate} % fancy enumerations like (i) (ii) or (a) (b) and suchlike
\usepackage{booktabs} % To thicken table lines
\usepackage{fancyhdr}
\usepackage{xcolor} % to get TCD colour on headings
% \numberwithin{equation}{chapter} %HS edit for (chapter.equation)
\pagestyle{plain} % Embrace simplicity!
\definecolor{tcd_blue}{RGB}{5, 105, 185}
%% It's personal taste but...
%% Uncomment the following block if you want your name and ID at the top of
%% (almost) every page.
%\pagestyle{fancy}
%\fancyhf{} % sets both header and footer to nothing
%\renewcommand{\headrulewidth}{0pt}
%\cfoot{\thepage}
%\ifdefined\authorid
%\chead{\it \authorname\ (\authorid)}
%\else
%\chead{\it \authorname}
%\fi
%% End of block
%% It is good practise to make your font sans-serif to improve the accessibility of your document. Comment out the following line to disable it (but you really should not)
\renewcommand{\familydefault}{\sfdefault} %use the sans-serif font as default
%% If you insist on not using sans-serif (please don't), consider using Palatino instead of the LaTeX standard
%\usepackage{mathpazo} % Use the Palatino font by default if you prefer it to Computer Modern
%% Format Chapter headings appropriately
\usepackage{titlesec}
\titleformat{\chapter}[hang]{\normalfont\huge\bfseries\color{tcd_blue}}{\thechapter}{1cm}{}{}
\title{\thesistitle}
\author{\authorname}
\hypersetup{
pdftitle=\thesistitle, % Set the PDF's title to your title
pdfauthor=\authorname, % Set the PDF's author to your name
pdfkeywords=\keywords, % Set the PDF's keywords to your keywords
pdfsubject=\degree, % Set the PDF's keywords to your keywords
pdfinfo={
pdfsupervisor=\supervisor, % Set the PDF's supervisor to your supervisor
%pdfcosupervisor=\cosupervisor, % Set the PDF's cosupervisor to your cosupervisor if using
}
}
\usepackage{float}
\usepackage{microtype}
\usepackage{xspace}
\bibliographystyle{plainurl}% the recommended bibstyle
\usepackage{placeins}
\usepackage{algorithm}% http://ctan.org/pkg/algorithms
\usepackage{algpseudocode}
\usepackage{hyperref}
\usepackage{fourier}
\usepackage{tikz}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{patterns}
\usepackage{array}
\usepackage{mathtools}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
\DeclareMathAlphabet{\mathcal}{OMS}{cmsy}{m}{n}
\SetMathAlphabet{\mathcal}{bold}{OMS}{cmsy}{b}{n}
\newcommand{\bigO}{\mathcal{O}}
\newcommand{\cousize}{{\bf uint}\xspace}
\newcommand{\bs}{\ensuremath{B}\xspace}
\newcommand{\LineFor}[2]{%
\State\algorithmicfor\ {#1}\ \algorithmicdo\ {#2} \algorithmicend\ \algorithmicfor%
}
\newcommand{\LineIf}[2]{%
\State\algorithmicif\ {#1}\ \algorithmicthen\ {#2} \algorithmicend\ \algorithmicif%
}
\newcommand{\LineWhile}[2]{%
\State\algorithmicwhile\ {#1}\ \algorithmicdo\ {#2} \algorithmicend\ \algorithmicwhile%
}
\newcommand{\pluseq}{\ensuremath{\mathrel{{+}\!\!\;{=}}}}
\newcommand{\minuseq}{\ensuremath{\mathrel{{-}\!\!\;{=}}}}
\newcommand{\plusplus}{\ensuremath{{+}\!\!\;{+}}}
\newcommand{\minusminus}{\ensuremath{{-}\!\!\;{-}}}
\newcommand\ips{\textsf{IPS$^4$o}}
\newcommand\blockqs{\textsf{BlockQS}}
\newcommand\stdsort{\textsf{stdsort}}
\newcommand\lomutoone{\textsf{BlockLomuto1}}
\newcommand\lomutotwo{\textsf{BlockLomuto2}}
\definecolor{gruen}{rgb}{0.95,0.95,0.95}
\definecolor{dunkelgruen}{rgb}{0.35, 0.7, 0.67}
\definecolor{hellgruen}{rgb}{0.84, 0.7, 0.39}
\newcommand{\gruen}{\color{gruen}}
\newcommand{\dunkelgruen}{\color{dunkelgruen}}
\newcommand{\hellgruen}{\color{hellgruen}}
\begin{document}
\input{title.tex}
\pagenumbering{arabic}
\section*{\Huge\textcolor{tcd_blue}{Declaration}}
\vspace{1cm}
I hereby declare that this \typeofthesis\ is entirely my own work and that it has not been submitted as an exercise for a degree at this or any other university.
\vspace{1cm}
I have read and I understand the plagiarism provisions in the General Regulations of the University Calendar for the current year, found at \url{http://www.tcd.ie/calendar}.
\vspace{1cm}
I have completed the Online Tutorial on avoiding plagiarism `Ready Steady Write', located at \url{http://tcd-ie.libguides.com/plagiarism/ready-steady-write}.
\vspace{1cm}
I consent / do not consent to the examiner retaining a copy of the thesis beyond the examining period, should they so wish (EU GDPR May 2018).
\vspace{1cm}
% I agree that this thesis will not be publicly available, but will be available to TCD staff and students in the University’s open access institutional repository on the Trinity domain only, subject to Irish Copyright Legislation and Trinity College Library conditions of use and acknowledgement. \textbf{Please consult with your supervisor on this last item before agreeing, and delete if you do not consent}
% \vspace{3cm}
\hfill\includegraphics[height=2cm]{signature.png}
\chapter*{Abstract}
\begin{itemize}
\item \textbf{Purpose of the study}
This study aims to comprehensively explore the efficiency and effectiveness of QuickSort algorithms utilizing multiple pivots and block partitioning techniques. The goal is to elucidate the advantages and limitations of Multi-Pivot QuickSort and BlockQuickSort, with a focus on balancing performance considerations and analysing the impact of varying the number of pivots on performance.
\item \textbf{Methodology:}
The study implements various QuickSort algorithms using the Rust programming language. These include:
\begin{itemize}
\item Hoare and Lomuto's 1-pivot, Yaroslavskiy's 2-pivot, Kushagra's 3-pivot, author's custom 4-pivot classical schemes
\item BlockQuickSort with 1-pivot Hoare, 1 \& 2-pivot Lomuto partition
\item BlockQuicksort with a novel 2-pivot layout
\end{itemize}
The performance of these algorithms will be evaluated using tools such as perf, cachegrind, and AMD uProf. Detailed analysis, including flamegraphs depicting execution time costs at each code line, will be conducted to provide insights into algorithmic efficiency and behavior.
\item \textbf{Findings \& Implications}
\begin{itemize}
\item The improvements achieved through the utilization of multiple pivots have seemingly reached a plateau. Despite the 4-Pivot method demonstrating the most outperforming runtime costs, the incremental gains do not compensate for the increased complexity.
\item Most of the cache misses have been eliminated for QuickSort algorithms thanks to the modern CPU cache managements, with branch misses emerging as the primary bottleneck. The 1-Pivot Hoare's Block Partition scheme shows significant potential in mitigating branch misses,
and ranks as the top performer among the tested algorithms, indicating its promising future, though the greatest challenge of complexities brought by increased pivots still remains.
\item The novel layout of 2-Pivot Lomuto's Block Partition exposed the limitations introduced by additional pivots. The pattern of comparing with one pivot at a time is still the only way to go by far for block partitioning.
\end{itemize}
\end{itemize}
% \chapter*{Lay Abstract}
% Similar to the actual abstract in terms of the information, but written for a non-specialist. So no jargon, no acronyms. Explain to a member of the general public what this project entailed. Should be no longer than the actual abstract.
% This must be on a separate page.
\newpage
\onehalfspacing\raggedright %\raggedright turns off justification and hyphenation
\section*{\Huge\textcolor{tcd_blue}{Acknowledgements}}
I would like to thank my supervisor Dr. David Gregg for his rich insights, guidance, and support throughout the course of this project.
I am grateful for his patience, encouragement, and constructive feedback, which have been invaluable in shaping this work.
My families and friends have been a constant source of support and encouragement, and I am deeply grateful for their unwavering belief in me.
\newpage \tableofcontents
\clearpage
\section{Introduction}
Sorting algorithms on modern computers have been playing an important role in Computer Science and date back to the early 50s and 60s.
It is the great efforts that humans have put into the field of developing faster sorting algorithms that shape the vital parts forming
all the significant infrastructures of the vast computer world like databases, data centers, cluster networks etc.
In the realm of sorting algorithms, QuickSort, introduced by computer scientist Tony Hoare \cite{HoareQuickSort} to the public,
has made its way into the libc of the GNU/Linux system and stood the test of time as one of the most classical and widely used methods for sorting.
Its divide-and-conquer strategy has been widely used and taken into research to fully utilize the performance of sorting.
The basic QuickSort picks an arbitrary pivot element,
partitions the array into 2 segments, one with elements less than the pivot and the other greater.
It then recursively sorts these segments using the same method. While the original QuickSort method has proven effective,
the demand of further optimizations and the endless pursuit of efficiency has led to many variations to explore, one of which is the Multi-Pivot QuickSort.
This thesis is my shallow attempt to unveil the benefits of Multi-Pivot QuickSort, exploring the effects more pivots can bring,
followed by some leveraged implementations of Multi-Pivot QuickSort and other variations alike and derived, analysis on the pros and cons that arise from multiple pivots.
\subsection{History and Related work}
When QuickSort was first brought to the fore a great number of variations and modifications were put into research over the years,
such as sorting with a better partition method and a more optimal strategy to select the arbitrary pivot element.
The innovative approach of Multi-pivot QuickSort bloomed, exemplified by Vladimir Yaroslavskiy\cite{Yaroslavskiy},
Bentley and Bloch's algorithm (YBB Algorithm) in 2009, which was also highlighted by Wild and Nebel, adopted in Sun's Java 7 runtime library,
introduces the use of two pivots. Contrary to initial expectations, studies have revealed that employing multiple pivots can enhance the performance,
challenging prior assumptions derived from the dual-pivot proposal by Sedgewick \cite{Sedgewick} in 1978 where he analyzed a dual-pivot approach,
which was deemed inferior to the classical QuickSort in his research. Furthermore, Kushagra presented a 3-pivot QuickSort \cite{Kushagra} algorithm at ALENEX,
which has attracted significant attention, further pushing researchers to broaden the boundaries of the multi-pivot strategy. But how does it deal with the worst cases and how does it perform in practice?
One common pitfall in the performance downgrade of the conventional QuickSort is the degenerated case,
which could often arise when the input array is already mostly sorted or when certain patterns in the data cause the algorithm to consistently choose poor pivots
that fail to split the data into evenly sized partitions.
In such cases, the partitioning process may result in highly imbalanced partitions where the size of one partition is significantly larger than the other, leading to suboptimal performance and
potentially degrading the time complexity to $\bigO(n^2)$ instead of the expected $\bigO(n\log n)$ for QuickSort. This can be a serious issue when it comes to scalability on large datasets, thus
wiser choices of pivot selections could decrease the possibilities of such cases from happening, such as the median-of-3 method and its variations of median-of-5 and median-of-7.
But on mostly ascending or descending arrays, whichever pivot we choose doesn't matter, the worst case is inevitable,
and it might not be the ideal approach to prevent the worst case from happening. Real world applications also bring concerns to us that duplicated elements are not rare,
and the median-of-n method could be easily affected by the duplicated elements, which could lead to the same worst case as the original QuickSort.
Of course we can set a threshold to set the minimum length to use the median-of-n method, but it doesn't fix the problem permanently.
There are some other variations of QuickSort that have been proposed to mitigate these specific issues, such as the Introsort algorithm \cite{Introsort} by David Musser,
which switches to Heapsort when the recursion depth exceeds a certain threshold. Thus the worst running time is guaranteed to be $\bigO(n\log n)$.
And this is where Multi-pivot QuickSort comes into play. Assuming we are dealing with randomly generated arrays,
since we don't know the exact size of each partition, the more pivots we choose, the more partitions we divide the array into,
the less chance it will be that we encounter imbalanced sizes of partitions.
Moreover, choosing more pivots will significantly reduce the chance we access the same elements again and the maximum depth of recursion,
which can be translated into better running time. With n pivots we are dividing the array into n+1 partitions, the same things happen on the sub-partitions
in the next level of recursion and we can conclude the maximum depth of recursion would be $\log_{n+1} L$ given \textbf{n} as the number of pivots and \textbf{L} for the length of array.
The greater n is, the smaller the depth. Better memory behaviors can bring effects that can not be achieved by any of other traditional means such as choosing a better pivot value
among samples extracted from different parts of the array, but the drawbacks are also obvious,
branch misprediction could be terrible as the different outcomes of comparisons can largely impact on the runtime performance.
\vspace{1em}
\hypertarget{ref:BranchMisses}{}
\textit{This phenomenon was first observed by Kaligosi and Sanders (2006) on a Pentium 4 Prescott CPU, a processor with an extremely long pipeline, where the branch misprediction penalty was so high that a very skewed pivot choice outperformed the typically optimal median pivot, even though the latter leads to much fewer executed instructions in total.
The effect was not reproducible, however, on the slightly different Pentium 4 Willamette. Here two effects counteract: a biased pivot makes branches easier to predict but also gives unbalanced partition sizes. Brodal and Moruz have shown that this is a general trade-off in comparison-based sorting: One can only save comparisons at the price of additional branch misses.}
--- Reference: The Analysis of Branch Misses In QuickSort \cite{AnalysisOfBranchMissesInQuickSort}
\vspace{1em}
The above phenomenon could also cause a delay of 14 to 19 cycles on a typical Intel desktop CPU or 20 to 25 cycles on AMD's Zen CPUs, which could lead to huge efficiency loss for our algorithm,
and this number only grows as the number of pivots increases and on more complicated architectures. These are the trade-offs we have to consider when we are choosing the number of pivots,
although these side effects can be mitigated by the use of branchless comparisons and modern CPUs are able to run conditional move operations (CMOV family instructions on x86 architecture) to avoid branches,
these overheads still remain as concerns, but it could relieve the conflicting situations between total comparisons and the cost of branch mispredictions in some ways.
Apart from that, it could be really tricky when it comes to moving the elements to the right places among multiple partitions, especially when the number of pivots is large.
Additionally, comparisons are the second most expensive operations next to swaps we are dealing with, and the more pivots we choose, the more comparisons we have to make.
Even for the case of using only one pivot, the average number of comparisons remains suboptimal, being $1.38n\log n + \bigO(n)$ compared with the $n\log n + \bigO(n)$ of the Merge Sort,
but the overall instructions are much less than the average of Merge Sort and with proper pivot selection strategy, it can still be reduced to a certain extent.
% As the number of pivots increase, it is important that we take the worst cases into consideration.
% Will we end up wasting too much time on comparing with badly-chosen similar pivots that split the array into imbalanced partitions,
% one of which might have much larger size than the rest? Or Even with the relatively higher amount of branch mispredictions,
% will we still conquer the barriers of cache misses caused by long-lasting swaps of elements far away and reach a shorter time as the n grows?
% That's the riddle I'm going to research in the next section.
Fortunately, an efficient approach has been put forward by Stefan Edelkamp and Armin Weiß,
in their thesis of BlockQuickSort \cite{BlockQuickSort}
which evens the odds of both branch mis-prediction and cache misses which uses one or more buffer blocks of constant sizes on the stack
to store the index offsets of out-of-order elements, swap them altogether in a second pass to rearrange them in order
after scanning a large chunk of array with the size of the block.
It has eliminated almost all the $if$ branches during comparisons, except inevitable branches produced by terminating situation of $for$ loops which traverse the whole array.
Their approach realized a significant speedup up to 80\% faster compared with the GCC implementation of std sort
with only one single pivot. The BlockQuickSort has been adapted to the Multi-Pivot QuickSort and the results are also promising,
the improvements on contiguous memory access significantly improves the spatial locality and reduces both the cache misses and branch mis-predictions.
Branchless comparisons are also adapted which plays an important role in contributing to the huge performance boost by reducing the overhead
when increasing the counter of elements stored in the block buffer(s).
This novel approach has also developed variations that could effectively make the use of different partitioning methods and multiple pivots selected.
Meanwhile, in the pursuit of advancing the efficiency of the QuickSort algorithm,
my investigation led me to explore various modifications,
with a particular focus on the pattern-defeating QuickSort (PDQSort) proposed by Orson Peters.
PDQSort leverages the average-case performance of randomized QuickSort with the rapid worst-case execution of heapsort,
while concurrently achieving linear time complexity for slices exhibiting specific patterns.
Notably, PDQSort employs a judicious application of randomization to prevent the degenerated case from happening.
When it does, PDQSort will strategically shuffle or reverse the entire array to simplify the sorting and potentially could increase the speed.
If all the counter measures fail, it also has the final backup plan of switching to Heapsort when bad choices of pivots or the depth of recursions exceed the limit,
learning the lesson from Introsort.
In terms of pivot selection, it uses a fixed seed when generating a pseudo-random index to ensure the reproducibility and deterministic behavior.
Furthermore, it also detects instances where randomly selected elements from the original array appear in a descending order, treating this as a hint that the slice may be reversed.
The combination of these optimizations has demonstrated a significant enhancement in performance, showcasing PDQSort as up to twice faster than the original QuickSort algorithm.
This outcome illuminates the transformative impact that a comprehensive analysis on the pattern of arrays has on the speed of sorting algorithms,
not to mention it is a successful combination of BlockQuickSort's partition method,
Heap sort when dealing with worst cases and Insertion sort when it comes to small sub-arrays of sizes less than the cache line size.
Inspired by PDQSort's success, I dive into the diverse variations of QuickSort,
motivating a search for novel patterns finely tweaked to enhance the existing QuickSort algorithms.
\clearpage
\subsection{Contributions}
\begin{itemize}
\item Present the variants of Multi-Pivot QuickSort and implement a 4-Pivots QuickSort and evaluate N-Pivots QuickSort from N = 1, 2, 3, 4 experimentally.
Comparing the implementations, times, instructions, cache misses and branch mis-predictions with a duration of 3 seconds warm-up time.
\item Present a new layout of Multi-Pivot BlockQuickSort that increases the block buffer usage rate but turned out to have worse performance than the original BlockQuickSort.
Analyze this method and compare it with other block partitioning variations, list the pros and cons, and conclude what lessons we can learn from them.
\end{itemize}
\section{Preliminaries}
\subsection{Branch Misses}
Branch mispredictions are one of the most common performance bottlenecks in modern CPU architectures.
When the CPU encounters a conditional branch instruction like if statement, while and for loops, etc., it has to predict the outcome of the branch before the condition is evaluated.
A typical modern CPU will decide which branch to follow, prefetch the instructions and data from the memory, decode the subsequent instructions and execute them in advance to reduce the latency of the execution.
Because of the speculative execution, the CPU can execute the instructions in parallel, and if the prediction is correct, a lot of time can be saved, otherwise it has to stall the pipeline, flush the instructions and start over.
The branch misprediction penalty is usually around 10 to 20 cycles, depending on the runtime cycle length of pipelines, which can cause a huge overhead.
\subsection{How to avoid branch mispredictions}
One way to avoid branch mispredictions is to use branchless comparisons. The branchless comparison is a technique that uses bitwise operations to compare two values and generate a mask that represents the result of the comparison.
For example, to compare two integers a and b, we can use the following code:
\begin{verbatim}
int mask = (a < b) - (a > b);
\end{verbatim}
The mask will be 1 if a < b, -1 if a > b, and 0 if a == b. We can then use the mask to perform conditional operations without using branches.
This technique can be used to avoid branch mispredictions in sorting algorithms, where comparisons are the most common operations.
Another way to go around is the CMOV family of instructions (conditional moves), which are available on modern CPUs. In x86 assembly, a CMOV instruction can be used to conditionally move a value from one register to another based on the condition flags, similar to the mask generated by the branchless comparison.
An example of using it to swap two integers a and b if a < b is shown below:
\begin{verbatim}
mov eax, a
mov ebx, b
cmp eax, ebx
mov ecx, eax   ; save a copy of a; CMOV does not modify the flags
cmovl eax, ebx ; eax = b, if a < b
cmovl ebx, ecx ; ebx = original a, if a < b
\end{verbatim}
which in readable C code would be:
\begin{verbatim}
if (a < b) {
int temp = a;
a = b;
b = temp;
}
\end{verbatim}
but without the branch misprediction overhead, this technique is particularly useful when optimizing sorting algorithms that rely heavily on comparisons.
A straightforward variation could be casting boolean values to integers and adding them up to an accumulator variable, which could be used to determine how many elements are less than the pivot.
The pseudo code is given as:
\begin{verbatim}
int less = 0;
for (int i = 0; i < n; i++) {
less += (A[i] < pivot);
}
\end{verbatim}
Here the less variable will be incremented by 1 if A[i] is less than the pivot, and by 0 otherwise, and this could be used to determine the index where the pivot should be placed in a branch-free way.
\subsection{Cache Misses}
Caches, which are small and fast memory units located on the CPU, are used to store frequently accessed data and instructions to reduce the latency of memory access.
When the CPU needs to access data or instructions, it will first check the cache, if the data is found in the cache, it is called a cache hit, otherwise, it is called a cache miss.
The closer a cache level is to the CPU, the more expensive its manufacturing cost and the smaller its size. The cache is usually divided into several levels: L1, L2, L3, etc.
L1 and L2 caches are usually private to each core, while L3 cache is shared among all the cores in the same CPU.
When the CPU needs to access data, it will first check the L1 cache, if the data is not found, it will check the L2 cache, and then the L3 cache, and finally the main memory, accelerating the memory access.
In the context of sorting algorithms, it is conventional to switch to insertion sort for small subarrays on the order of a cache line in size; the exact threshold is typically around 9 to 30 elements, varying with the architecture.
\subsection{Classical QuickSort}
\begin{algorithm}[H]
\caption{QuickSort with Hoare Partition}\label{HoarePartition}
\begin{algorithmic}[1]
\Procedure{QuickSort}{$A, l, r$}
\If{$l < r$}
\State $p \gets \textsc{HoarePartition}(A, l, r)$
\State \textsc{QuickSort}(A, l, p-1)
\State \textsc{QuickSort}(A, p+1, r)
\EndIf
\EndProcedure
\Procedure{HoarePartition}{$A, l, r$}
\State $P \gets A[l]$ \Comment{Pivot}
\State $i \gets l - 1$
\State $j \gets r + 1$
\While{$i < j$}
\Repeat
\State $j \gets j - 1$
\Until{$A[j] < P$}
\Repeat
\State $i \gets i + 1$
\Until{$A[i] > P$}
\State \textsc{Swap}($A[i], A[j]$)
\EndWhile
\State \textsc{Swap}($A[l], A[j]$) \Comment{Put pivot in the middle}
\State \textbf{return} $j$
\EndProcedure
\end{algorithmic}
\end{algorithm}
For QuickSorts with single pivot value, the partitioning procedure splits the array into 2 parts where the left part are elements less than the pivot and the right part being greater,
separated by the pivot value in the middle. Then the main body recursively sorts the left and right parts to rearrange all the elements in the correct order. Hoare's partition method
uses double pointers starting at the leftmost and the rightmost element in the array and scanning towards the middle until they meet. Each pointer scans for mis-placed elements
which belong to the other partition by comparing the current element to the pivot.
Out-of-order elements are swapped in pairs to the right place and the scanning procedure continues until all elements
are moved to the places where they belong.
\begin{algorithm}[H]
\caption{QuickSort with Lomuto Partition}\label{LomutoPartition}
\begin{algorithmic}[1]
\Procedure{QuickSort}{$A, l, r$}
\If{$l < r$}
\State $p \gets \textsc{LomutoPartition}(A, l, r)$
\State \textsc{QuickSort}(A, l, p-1)
\State \textsc{QuickSort}(A, p+1, r)
\EndIf
\EndProcedure
\Procedure{LomutoPartition}{$A, l, r$}
\State $P \gets A[r]$ \Comment{Pivot}
\State $i \gets l$
\For{$j \gets l$ \textbf{to} $r-1$}
\If{$A[j] < P$}
\State \textsc{Swap}($A[i], A[j]$)
\State $i \gets i + 1$
\EndIf
\EndFor
\State \textsc{Swap}($A[i], A[r]$) \Comment{Put pivot in the middle}
\State \textbf{return} $i$
\EndProcedure
\end{algorithmic}
\end{algorithm}
The Lomuto partition method takes another approach: it scans the array in sequential order from left to right using a single pointer, incrementing the index by 1 each time,
and swaps all the elements less than the pivot to the left side indiscriminately,
resulting in redundant swaps as it fails to check whether the elements are already in their correct positions. Despite its simplicity, both of these 2 partition methods are straightforward,
and share the same idea of partitioning with similar structures: scanning, identifying misplaced elements, swapping and recursively repeating the procedure until the whole array is sorted.
It is generally believed that Hoare's partition method is more efficient than the Lomuto's, primarily due to its reduced number of swaps, enhanced partitioning strategy and memory efficiency.
Hoare's partition scheme minimizes unnecessary swaps compared to Lomuto's method, which tends to perform redundant swaps for arrays with many duplicated elements or already sorted elements.
Nevertheless, Lomuto's scheme boasts a simpler implementation with fewer auxiliary variables used, contributing to its wide adoption in many programming languages and libraries.
However, these theoretical analyses must be validated through empirical experiments and brought into practice to see if the assumptions hold true, as the performance of the two partition methods could vary depending on the dataset's characteristics and the size of the array.
Empirical studies are crucial to verify the assumptions and to determine the most suitable method for certain scenarios and contexts.
\subsection{rotate-n operations}
\hypertarget{rotate-n}
The rotate-n operation is a common operation in the QuickSort algorithm, where we shift the elements by their indices in batch to get rid of the duplicated swaps, and to make the partitioning process more plain and simple to understand.
This improves slightly the performance of the QuickSort algorithm, and is widely used in tweaked Multi-Pivot QuickSort. The rotate-n operation is defined as follows:
\begin{verbatim}
rotaten(A, [index1, index2, ... indexn]);
\end{verbatim}
which is equivalent to the following code:
\begin{verbatim}
T temp = A[index1];
A[index1] = A[index2];
...
A[indexn] = temp;
\end{verbatim}
In Rust I implemented the rotate-n operation as follows:
\begin{verbatim}
#[inline(always)]
pub unsafe fn rotate_$n<T>(arr: *mut T, idx: [usize; $n]) {
let tmp = ptr::read(arr.add(idx[0]));
for i in 1..$n {
ptr::copy_nonoverlapping(arr.add(idx[i]), arr.add(idx[i - 1]), 1);
}
ptr::copy_nonoverlapping(&tmp, arr.add(idx[$n - 1]), 1);
}
\end{verbatim}
Using direct pointer manipulation, we can avoid the overhead of the Rust borrow checker and array boundary checking to maximize the performance.
We will see how this technique fully displays its power in the Cyclic Permutation\hyperlink{CyclicPermutation}{[→]}.
\subsection{Block Partitioning}
The following pseudocode is extracted from the BlockQuickSort paper by Stefan Edelkamp and Armin Weiß \cite{BlockQuickSort} who hold the copyrights of the original code.
This method introduces a parallel-looking and branch misprediction friendly version of the classical QuickSort algorithm that makes use of a block buffer of constant size to store the index offsets of out-of-order elements on both sides of the array, and swaps them in the second pass to rearrange them.
In the actual implementation I used the block size of 128 suggested by the examples with the least time costs in the paper, as the index offsets in this case would be 0 to 127 (\textbf{0b0111 1111}), stored as 1 byte for each cell in the buffer.
A block size of 128, which in turn would be 128 bytes in total, neatly fits into 2 cache lines, each line 64 bytes on the test machine.
Notably, it is also reflected in their paper that the block size of over 256 bytes will bring down the performance and is less stack memory friendly, especially on large data structures like 10-dimensional array of 64-bit vectors and 'records'\hypertarget{ref:record}, referring to 21-dimensional array of 32-bit integers.
An advantageous aspect of selecting a power-of-2 block size is that it can be easily optimized by the compiler to use bitwise operations to calculate the offsets. Loop unrolling inside the LLVM compiler could also give us a hand to optimize the code further and reduce the overhead of the loop termination condition check.
Furthermore, they are placed on stack to get rid of the extra memory allocation overheads on heap. Let's dive into the details of its implementation by following the pseudocode.
\begin{algorithm}[H]
\small
\caption{Block Partition Hoare}\label{BlockPartition}
\begin{algorithmic}[1]
\Procedure{BlockPartitionHoare}{$A[\ell,\dots, r]$, pivot}
\State \cousize $\mathrm{offsets}_L[0, \dots, \bs - 1], \mathrm{offsets}_R[0, \dots, \bs - 1]$
\State \cousize $\mathrm{start}_L,\mathrm{start}_R, \mathrm{num}_L,\mathrm{num}_R \gets 0$
\While{$r - \ell + 1 > 2 \bs$}\Comment{start main loop}
\If{$\mathrm{num}_L = 0$}\Comment{if left buffer is empty, refill it}
\State $\mathrm{start}_L \gets 0$
\For{$i = 0,\dots, \bs - 1$}
\State $\mathrm{offsets}_L[\mathrm{num}_L ] \gets i$\Comment{Will this unconditional write slow it down?}\hypertarget{UnconditionalWrite}
\State $\mathrm{num}_L \pluseq (\mathrm{pivot} \geq A[\ell + i] )$\Comment{branchless comparison to add counter}
\EndFor
\EndIf
\If{$\mathrm{num}_R = 0$}\Comment{if right buffer is empty, refill it}
\State $\mathrm{start}_R \gets 0$
\For{$i = 0,\dots, \bs - 1$}
\State $\mathrm{offsets}_R[\mathrm{num}_R ] \gets i$
\State $\mathrm{num}_R \pluseq (\mathrm{pivot} \leq A[r - i] )$ \Comment{branchless comparison to add counter}
\EndFor
\EndIf
\State \cousize num $= \min(\mathrm{num}_L, \mathrm{num}_R)$
% \For{$j = 0,\dots, \mathrm{num} - 1$}
% \State swap($A\bigl[\ell + \mathrm{offsets}_L[\mathrm{start}_L + j]\bigr], A\bigl[r- \mathrm{offsets}_R[\mathrm{start}_R + j]\bigr]$)\Comment{rearrangement}
% \EndFor
\State temp $\gets A\bigl[\ell +
\mathrm{offsets}_L[\mathrm{start}_L]\bigr]$ \Comment{rearrangement using cyclic permutation}
\State $A\bigl[\ell +
\mathrm{offsets}_L[\mathrm{start}_L]\bigr] \gets
A\bigl[r- \mathrm{offsets}_R[\mathrm{start}_R]\bigr]$
\For{$j = 1,\dots, \mathrm{num} - 1$}
\State $A\bigl[r - \mathrm{offsets}_R[\mathrm{start}_R + j - 1]\bigr] \gets
A\bigl[\ell + \mathrm{offsets}_L[\mathrm{start}_L + j]\bigr]$
\State $A\bigl[\ell + \mathrm{offsets}_L[\mathrm{start}_L + j]\bigr]
\gets A\bigl[r - \mathrm{offsets}_R[\mathrm{start}_R + j ]\bigr]$
\EndFor
\algstore{BlockPartition}
\end{algorithmic}
\end{algorithm}
\begin{algorithm}[H]
\small
\caption{Block Partition Hoare (Continued)}
\begin{algorithmic}[1]
\algrestore{BlockPartition}
\State $A\bigl[r - \mathrm{offsets}_R[\mathrm{start}_R + \mathrm{num} - 1]\bigr] \gets$temp
\State $\mathrm{num}_L, \mathrm{num}_R \minuseq \mathrm{num}$;
$\mathrm{start}_L, \mathrm{start}_R \pluseq \mathrm{num}$ \Comment{update the counters for buffers}
\LineIf{$(\mathrm{num}_L = 0)$} {$\ell \pluseq \bs$} \Comment{if left buffer is empty, move the left pointer}
\LineIf{$(\mathrm{num}_R = 0)$} {$r \minuseq \bs$} \Comment{Vice versa}
\EndWhile\Comment{end main loop}
\State rearrange remaining elements with the standard Hoare's method
\EndProcedure
\end{algorithmic}
\end{algorithm}
In the presented pseudocode, we witness a notable demonstration of how branchless comparisons are incorporated and function. The counter of elements less than the pivot is incremented by 1 if the condition is met, achieved by converting the result into a boolean value and adding it to the counter.
This elegant approach eliminates the occurrences of branching on counter manipulations. The same technique is applied to the right buffer simultaneously, and the minimum number of elements in both buffers is calculated to ensure the correct number of elements are swapped, also in a branchless manner.
The block partitioning method represents a novel approach that not only reduces the number of swaps but also improves the spatial locality of both the memory access and cache access.
I replaced the original swap function in the rearrangement phase with a cyclic permutation method also outlined in the referenced thesis, a finely tuned way to rearrange the elements in the buffer blocks.
For better understanding, we can compare this cyclic permutation to a rotate-n operation where n stands for the number of elements that need to be swapped to the right places.
Consequently, there may be elements left in one of the buffer blocks, but we will just shift the other block towards the middle and rearrange the elements left behind in the next iteration.
For testing machine with 32MB L3 cache, the block size of 128 bytes would be a suitable choice as not only it would fit into 2 cache lines, but for the maximum testing size of $10^7$ elements, the maximum recursion depth would be $\ceil\log_{2} 10^7 = 24$,
the maximum stack usage would be around $24 \times 128\ \mathrm{bytes} = 3\ \mathrm{KB}$, a small amount for both memory and cache even with the little extra space required by the RIP and RBP registers and other local variables, ensuring its cache friendliness and cache efficiency.
Another thing we might be concerned about is the \hyperlink{UnconditionalWrite}{unconditional write} of the offsets in the buffer blocks, which overwrites the previous value unless the offset index was incremented by 1 in the last step of the scan.
This operation incurs little overhead thanks to the write-back policy of the cache supported by modern CPUs: a write replaces the old value in the cache line, and the line is flushed to main memory only when the value is final,
avoiding the multiple expensive direct main-memory manipulations that would stall the entire pipeline. Its cost is much less than that of write-through operations, which update the main memory every time a write occurs.
Eventually, the buffer blocks will be written back to the main memory in a deterministic manner.
This is ensured by the write-back policy of the cache line, a common and stable feature in modern CPUs that avoids excessive memory access.
To offer a clearer visual comprehension of the cyclic permutation, a simple figure illustrating how it works is provided right below\hyperlink{fig:cyclicpermu}{[→]}.
\begin{center}
\begin{figure}[H]
\hypertarget{fig:cyclicpermu}{}
\caption{Cyclic Permutation}
\centering
% \vspace*{+0.1\textheight}
\includegraphics[width=1\textwidth]{cyclicpermu.drawio.pdf}
\end{figure}
\end{center}
\begin{algorithm}[H]
\small
\caption{Cyclic Permutation}\label{CyclicPermutation}
\begin{algorithmic}
\State temp $\gets A\bigl[\ell +
\mathrm{offsets}_L[\mathrm{start}_L]\bigr]$ \Comment{(1) preserve the last element in temp variable}
\State $A\bigl[\ell +
\mathrm{offsets}_L[\mathrm{start}_L]\bigr] \gets
A\bigl[r- \mathrm{offsets}_R[\mathrm{start}_R]\bigr]$\Comment{(2) start with the first one}
\For{$j = 1,\dots, \mathrm{num} - 1$}\Comment{main cyclic for loop}
\State $A\bigl[r - \mathrm{offsets}_R[\mathrm{start}_R + j - 1]\bigr] \gets
A\bigl[\ell + \mathrm{offsets}_L[\mathrm{start}_L + j]\bigr]$
\State $A\bigl[\ell + \mathrm{offsets}_L[\mathrm{start}_L + j]\bigr]
\gets A\bigl[r - \mathrm{offsets}_R[\mathrm{start}_R + j ]\bigr]$
\EndFor
\State $A\bigl[r - \mathrm{offsets}_R[\mathrm{start}_R + \mathrm{num} - 1]\bigr] \gets$
temp \Comment{(3) put the preserved element back to the last place}
\end{algorithmic}
\end{algorithm}
Here we mapped the pseudocode in parts to the steps in the figure: the first step is to preserve the last element in the left buffer block in a temporary variable,
then we perform the cyclic permutation in the main loop, shifting misplaced elements from the buffers on one side to the other to make sure they are well partitioned, and finally we put the preserved element back to the last place in the right buffer block.
This rotate-n like method is basically made up of multiple copies through pointers in order to reduce the number of swaps to the minimum, reducing an amortized $\frac{n}{2}$ writes to the temp variable to $\frac{n}{2 \times \mathrm{BLOCK\_SIZE}} = \frac{n}{256}$, a giant leap ahead.
We also have adaptations of the Block Lomuto's partition method. Comprehension won't be hard on the basis of our fundamental understanding of the classical Lomuto's partition method.
\begin{algorithm}[t!]
\small
\caption{One-Pivot Block Partition Lomuto}\samepage\label{algo:single:pivot:partitioning}
\textbf{procedure} \lomutoone($\textit{A}[1..\textit{n}]$)
\begin{algorithmic}[1]
\Require $\textit{n} > 1, \text{Pivot in $\textit{A}[\textit{n}]$}$
\State $\texttt{p} \gets \textit{A}[\textit{n}]$;
\State \textbf{integer} $\text{block}[0, \ldots, \textit{B} - 1]$, $\texttt{i}, \texttt{j} \gets 1$, $\texttt{num} \gets 0$
\While{$\texttt{j} < \textit{n}$}
\State $\texttt{t} \gets \text{min}(\textit{B}, \textit{n} - \texttt{j} )$;
\For{$\texttt{c} \gets 0; \texttt{c} < \texttt{t}; \texttt{c} \gets \texttt{c} + 1$}
\State $\text{block}[\texttt{num}] \gets \texttt{c}$;
\State $\texttt{num} \gets \texttt{num} + (\texttt{p} > \textit{A}[\texttt{j} + \texttt{c}])$;
\EndFor
\For{$\texttt{c} \gets 0; \texttt{c} < \texttt{num}; \texttt{c} \gets \texttt{c} + 1$}
\State \text{Swap $\textit{A}[\texttt{i}]$ and $\textit{A}[\texttt{j} + \text{block}[\texttt{c}]]$}
\State $\texttt{i} \gets \texttt{i} + 1$
\EndFor
\State $\texttt{num} \gets 0$;
\State $\texttt{j} \gets \texttt{j} + \texttt{t}$;
\EndWhile
\State Swap $\textit{A}[\texttt{i}]$ and $\textit{A}[\textit{n}]$;\\
\Return $\texttt{i}$;
\end{algorithmic}
\end{algorithm}
\begin{figure}[t!]
\vspace*{-1em}
\centering
\begin{tikzpicture}[xscale = 1.0]
% main array
\draw[fill=hellgruen!30] (0,0) rectangle (1.5, 0.5);
\draw[fill=dunkelgruen!30] (1.5,0) rectangle (3.5, 0.5);
\draw[fill=dunkelgruen!30] (7.5,0) rectangle (8, 0.5);
\fill[fill=dunkelgruen!30] (4.5,0.25) rectangle (7.5, 0.5);
\fill[fill=hellgruen!30] (4.5,0) rectangle (7.5, 0.25);
\fill[fill=dunkelgruen!30] (3.5,0) rectangle (3.75, 0.5);
\fill[fill=hellgruen!30] (3.75,0) rectangle (4.0, 0.5);
\fill[fill=hellgruen!30] (4,0) rectangle (4.25, 0.5);
\fill[fill=dunkelgruen!30] (4.25,0) rectangle (4.5, 0.5);
\draw (0,0) rectangle (8, 0.5);
\draw (7.5, 0) -- (7.5, 0.5);
\draw (1.5, 0) -- (1.5, 0.5);
\draw (3.5, 0) -- (3.5, 0.5);
\draw[dotted, thick] (5.5, 0) -- (5.5, 0.5);
\node at (7.75, 0.25) {$p$};
\node at (0.75, 0.25) {$< p$};
\node at (2.5, 0.25) {$\geq p$};
\node at (6.5, 0.25) {? $\cdots$ ?};
\node at (1.65, -0.25) {\small$\uparrow$};
\node at (1.65, -0.65) {$\texttt{i}$};
\node at (3.65, -0.25) {\small$\uparrow$};
\node at (3.65, -0.65) {$\texttt{j}$};
\node at (4.65, -0.25) {\small$\uparrow$};
\node at (4.65, -0.65) {$\texttt{j} + \texttt{c}$};
\draw [decorate, decoration={brace,amplitude=5pt},yshift=2pt](3.5,0.5)--(5.5,0.5) node [midway,yshift=15pt]{$B$};
%\draw [step = .25, blue, opacity=0.3] (0, -1.5) grid (\columnwidth, 0.5);
%block
\draw (6.25,-1) rectangle (8.25, -0.5);
\draw (6.25, -1) -- (6.25, -0.5);
\draw (6.75, -1) -- (6.75, -0.5);
\draw (7.25, -1) -- (7.25, -0.5);
\draw (7.75, -1) -- (7.75, -0.5);
\draw (8.25, -1) -- (8.25, -0.5);
\node at (6.5, -0.75) {1};
\node at (7, -0.75) {2};
\node at (7.5, -0.75) {3};
\node at (7.5, -1.25) {\small$\uparrow$};
\node at (7.5, -1.6) {$\texttt{num}$};
\node at (6.25, -1.3) {$\texttt{block}$};
\end{tikzpicture}
\vspace{-1.0em}
%\includegraphics[width=\columnwidth]{fig-lomuto-block.png}
\caption{BlockQuickSort with Lomuto's partitioning scheme.} %Picture depicts the situation where Algorithm~\ref{algo:single:pivot:partitioning} is currently on Line~7 with $\texttt{c} = 4$. So far, the block contains the indexes 1 and 2, representing that $A[\texttt{j} + 1]$ and $A[\texttt{j} + 2]$ are smaller than $p$. In general, given that $\texttt{c}$ has value $c$ and $\texttt{num}$ is \textit{num}, $\texttt{block}[0..\textit{num} - 1]$ contains the indexes (relative to $\texttt{j}$) of all misplaced elements in $A[\texttt{j}..\texttt{j} + c]$. Unlabeled elements have only half the width of the array cell depicting $p$ in the picture.}
\label{fig:lomuto:block:invariant}
\end{figure}
Partition left to the index j is the sub-result of last iteration, we are currently scanning on the block which covers the range of $A[j..j+B-1]$, and the block buffer is used to store the index offsets of the misplaced elements inside.
The first element with index 0 is neglected for being blue -- greater than the pivot, and the yellow elements with index 1 and 2 -- less than the pivot, are stored in the buffer. The third blue element with index 2, temporarily stored in the buffer, will be replaced soon by the next misplaced item in the next iteration that deals with unknown values that can be either less or greater than the pivot.
\begin{algorithm}[t!]
\small
\caption{Two-Pivot Block Partition Lomuto}\samepage\label{algo:dual:pivot:partitioning}
\textbf{procedure} \lomutotwo($\textit{A}[1..\textit{n}]$)
\begin{algorithmic}[1]
\Require $n > 1$, \text{Pivots in $\textit{A}[1] \leq \textit{A}[\textit{n}]$}
\State $\texttt{p} \gets \textit{A}[1]$; $\texttt{q} \gets \textit{A}[n]$;
\State \textbf{integer} $\text{block}[0, \ldots, \textit{B} - 1]$
\State $\texttt{i}, \texttt{j}, \texttt{k} \gets 2$, $\texttt{num}_{< \texttt{p}}, \texttt{num}_{\leq \texttt{q}} \gets 0$
\While{$\texttt{k} < \textit{n}$}
\State $\texttt{t} \gets \text{min}(\textit{B}, \textit{n} - \texttt{k} )$;
\For{$\texttt{c} \gets 0; \texttt{c} < \texttt{t}; \texttt{c} \gets \texttt{c} + 1$}
\State $\text{block}[\texttt{num}_{\leq \texttt{q}}] \gets \texttt{c}$;
\State $\texttt{num}_{\leq \texttt{q}} \gets \texttt{num}_{\leq \texttt{q}} + (\texttt{q} \geq \textit{A}[\texttt{k} + \texttt{c}])$;
\EndFor
\For{$\texttt{c} \gets 0; \texttt{c} < \texttt{num}_{\leq \texttt{q}}; \texttt{c} \gets \texttt{c} + 1$}
\State \text{Swap $\textit{A}[\texttt{j} + \texttt{c}]$ and $\textit{A}[\texttt{k} + \text{block}[\texttt{c}]]$}
\EndFor
\State $\texttt{k} \gets \texttt{k} + \texttt{t}$;
\For{$\texttt{c} \gets 0; \texttt{c} < \texttt{num}_{\leq \texttt{q}}; \texttt{c} \gets \texttt{c} + 1$}
\State $\text{block}[\texttt{num}_{< \texttt{p}}] \gets \texttt{c}$;
\State $\texttt{num}_{< \texttt{p}} \gets \texttt{num}_{< \texttt{p}} + (\texttt{p} > \textit{A}[\texttt{j} + \texttt{c}])$;
\EndFor
\For{$\texttt{c} \gets 0; \texttt{c} < \texttt{num}_{< \texttt{p}}; \texttt{c} \gets \texttt{c} + 1$}
\State \text{Swap $\textit{A}[\texttt{i}]$ and $\textit{A}[\texttt{j} + \text{block}[\texttt{c}]]$}
\State $\texttt{i} \gets \texttt{i} + 1$
\EndFor
\State $\texttt{j} \gets \texttt{j} + \texttt{num}_{\leq \texttt{q}}$;
\State $\texttt{num}_{< \texttt{p}}, \texttt{num}_{\leq \texttt{q}} \gets 0$;
\EndWhile
\State Swap $\textit{A}[\texttt{i} - 1]$ and $\textit{A}[1]$;
\State Swap $\textit{A}[\texttt{j}]$ and $\textit{A}[\textit{n}]$;\\
\Return $(\texttt{i} - 1,\texttt{j})$;
\end{algorithmic}
\end{algorithm}
\begin{figure}[H]
\vspace*{-1em}
\centering
\scalebox{.92}{
\begin{tikzpicture}[xscale = 1.05, xshift=0.5cm]
\draw[fill=gruen!30] (-0.5, 0) rectangle (0, 0.5);
\draw[fill=hellgruen!30] (0,0) rectangle (1, 0.5);
\draw[fill=gruen!30] (1,0) rectangle (3, 0.5);
\draw[fill=dunkelgruen!30] (3, 0) rectangle (4, 0.5);
\draw[fill=gruen!30] (7.5,0) rectangle (8, 0.5);
\fill[fill=dunkelgruen!30] (4,0) rectangle (4.25, 0.5);
\fill[fill=gruen!30] (4.25,0.25) rectangle (4.75, 0.5);
\fill[fill=hellgruen!30] (4.25,0) rectangle (4.75, 0.25);
\fill[fill=hellgruen!30] (4.75,0) rectangle (7.5, 0.16);
\fill[fill=gruen!30] (4.75,0.16) rectangle (7.5, 0.33);
\fill[fill=dunkelgruen!30] (4.75,0.33) rectangle (7.5, 0.5);
% \fill[fill=hellgruen!30] (4.5,0.25) rectangle (7.5, 0.5);
% \fill[fill=dunkelgruen!30] (3.5,0) rectangle (3.75, 0.5);
% \fill[fill=hellgruen!30] (3.75,0) rectangle (4.0, 0.5);
% \fill[fill=hellgruen!30] (4,0) rectangle (4.25, 0.5);
% \fill[fill=dunkelgruen!30] (4.25,0) rectangle (4.5, 0.5);
\draw (0,0) rectangle (8, 0.5);
\draw (7.5, 0) -- (7.5, 0.5);
%\draw (1, 0) -- (1, 0.5);
%\draw (3.5, 0) -- (3.5, 0.5);
\draw[dotted, thick] (5.5, 0) -- (5.5, 0.5);
\node at (-0.25, 0.25) {$p$};
\node at (7.75, 0.25) {$q$};
\node at (0.5, 0.25) {$< p$};
\node at (2, 0.25) {$p \leq \dots \leq q$};
\node at (3.5, 0.25) {$> q$};
\node at (6.5, 0.25) {? $\cdots$ ?};
\node at (1.05, -0.25) {\small$\uparrow$};
\node at (1.05, -0.65) {$\texttt{i}$};
\node at (3.05, -0.25) {\small$\uparrow$};
\node at (3.05, -0.65) {$\texttt{j}$};
\node at (4.05, -0.25) {\small$\uparrow$};
\node at (4.05, -0.65) {$\texttt{k}$};
\node at (4.85, -0.25) {\small$\uparrow$};
\node at (4.85, -0.65) {$\texttt{k} + \texttt{c}$};
\draw [decorate, decoration={brace,amplitude=5pt},yshift=2pt](4,0.5)--(5.5,0.5) node [midway,yshift=15pt]{$B$};
%\draw [step = .25, blue, opacity=0.3] (0, -1.5) grid (\columnwidth, 0.5);
%block
\draw (6.25,-1) rectangle (8.25, -0.5);
\draw (6.25, -1) -- (6.25, -0.5);
\draw (6.75, -1) -- (6.75, -0.5);
\draw (7.25, -1) -- (7.25, -0.5);
\draw (7.75, -1) -- (7.75, -0.5);
\draw (8.25, -1) -- (8.25, -0.5);
\node at (6.5, -0.75) {1};
\node at (7, -0.75) {2};
\node at (7.5, -0.75) {3};
\node at (7.5, -1.25) {\small$\uparrow$};
\node at (7.5, -1.6) {$\texttt{num}_{\leq \texttt{q}}$};
\node at (6.25, -1.3) {$\texttt{block}$};
\begin{scope}[yshift = -3cm]
% main array
\draw[fill=gruen!30] (-0.5, 0) rectangle (0, 0.5);
\draw[fill=hellgruen!30] (0,0) rectangle (1, 0.5);
\draw[fill=gruen!30] (1,0) rectangle (3, 0.5);
\draw[fill=dunkelgruen!30] (3.75, 0) rectangle (5.5, 0.5);
\draw[fill=gruen!30] (7.5,0) rectangle (8, 0.5);
%\fill[fill=dunkelgruen!30] (4,0) rectangle (4.25, 0.5);
%\fill[fill=gruen!30] (4.25,0.25) rectangle (4.75, 0.5);
%\fill[fill=hellgruen!30] (4.25,0) rectangle (4.75, 0.25);
\fill[fill=hellgruen!30] (5.5,0) rectangle (7.5, 0.16);
\fill[fill=gruen!30] (5.5,0.16) rectangle (7.5, 0.33);
\fill[fill=dunkelgruen!30] (5.5,0.33) rectangle (7.5, 0.5);
\fill[fill=hellgruen!30] (3, 0) rectangle (3.25, 0.5);
\fill[fill=hellgruen!30] (3.25,0.0) rectangle (3.75, 0.25);
\fill[fill=gruen!30] (3.25,0.25) rectangle (3.75, 0.5);
% \fill[fill=hellgruen!30] (4.5,0.25) rectangle (7.5, 0.5);
% \fill[fill=dunkelgruen!30] (3.5,0) rectangle (3.75, 0.5);
% \fill[fill=hellgruen!30] (3.75,0) rectangle (4.0, 0.5);
% \fill[fill=hellgruen!30] (4,0) rectangle (4.25, 0.5);
% \fill[fill=dunkelgruen!30] (4.25,0) rectangle (4.5, 0.5);
\draw (0,0) rectangle (8, 0.5);
\draw (7.5, 0) -- (7.5, 0.5);
%\draw (1, 0) -- (1, 0.5);
%\draw (3.5, 0) -- (3.5, 0.5);
%\draw[dotted, thick] (5.5, 0) -- (5.5, 0.5);
\node at (-0.25, 0.25) {$p$};
\node at (7.75, 0.25) {$q$};
\node at (0.5, 0.25) {$< p$};
\node at (2, 0.25) {$p \leq \dots \leq q$};
\node at (4.65, 0.25) {$> q$};
\node at (6.5, 0.25) {? $\cdots$ ?};
\node at (1.05, -0.25) {\small$\uparrow$};
\node at (1.05, -0.65) {$\texttt{i}$};
\node at (3.05, -0.25) {\small$\uparrow$};
\node at (3.05, -0.65) {$\texttt{j}$};
\node at (5.55, -0.25) {\small$\uparrow$};
\node at (5.55, -0.65) {$\texttt{k}$};
\draw[->] (3.6, -0.75) -- (3.35, - 0.1);
\node at (3.6, -1) {$\texttt{j} + \texttt{c}$};
\draw [decorate, decoration={brace,amplitude=5pt},yshift=2pt](3,0.5)--(3.75,0.5) node [midway,yshift=15pt]{$\texttt{num}_{\leq \texttt{q}}$};
%\draw [step = .25, blue, opacity=0.3] (0, -1.5) grid (\columnwidth, 0.5);
%block
\draw (6.25,-1) rectangle (8.25, -0.5);
\draw (6.25, -1) -- (6.25, -0.5);
\draw (6.75, -1) -- (6.75, -0.5);
\draw (7.25, -1) -- (7.25, -0.5);
\draw (7.75, -1) -- (7.75, -0.5);
\draw (8.25, -1) -- (8.25, -0.5);
\node at (6.5, -0.75) {0};
\node at (7, -0.75) {1};
\node at (7.0, -1.25) {\small$\uparrow$};
\node at (7.0, -1.7) {$\texttt{num}_{< \texttt{p}}$};
\node at (6.25, -1.3) {$\texttt{block}$};
\end{scope}
\end{tikzpicture}}
%\includegraphics[width=\columnwidth]{fig-lomuto-block-2.png}
\vspace*{-1em}
\caption{Lomuto block partitioning with two pivots. Top: Algorithm~\ref{algo:dual:pivot:partitioning}
compares elements with $q$ and is on Line~8 with $\texttt{c} = 3$, cf.~Figure~\ref{fig:lomuto:block:invariant}. Bottom: Algorithm compares the $\texttt{num}_{\leq \texttt{q}}$ elements at most as large as $q$ to $p$. Algorithm is on Line~14 with $\texttt{c} = 1$.}
--- Reference: Simple and Fast BlockQuickSort using Lomuto's Partitioning Scheme\cite{BlockQuickSortLomuto}
\label{fig:lomuto:two:block:invariant}
\end{figure}
We can observe that there exists one more pass for 2-Pivot Block Lomuto's partition method than the 1-Pivot one. The first pass compares with the larger pivot $q$ and the second pass compares with the smaller pivot $p$.
The intermediate state between the 2 passes shows a mixture of elements that are less than $p$ and at most as large as $q$ swapped to the exchange area in the first pass. The second pass will then move the elements that are less than $p$ to the left side of the array.
\subsection{Multi-Pivot QuickSort}
\begin{algorithm}[H]
\caption{Dual-Pivot QuickSort}\label{DualPivotQuickSort}
\begin{algorithmic}[1]
% \Procedure{DualPivotQuickSort}{$A, l, r$}
% \If{$l < r$}
% \State $p1, p2 \gets \textsc{DualPivotPartition}(A, l, r)$
% \State \textsc{DualPivotQuickSort}(A, l, p1-1)
% \State \textsc{DualPivotQuickSort}(A, p1+1, p2-1)
% \State \textsc{DualPivotQuickSort}(A, p2+1, r)
% \EndIf
% \EndProcedure
\Procedure{DualPivotPartition}{$A, l, r$}
\State $P1, P2 \gets A[l], A[r]$ \Comment{Pivots, and DO make sure P1 < P2}
% \If{$P1 > P2$}
% \State \textsc{Swap}($P1, P2$)
% \EndIf
\State $less \gets l + 1$
\State $greater \gets r - 1$
\State $k \gets l$
\While{$k \leq greater$}
\If{$A[k] < P1$} \Comment{If the current element is less than P1}
\State \textsc{Swap}($A[k], A[less]$) \Comment{Shift the element to the leftmost partition}
\State $less \gets less + 1$
\ElsIf{$A[k] > P2$} \Comment{If the current element is greater than P2}
\While{$A[greater] > P2$ \textbf{and} $k < greater$}
\State $greater \gets greater - 1$ \Comment{Find the first element less than P2}
\EndWhile
\State \textsc{swap}(A[k], A[greater]) \Comment{Swap to the rightmost partition}
\State $greater \gets greater - 1$
\If{$A[k] < P1$} \Comment{Double check if another swap is needed}
\State \textsc{Swap}($A[k], A[less]$)
\State $less \gets less + 1$
\EndIf
\EndIf
\State $k \gets k + 1$ \Comment{Move to the next element}
\EndWhile
\State $less \gets less - 1$
\State $greater \gets greater + 1$
\State \textsc{Swap}($A[l], A[less]$) \Comment{Put Pivot1 in the middle}
\State \textsc{Swap}($A[r], A[greater]$) \Comment{Put Pivot2 in the middle}
% \algstore{DualPivotPartition}
% \end{algorithmic}
% \end{algorithm}
% \begin{algorithm}
% \caption{Dual-Pivot QuickSort (Continued)}
% \begin{algorithmic}
% \algrestore{DualPivotPartition}
\State \textbf{return} $less, greater$
\EndProcedure
\end{algorithmic}
\end{algorithm}
With one more pivot added, the two-pivot QuickSort method is introduced by Vladimir Yaroslavskiy in 2009. It uses 2 pivots selected from each end to split the array into 3 parts.
K stands for the current element being scanned, it will be swapped to the left part, which is accumulated by the less pointer, if it is less than the first pivot or do nothing if it is in the middle part.
Otherwise, if it is greater than the second pivot, it will be swapped to the right part, which is accumulated by the greater pointer.
We will double check the current element at index k after the swap to see if it is less than the first pivot, if so, we will swap it to the left part and move the less pointer to the right.
Hereby, we can see that the Dual-Pivot QuickSort is a generalization of the original QuickSort, and the partitioning method is also a generalization of the original Hoare's partition method but with 2 pivots.
\subsubsection{Theoretical Analysis vs Real World Performance}
What's attracting is that there was once a debate on whether multiple pivots could enhance the performance, as Sedgewick,
who analyzed a dual-pivot approach with an average of $\frac{32}{15}n \ln n + \bigO{n}$ comparisons in 1978, in contrast to the $2n\ln n + \bigO{n}$ of the standard QuickSort, considered this prototype to be inferior to the classical QuickSort.
However, Yaroslavskiy's Dual-Pivot QuickSort has proved a success, with the experimental results showing that it performs a smaller amortized number of comparisons, $1.9n\ln n + \bigO{n}$, but more swaps,
approximately $0.6n\ln n + \bigO{n}$, which is much greater than the $0.33n\ln n + \bigO{n}$ of the original one, and is faster than the original QuickSort in most cases.
It is surprising that even with asymptotically only 5\% fewer comparisons and nearly 80\% more swaps, the Dual-Pivot QuickSort still outperforms the original QuickSort in practice.
What makes this more mysterious is the variation of Yaroslavskiy's partition method that always compares to the larger pivot first\hypertarget{CmpToLargerPivot}{}, proposed by Martin Aum\"uller and Martin Dietzfelbinger in their 2013 paper ``Optimal Partitioning for Dual-Pivot Quicksort'' \cite{OptimalPartitioningForDualPivotQuickSort}.
This variant shows no difference from the original Dual-Pivot QuickSort in terms of either comparisons or swaps, yet it turned out to be as fast as Yaroslavskiy's partition, and even faster than another method presented alongside it, named ``Counting Strategy C'', which was claimed to achieve the lowest
estimates of $1.8n \ln n + \bigO{n}$ comparisons and $1.6n \ln n + \bigO{n}$ swaps. This is a great example of how real-world performance can differ from theoretical analysis. In fact, in their subsequent tests, the ``Counting Strategy C'' method proved to be the slowest among all 3 methods.
It is noteworthy that there still lies a huge gap between theoretical analysis and real-world performance. It is time we broadened our horizons and looked for more factors that could contribute to the change in the performance of the QuickSort algorithm.
Cache misses, branch mis-predictions, and the number of executed instructions are the things we should take into consideration when analyzing mutated versions of the QuickSort algorithm.
Apparently, the number of pivots plays a significant role, and is one of the factors that could potentially bring a significant change.
SSSSort, another extreme example using 127 pivots by default, is an attempt to exploit the full advantages of multiple pivots with the help of branchless comparisons. While it demonstrated impressive performance on arrays of random permutations for certain ranges of sizes,
it is not as resilient as the classical Multi-Pivot QuickSorts on arrays with many duplicates, which are a common occurrence in real-world scenarios, as shown by the tests comparing it with Block QuickSort\cite{BlockQuickSort}. However, it does outperform all the other variations when it comes to complicated
data structures like vectors and records\hyperlink{ref:record}{[→]}. Although these aspects are not the primary focus of this thesis, it is still worth mentioning that Block QuickSort is a more general and versatile method that can be applied to a wider range of scenarios. At this moment, we are convinced that more pivots tend to bring better performance in general --- but how many?
Let's dive deeper into the world of 3-Pivot QuickSort to find out.
\begin{algorithm}[H]
\caption{3-Pivot QuickSort}\label{3PivotQuickSort}
\begin{algorithmic}[1]
\Procedure{ThreePivotPartition}{$A, l, r$}
\State $P1, P2, P3 \gets A[l], A[l+1], A[r]$ \Comment{And sort 3 Pivots in ascending order}
\State $i, j \gets l + 2$
\State $k, l \gets r - 1$
\While{$j \leq k$}
% // j moves right until arr[j] >= p2
% while arr[j].cmp(&p2) == Ordering::Less {
% // arr[<i] -> elements that are less than p1, arr[i] is not less than p1
% if arr[j].cmp(&p1) == Ordering::Less {
% arr.swap_unchecked(i, j);
% i += 1;
% }
% j += 1;
% }
\While{$A[j] < P2$} \Comment{Put left side elements less than P2 in order}
\If{$A[j] < P1$}
\State \textsc{Swap}($A[i], A[j]$)
\State $i \gets i + 1$
\EndIf
\State $j \gets j + 1$
\EndWhile
% // k moves left until arr[k] <= p2
% while arr[k].cmp(&p2) == Ordering::Greater {
% // arr[>l] -> elements that are greater than p3, arr[l] is not greater than p3
% if arr[k].cmp(&p3) == Ordering::Greater {
% arr.swap_unchecked(k, l);
% l -= 1;
% }
% k -= 1;
% }
\While{$A[k] > P2$} \Comment{Put right side elements greater than P2 in order}
\If{$A[k] > P3$}
\State \textsc{Swap}($A[k], A[l]$)
\State $l \gets l - 1$
\EndIf
\State $k \gets k - 1$
\EndWhile
\If{$j \leq k$}
\If{$A[j] > P3$} \Comment{Deal with 2 branches when A[j] is greater than P3}
\If{$A[k] < P1$}
\State \textsc{Rotate3}($A, [j, i, k]$)
\State $i \gets i + 1$
\Else
\algstore{3PivotPartition}
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{3-Pivot QuickSort (Continued)}
\begin{algorithmic}
\algrestore{3PivotPartition}
% // if j is still less than k
% if j <= k {
% if arr[j].cmp(&p3) == Ordering::Greater {
% if arr[k].cmp(&p1) == Ordering::Less {
% // if arr[j] > p3 and arr[k] < p1,
% // rotate arr[j] to k and arr[k] to i because arr[<i] < p1