@@ -521,8 +521,11 @@ template <typename vtype, int maxN>
521521void sort_n (typename vtype::type_t *arr, int N);
522522
523523template <typename vtype, typename comparator, typename type_t >
524- static void
525- qsort_ (type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
524+ static void qsort_ (type_t *arr,
525+ arrsize_t left,
526+ arrsize_t right,
527+ arrsize_t max_iters,
528+ arrsize_t task_threshold)
526529{
527530 /*
528531 * Resort to std::sort if quicksort isnt making any progress
@@ -559,10 +562,40 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
559562 type_t leftmostValue = comparator::leftmost (smallest, biggest);
560563 type_t rightmostValue = comparator::rightmost (smallest, biggest);
561564
565+ #ifdef XSS_COMPILE_OPENMP
566+ if (pivot != leftmostValue) {
567+ bool parallel_left = (pivot_index - left) > task_threshold;
568+ if (parallel_left) {
569+ #pragma omp task
570+ qsort_<vtype, comparator>(
571+ arr, left, pivot_index - 1 , max_iters - 1 , task_threshold);
572+ }
573+ else {
574+ qsort_<vtype, comparator>(
575+ arr, left, pivot_index - 1 , max_iters - 1 , task_threshold);
576+ }
577+ }
578+ if (pivot != rightmostValue) {
579+ bool parallel_right = (right - pivot_index) > task_threshold;
580+
581+ if (parallel_right) {
582+ #pragma omp task
583+ qsort_<vtype, comparator>(
584+ arr, pivot_index, right, max_iters - 1 , task_threshold);
585+ }
586+ else {
587+ qsort_<vtype, comparator>(
588+ arr, pivot_index, right, max_iters - 1 , task_threshold);
589+ }
590+ }
591+ #else
592+ UNUSED (task_threshold);
593+
562594 if (pivot != leftmostValue)
563- qsort_<vtype, comparator>(arr, left, pivot_index - 1 , max_iters - 1 );
595+ qsort_<vtype, comparator>(arr, left, pivot_index - 1 , max_iters - 1 , 0 );
564596 if (pivot != rightmostValue)
565- qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1 );
597+ qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1 , 0 );
598+ #endif
566599}
567600
568601template <typename vtype, typename comparator, typename type_t >
@@ -627,8 +660,41 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan)
627660 }
628661
629662 UNUSED (hasnan);
663+
664+ #ifdef XSS_COMPILE_OPENMP
665+
666+ bool use_parallel = arrsize > 100000 ;
667+
668+ if (use_parallel) {
669+ // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
670+ constexpr int thread_limit = 8 ;
671+ int thread_count = std::min (thread_limit, omp_get_max_threads ());
672+ arrsize_t task_threshold
673+ = std::max ((arrsize_t )100000 , arrsize / 100 );
674+
675+ // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_
676+ // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems
677+ // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
678+ #pragma omp parallel num_threads(thread_count)
679+ #pragma omp single
680+ qsort_<vtype, comparator, T>(arr,
681+ 0 ,
682+ arrsize - 1 ,
683+ 2 * (arrsize_t )log2 (arrsize),
684+ task_threshold);
685+ }
686+ else {
687+ qsort_<vtype, comparator, T>(arr,
688+ 0 ,
689+ arrsize - 1 ,
690+ 2 * (arrsize_t )log2 (arrsize),
691+ std::numeric_limits<arrsize_t >::max ());
692+ }
693+ #pragma omp taskwait
694+ #else
630695 qsort_<vtype, comparator, T>(
631- arr, 0 , arrsize - 1 , 2 * (arrsize_t )log2 (arrsize));
696+ arr, 0 , arrsize - 1 , 2 * (arrsize_t )log2 (arrsize), 0 );
697+ #endif
632698
633699 replace_inf_with_nan (arr, arrsize, nan_count, descending);
634700 }
0 commit comments