diff --git a/library/core/src/slice/sort/select.rs b/library/core/src/slice/sort/select.rs
index f6529f23bcb3f..3358c03d30a9b 100644
--- a/library/core/src/slice/sort/select.rs
+++ b/library/core/src/slice/sort/select.rs
@@ -7,6 +7,7 @@
 //! better performance than one would get using heapsort as fallback.
 
 use crate::mem::{self, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::pivot::choose_pivot;
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
 use crate::slice::sort::unstable::quicksort::partition;
@@ -40,7 +41,13 @@ where
         let min_idx = min_index(v, &mut is_less).unwrap();
         v.swap(min_idx, index);
     } else {
-        partition_at_index_loop(v, index, None, &mut is_less);
+        cfg_if! {
+            if #[cfg(feature = "optimize_for_size")] {
+                median_of_medians(v, &mut is_less, index);
+            } else {
+                partition_at_index_loop(v, index, None, &mut is_less);
+            }
+        }
     }
 
     let (left, right) = v.split_at_mut(index);
@@ -53,6 +60,7 @@
 // most once, it doesn't make sense to use something more sophisticated than insertion-sort.
 const INSERTION_SORT_THRESHOLD: usize = 16;
 
+#[cfg(not(feature = "optimize_for_size"))]
 fn partition_at_index_loop<'a, T, F>(
     mut v: &'a mut [T],
     mut index: usize,
@@ -169,6 +177,7 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut F, mut k: usize) {
 
         if v.len() >= 2 {
             insertion_sort_shift_left(v, 1, is_less);
         }
+
         return;
     }
diff --git a/library/core/src/slice/sort/shared/mod.rs b/library/core/src/slice/sort/shared/mod.rs
index ad1171bfc6a0a..e0f8d475a2e30 100644
--- a/library/core/src/slice/sort/shared/mod.rs
+++ b/library/core/src/slice/sort/shared/mod.rs
@@ -1,3 +1,5 @@
+#![cfg_attr(feature = "optimize_for_size", allow(dead_code))]
+
 use crate::marker::Freeze;
 
 pub(crate) mod pivot;
diff --git a/library/core/src/slice/sort/shared/smallsort.rs b/library/core/src/slice/sort/shared/smallsort.rs
index db0c5c72822c0..aaa9f9f1c7fd1 100644
--- a/library/core/src/slice/sort/shared/smallsort.rs
+++ b/library/core/src/slice/sort/shared/smallsort.rs
@@ -378,7 +378,12 @@ where
 
 /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
 /// value at position `b_pos` is less than the one at position `a_pos`.
-pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
+///
+/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integer-like
+/// types. `is_less` could be a huge function and we want to give the compiler an option to
+/// not inline this function. For the same reasons that this function is very perf critical
+/// it should be in the same module as the functions that use it.
+unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
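Context for the select.rs hunk: under `optimize_for_size`, `select_nth_unstable` is routed to `median_of_medians`, which must already be compiled as the worst-case fallback, instead of also carrying the specialized `partition_at_index_loop` fast path. Below is a minimal, safe sketch of the underlying idea (selection by repeated partitioning), not the library's code: `select_nth` and `lomuto_partition` are hypothetical stand-ins, the pivot choice here is naive, and the real routine chooses pivots far more carefully to guarantee O(n).

```rust
/// Place the element that belongs at sorted position `k` there, with smaller
/// elements to its left and greater-or-equal elements to its right.
fn select_nth<T: Ord>(mut v: &mut [T], mut k: usize) {
    assert!(k < v.len());

    loop {
        if v.len() <= 1 {
            return;
        }

        // Naive last-element pivot; worst case O(n^2), unlike the real code.
        let p = lomuto_partition(v);
        if k == p {
            return;
        }

        // Recurse (iteratively) into the single side that still contains `k`.
        let (left, right) = v.split_at_mut(p);
        if k < p {
            v = left;
        } else {
            // Skip the pivot, which now sits at its final position.
            v = &mut right[1..];
            k -= p + 1;
        }
    }
}

/// Classic Lomuto partition around the last element; returns the pivot's
/// final index.
fn lomuto_partition<T: Ord>(v: &mut [T]) -> usize {
    let last = v.len() - 1;
    let mut store = 0;
    for i in 0..last {
        if v[i] < v[last] {
            v.swap(store, i);
            store += 1;
        }
    }
    v.swap(store, last);
    store
}

fn main() {
    let mut data = [9, 4, 7, 1, 3];
    select_nth(&mut data, 2);
    assert_eq!(data[2], 4); // the median is now in place
}
```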
+#[cfg(not(feature = "optimize_for_size"))] +use crate::cmp; +use crate::intrinsics; use crate::mem::{self, MaybeUninit, SizedTypeProperties}; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::{ insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN, }; -use crate::{cmp, intrinsics}; -pub(crate) mod drift; pub(crate) mod merge; + +#[cfg(not(feature = "optimize_for_size"))] +pub(crate) mod drift; +#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod quicksort; +#[cfg(feature = "optimize_for_size")] +pub(crate) mod tiny; + /// Stable sort called driftsort by Orson Peters and Lukas Bergdoll. /// Design document: /// @@ -30,25 +39,53 @@ pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less return; } - // More advanced sorting methods than insertion sort are faster if called in - // a hot loop for small inputs, but for general-purpose code the small - // binary size of insertion sort is more important. The instruction cache in - // modern processors is very valuable, and for a single sort call in general - // purpose code any gains from an advanced method are cancelled by i-cache - // misses during the sort, and thrashing the i-cache for surrounding code. - const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; - if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { - insertion_sort_shift_left(v, 1, is_less); - return; - } + cfg_if! { + if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] { + let alloc_len = len / 2; + + cfg_if! { + if #[cfg(target_pointer_width = "16")] { + let heap_buf = BufT::with_capacity(alloc_len); + let scratch = heap_buf.as_uninit_slice_mut(); + } else { + // For small inputs 4KiB of stack storage suffices, which allows us to avoid + // calling the (de-)allocator. Benchmarks showed this was quite beneficial. + let mut stack_buf = AlignedStorage::::new(); + let stack_scratch = stack_buf.as_uninit_slice_mut(); + let mut heap_buf; + let scratch = if stack_scratch.len() >= alloc_len { + stack_scratch + } else { + heap_buf = BufT::with_capacity(alloc_len); + heap_buf.as_uninit_slice_mut() + }; + } + } - driftsort_main::(v, is_less); + tiny::mergesort(v, scratch, is_less); + } else { + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. + const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } + + driftsort_main::(v, is_less); + } + } } /// See [`sort`] /// /// Deliberately don't inline the main sorting routine entrypoint to ensure the /// inlined insertion sort i-cache footprint remains minimal. +#[cfg(not(feature = "optimize_for_size"))] #[inline(never)] fn driftsort_main bool, BufT: BufGuard>(v: &mut [T], is_less: &mut F) { // By allocating n elements of memory we can ensure the entire input can diff --git a/library/core/src/slice/sort/stable/tiny.rs b/library/core/src/slice/sort/stable/tiny.rs new file mode 100644 index 0000000000000..071ab8e107fe3 --- /dev/null +++ b/library/core/src/slice/sort/stable/tiny.rs @@ -0,0 +1,41 @@ +//! 
diff --git a/library/core/src/slice/sort/stable/tiny.rs b/library/core/src/slice/sort/stable/tiny.rs
new file mode 100644
index 0000000000000..071ab8e107fe3
--- /dev/null
+++ b/library/core/src/slice/sort/stable/tiny.rs
@@ -0,0 +1,41 @@
+//! Binary-size optimized mergesort inspired by https://github.com/voultapher/tiny-sort-rs.
+
+use crate::mem::MaybeUninit;
+use crate::ptr;
+use crate::slice::sort::stable::merge;
+
+/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever,
+/// no run detection, etc.
+#[inline(always)]
+pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    scratch: &mut [MaybeUninit<T>],
+    is_less: &mut F,
+) {
+    let len = v.len();
+
+    if len > 2 {
+        let mid = len / 2;
+
+        // SAFETY: mid is in-bounds.
+        unsafe {
+            // Sort the left half recursively.
+            mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
+            // Sort the right half recursively.
+            mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
+        }
+
+        merge::merge(v, scratch, mid, is_less);
+    } else if len == 2 {
+        // SAFETY: We checked the len, the pointers we create are valid and don't overlap.
+        unsafe {
+            let v_base = v.as_mut_ptr();
+            let v_a = v_base;
+            let v_b = v_base.add(1);
+
+            if is_less(&*v_b, &*v_a) {
+                ptr::swap_nonoverlapping(v_a, v_b, 1);
+            }
+        }
+    }
+}
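Why `len / 2` scratch elements are enough for `tiny::mergesort`: the merge step only needs to buffer the left run, which is at most `mid = len / 2` elements, while the right run can be merged in place. A safe illustration of that property, restricted to `Copy` types; `merge_with_half_scratch` is a hypothetical name, not the library's `merge::merge`, which also handles non-`Copy` types and panic safety via raw pointer copies.

```rust
use std::mem::MaybeUninit;

/// Merge the sorted runs `v[..mid]` and `v[mid..]` using only `mid` scratch
/// elements: copy the left run out, then merge back into `v` front to back.
fn merge_with_half_scratch<T: Copy + Ord>(v: &mut [T], scratch: &mut [MaybeUninit<T>], mid: usize) {
    assert!(mid <= v.len() && mid <= scratch.len());

    // Buffer the left run.
    for (dst, src) in scratch[..mid].iter_mut().zip(&v[..mid]) {
        dst.write(*src);
    }

    // The write cursor `out` can never overtake the read cursor `j`, because
    // `out` counts consumed elements from both runs: out = i + (j - mid) <= j.
    let (mut i, mut j, mut out) = (0, mid, 0);
    while i < mid && j < v.len() {
        // SAFETY: scratch[..mid] was fully initialized above.
        let left = unsafe { scratch[i].assume_init() };
        if v[j] < left {
            v[out] = v[j];
            j += 1;
        } else {
            v[out] = left; // ties take the left element, keeping the merge stable
            i += 1;
        }
        out += 1;
    }
    // Drain leftover left-run elements; leftover right-run elements are
    // already in their final positions.
    while i < mid {
        // SAFETY: as above.
        v[out] = unsafe { scratch[i].assume_init() };
        i += 1;
        out += 1;
    }
}

fn main() {
    let mut v = [2, 5, 8, 1, 3, 9]; // two sorted runs: [2, 5, 8] and [1, 3, 9]
    let mut scratch = [MaybeUninit::<i32>::uninit(); 3];
    merge_with_half_scratch(&mut v, &mut scratch, 3);
    assert_eq!(v, [1, 2, 3, 5, 8, 9]);
}
```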
diff --git a/library/core/src/slice/sort/unstable/heapsort.rs b/library/core/src/slice/sort/unstable/heapsort.rs
index 27e2ad588ea09..85231779d031f 100644
--- a/library/core/src/slice/sort/unstable/heapsort.rs
+++ b/library/core/src/slice/sort/unstable/heapsort.rs
@@ -1,46 +1,46 @@
 //! This module contains a branchless heapsort as fallback for unstable quicksort.
 
-use crate::{intrinsics, ptr};
+use crate::{cmp, intrinsics, ptr};
 
 /// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
 ///
 /// Never inline this, it sits the main hot-loop in `recurse` and is meant as unlikely algorithmic
 /// fallback.
-///
-/// SAFETY: The caller has to guarantee that `v.len()` >= 2.
 #[inline(never)]
-pub(crate) unsafe fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
+pub(crate) fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
-    // SAFETY: See function safety.
-    unsafe {
-        intrinsics::assume(v.len() >= 2);
-
-        // Build the heap in linear time.
-        for i in (0..v.len() / 2).rev() {
-            sift_down(v, i, is_less);
-        }
+    let len = v.len();
 
-        // Pop maximal elements from the heap.
-        for i in (1..v.len()).rev() {
+    for i in (0..len + len / 2).rev() {
+        let sift_idx = if i >= len {
+            i - len
+        } else {
             v.swap(0, i);
-            sift_down(&mut v[..i], 0, is_less);
+            0
+        };
+
+        // SAFETY: The above calculation ensures that `sift_idx` is either 0 or
+        // `(len..(len + (len / 2))) - len`, which simplifies to `0..(len / 2)`.
+        // This guarantees the required `sift_idx <= len`.
+        unsafe {
+            sift_down(&mut v[..cmp::min(i, len)], sift_idx, is_less);
         }
     }
 }
 
 // This binary heap respects the invariant `parent >= child`.
 //
-// SAFETY: The caller has to guarantee that node < `v.len()`.
-#[inline(never)]
+// SAFETY: The caller has to guarantee that `node <= v.len()`.
+#[inline(always)]
 unsafe fn sift_down<T, F>(v: &mut [T], mut node: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
     // SAFETY: See function safety.
     unsafe {
-        intrinsics::assume(node < v.len());
+        intrinsics::assume(node <= v.len());
     }
 
     let len = v.len();
@@ -69,9 +69,7 @@ where
                 break;
             }
 
-            // Swap `node` with the greater child, move one step down, and continue sifting. This
-            // could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size.
-            ptr::swap(v_base.add(node), v_base.add(child));
+            ptr::swap_nonoverlapping(v_base.add(node), v_base.add(child), 1);
         }
 
         node = child;
diff --git a/library/core/src/slice/sort/unstable/mod.rs b/library/core/src/slice/sort/unstable/mod.rs
index 932e01f4401e5..8bbd85443d478 100644
--- a/library/core/src/slice/sort/unstable/mod.rs
+++ b/library/core/src/slice/sort/unstable/mod.rs
@@ -2,7 +2,9 @@
 
 use crate::intrinsics;
 use crate::mem::SizedTypeProperties;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::find_existing_run;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
 
 pub(crate) mod heapsort;
@@ -28,25 +30,32 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
         return;
     }
 
-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
-    }
+    cfg_if! {
+        if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
+            heapsort::heapsort(v, is_less);
+        } else {
+            // More advanced sorting methods than insertion sort are faster if called in
+            // a hot loop for small inputs, but for general-purpose code the small
+            // binary size of insertion sort is more important. The instruction cache in
+            // modern processors is very valuable, and for a single sort call in general
+            // purpose code any gains from an advanced method are cancelled by i-cache
+            // misses during the sort, and thrashing the i-cache for surrounding code.
+            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+                insertion_sort_shift_left(v, 1, is_less);
+                return;
+            }
 
-    ipnsort(v, is_less);
+            ipnsort(v, is_less);
+        }
+    }
 }
 
 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
 where
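The rewritten `heapsort` above fuses heap construction and element pop-off into one reversed loop, so `sift_down` is instantiated at a single call site; while `i >= len` the counter drives build-phase sift-downs of the internal nodes, and once it drops below `len` it drives the pop phase. A safe standalone version of the same fused-indexing trick (hypothetical name, `Ord` instead of an `is_less` closure, and no `unsafe`):

```rust
fn heapsort_fused<T: Ord>(v: &mut [T]) {
    let len = v.len();

    for i in (0..len + len / 2).rev() {
        let sift_idx = if i >= len {
            // Build phase: sift down internal nodes len/2 - 1 ..= 0.
            i - len
        } else {
            // Pop phase: move the current max to the end, then re-heapify
            // the prefix. The i == 0 iteration is a harmless no-op.
            v.swap(0, i);
            0
        };
        sift_down(&mut v[..i.min(len)], sift_idx);
    }
}

fn sift_down<T: Ord>(v: &mut [T], mut node: usize) {
    loop {
        let mut child = 2 * node + 1;
        if child >= v.len() {
            break;
        }
        // Pick the greater of the two children.
        if child + 1 < v.len() && v[child] < v[child + 1] {
            child += 1;
        }
        // Heap property `parent >= child` already holds: done.
        if v[node] >= v[child] {
            break;
        }
        v.swap(node, child);
        node = child;
    }
}

fn main() {
    let mut data = [3, 1, 4, 1, 5, 9, 2, 6];
    heapsort_fused(&mut data);
    assert_eq!(data, [1, 1, 2, 3, 4, 5, 6, 9]);
}
```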
+#[cfg(not(feature = "optimize_for_size"))] pub(crate) fn quicksort<'a, T, F>( mut v: &'a mut [T], mut ancestor_pivot: Option<&'a T>, @@ -28,10 +33,7 @@ pub(crate) fn quicksort<'a, T, F>( // If too many bad pivot choices were made, simply fall back to heapsort in order to // guarantee `O(N x log(N))` worst-case. if limit == 0 { - // SAFETY: We assume the `small_sort` threshold is at least 1. - unsafe { - crate::slice::sort::unstable::heapsort::heapsort(v, is_less); - } + heapsort::heapsort(v, is_less); return; } @@ -98,13 +100,15 @@ where return 0; } - // Allows for panic-free code-gen by proving this property to the compiler. if pivot >= len { intrinsics::abort(); } - // Place the pivot at the beginning of slice. - v.swap(0, pivot); + // SAFETY: We checked that `pivot` is in-bounds. + unsafe { + // Place the pivot at the beginning of slice. + v.swap_unchecked(0, pivot); + } let (pivot, v_without_pivot) = v.split_at_mut(1); // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function @@ -118,8 +122,15 @@ where // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn. let num_lt = (const { inst_partition::() })(v_without_pivot, pivot, is_less); - // Place the pivot between the two partitions. - v.swap(0, num_lt); + if num_lt >= len { + intrinsics::abort(); + } + + // SAFETY: We checked that `num_lt` is in-bounds. + unsafe { + // Place the pivot between the two partitions. + v.swap_unchecked(0, num_lt); + } num_lt } @@ -129,7 +140,13 @@ const fn inst_partition bool>() -> fn(&mut [T], &T, &mut if mem::size_of::() <= MAX_BRANCHLESS_PARTITION_SIZE { // Specialize for types that are relatively cheap to copy, where branchless optimizations // have large leverage e.g. `u64` and `String`. - partition_lomuto_branchless_cyclic:: + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + partition_lomuto_branchless_simple:: + } else { + partition_lomuto_branchless_cyclic:: + } + } } else { partition_hoare_branchy_cyclic:: } @@ -215,6 +232,7 @@ where } } +#[cfg(not(feature = "optimize_for_size"))] struct PartitionState { // The current element that is being looked at, scans left to right through slice. right: *mut T, @@ -225,6 +243,7 @@ struct PartitionState { gap: GapGuardRaw, } +#[cfg(not(feature = "optimize_for_size"))] fn partition_lomuto_branchless_cyclic(v: &mut [T], pivot: &T, is_less: &mut F) -> usize where F: FnMut(&T, &T) -> bool, @@ -316,6 +335,27 @@ where } } +#[cfg(feature = "optimize_for_size")] +fn partition_lomuto_branchless_simple bool>( + v: &mut [T], + pivot: &T, + is_less: &mut F, +) -> usize { + let mut left = 0; + + for right in 0..v.len() { + // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that + // left <= right and that both are in-bounds. + unsafe { + let right_is_lt = is_less(v.get_unchecked(right), pivot); + v.swap_unchecked(left, right); + left += right_is_lt as usize; + } + } + + left +} + struct GapGuard { pos: *mut T, value: ManuallyDrop, @@ -333,11 +373,13 @@ impl Drop for GapGuard { /// Ideally this wouldn't be needed and we could just use the regular GapGuard. /// See comment in [`partition_lomuto_branchless_cyclic`]. +#[cfg(not(feature = "optimize_for_size"))] struct GapGuardRaw { pos: *mut T, value: *mut T, } +#[cfg(not(feature = "optimize_for_size"))] impl Drop for GapGuardRaw { fn drop(&mut self) { // SAFETY: `self` MUST be constructed in a way that makes copying the gap value into pFad - Phonifier reborn
