This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

[WIP] Allow cub::DeviceRadixSort and cub::DeviceSegmentedRadixSort to use iterator as input #374

Draft
wants to merge 8 commits into main

Conversation

@zasdfgbnm zasdfgbnm commented Sep 13, 2021

Comment on lines 1377 to 1515
KeyInIterT d_keys_in_ = d_keys_out;
ValueInIterT d_values_in_ = d_values_out;
onesweep_kernel<<<num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream>>>
(d_lookback, d_ctrs + part * num_passes + pass,
part < num_parts - 1 ?
d_bins + ((part + 1) * num_passes + pass) * RADIX_DIGITS : NULL,
d_bins + (part * num_passes + pass) * RADIX_DIGITS,
d_keys_out_,
d_keys_in_ + part * PART_SIZE,
d_values_out_,
d_values_in_ + part * PART_SIZE,
part_num_items, current_bit, num_bits);
break;
}
}
} else {
using KeyOutIterT = KeyIteratorT;
using ValueOutIterT = ValueIteratorT;
KeyOutIterT d_keys_out_ = d_keys_out;
ValueOutIterT d_values_out_ = d_values_out;
switch (input_mode) {
case INPUT: {
using KeyInIterT = KeyInputIteratorT;
using ValueInIterT = ValueInputIteratorT;
auto onesweep_kernel = DeviceRadixSortOnesweepKernel<
MaxPolicyT, IS_DESCENDING, KeyInIterT, KeyOutIterT,
ValueInIterT, ValueOutIterT, OffsetT>;
KeyInIterT d_keys_in_ = d_keys_in;
ValueInIterT d_values_in_ = d_values_in;
onesweep_kernel<<<num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream>>>
(d_lookback, d_ctrs + part * num_passes + pass,
part < num_parts - 1 ?
d_bins + ((part + 1) * num_passes + pass) * RADIX_DIGITS : NULL,
d_bins + (part * num_passes + pass) * RADIX_DIGITS,
d_keys_out_,
d_keys_in_ + part * PART_SIZE,
d_values_out_,
d_values_in_ + part * PART_SIZE,
part_num_items, current_bit, num_bits);
break;
}
case TMP_STORAGE: {
using KeyInIterT = KeyT *;
using ValueInIterT = ValueT *;
auto onesweep_kernel = DeviceRadixSortOnesweepKernel<
MaxPolicyT, IS_DESCENDING, KeyInIterT, KeyOutIterT,
ValueInIterT, ValueOutIterT, OffsetT>;
KeyInIterT d_keys_in_ = d_keys_tmp;
ValueInIterT d_values_in_ = d_values_tmp;
onesweep_kernel<<<num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream>>>
(d_lookback, d_ctrs + part * num_passes + pass,
part < num_parts - 1 ?
d_bins + ((part + 1) * num_passes + pass) * RADIX_DIGITS : NULL,
d_bins + (part * num_passes + pass) * RADIX_DIGITS,
d_keys_out_,
d_keys_in_ + part * PART_SIZE,
d_values_out_,
d_values_in_ + part * PART_SIZE,
part_num_items, current_bit, num_bits);
break;
}
case OUTPUT: {
using KeyInIterT = KeyIteratorT;
using ValueInIterT = ValueIteratorT;
auto onesweep_kernel = DeviceRadixSortOnesweepKernel<
MaxPolicyT, IS_DESCENDING, KeyInIterT, KeyOutIterT,
ValueInIterT, ValueOutIterT, OffsetT>;
KeyInIterT d_keys_in_ = d_keys_out;
ValueInIterT d_values_in_ = d_values_out;
onesweep_kernel<<<num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream>>>
(d_lookback, d_ctrs + part * num_passes + pass,
part < num_parts - 1 ?
d_bins + ((part + 1) * num_passes + pass) * RADIX_DIGITS : NULL,
d_bins + (part * num_passes + pass) * RADIX_DIGITS,
d_keys_out_,
d_keys_in_ + part * PART_SIZE,
d_values_out_,
d_values_in_ + part * PART_SIZE,
part_num_items, current_bit, num_bits);
break;
}
}
}
if (CubDebug(error = cudaPeekAtLastError())) break;
}

// use the temporary buffers if no overwrite is allowed
if (!is_overwrite_okay && pass == 0)
{
d_keys = num_passes % 2 == 0 ?
DoubleBuffer<KeyT>(d_keys_tmp, d_keys_tmp2) :
DoubleBuffer<KeyT>(d_keys_tmp2, d_keys_tmp);
d_values = num_passes % 2 == 0 ?
DoubleBuffer<ValueT>(d_values_tmp, d_values_tmp2) :
DoubleBuffer<ValueT>(d_values_tmp2, d_values_tmp);
}
d_keys.selector ^= 1;
d_values.selector ^= 1;
input_mode = output_is_tmp ? TMP_STORAGE : OUTPUT;
output_is_tmp = !output_is_tmp;
Contributor Author

@allisonvacanti Before I continue my work, I'd like to hear your feedback on this part. To support iterators, I have to add this verbose logic, because the input iterator type, the temporary-storage type, and the output iterator type can all be different.
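
To illustrate the shape of the problem: a runtime input_mode has to select among kernel instantiations whose input-iterator template parameter differs per case, so the launch code must be repeated in every branch. A minimal sketch of this pattern follows; pass_kernel and dispatch are hypothetical illustrations, not CUB APIs.

// Minimal sketch, not CUB code: a runtime enum picks among kernel
// instantiations whose input-iterator type differs at compile time.
enum InputMode { INPUT, TMP_STORAGE, OUTPUT };

template <typename InIterT, typename OutIterT>
__global__ void pass_kernel(InIterT in, OutIterT out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i]; // stand-in for one sorting pass
}

template <typename UserInIterT, typename KeyT>
void dispatch(InputMode mode, UserInIterT d_in, KeyT *d_tmp, KeyT *d_out, int n)
{
    int grid = (n + 255) / 256;
    switch (mode) {
    // Each case instantiates pass_kernel with a different compile-time
    // input type, so the launch is repeated once per case.
    case INPUT:       pass_kernel<<<grid, 256>>>(d_in,  d_out, n); break;
    case TMP_STORAGE: pass_kernel<<<grid, 256>>>(d_tmp, d_out, n); break;
    case OUTPUT:      pass_kernel<<<grid, 256>>>(d_out, d_out, n); break;
    }
}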

Contributor Author

I am not sure whether you are OK with this change; it is very verbose, but I cannot think of a better solution.

Collaborator

We'll need to restore the is_overwrite_okay optimization first and then we can take a closer look at this part.

num_items,
begin_bit,
end_bit,
is_overwrite_okay,
Contributor Author

is_overwrite_okay is removed: with iterator support, the algorithm never overwrites its input.

@alliepiper alliepiper self-assigned this Sep 21, 2021
@alliepiper alliepiper added this to the 1.15.0 milestone Sep 21, 2021
Collaborator

@alliepiper alliepiper left a comment

Thanks for the patch! I'd like to get this functionality into CUB, but the proposed implementation will be too disruptive, since there is no way for a user to avoid the large temporary storage allocations needed to hold the intermediate keys/values. When users call the DoubleBuffer APIs, they are explicitly providing the scratch space needed to hold these intermediate results -- we need to use this memory instead of reallocating it.

I definitely want to merge this, but we'll need to preserve the is_overwrite_okay optimization and only allocate the extra scratch memory when it is absolutely necessary.
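
For reference, the DoubleBuffer overload works roughly like this (standard CUB usage; the function and buffer names here are placeholders):

#include <cub/cub.cuh>

// Standard CUB DoubleBuffer usage: the caller supplies both buffers and
// the sort ping-pongs between them, so no extra num_items-sized key
// allocation is needed.
void sort_with_double_buffer(int *d_key_buf, int *d_key_alt_buf, int num_items)
{
    cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);

    // First call only computes the required temporary storage size.
    void  *d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                   d_keys, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call sorts; intermediate passes overwrite the alternate
    // buffer -- the "overwrite" that is_overwrite_okay refers to.
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                   d_keys, num_items);

    int *d_sorted = d_keys.Current(); // sorted keys end up here
    (void)d_sorted;
    cudaFree(d_temp_storage);
}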

@@ -1281,9 +1305,9 @@ struct DispatchRadixSort :
// lookback
max_num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT),
// extra key buffer
- is_overwrite_okay || num_passes <= 1 ? 0 : num_items * sizeof(KeyT),
+ num_passes <= 1 ? 0 : num_items * sizeof(KeyT),
Collaborator

This will drastically increase the amount of temporary storage needed for some invocations of radix sort. We need to keep this optimization in place for the DoubleBuffer overloads in the Device*RadixSort APIs, since folks specifically use those to reduce the temporary storage allocations.
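
To make the cost concrete, compare the two sizing expressions from the hunk above (a sketch; the key size is passed explicitly):

#include <cstddef>

// Sizing expressions copied from the diff above. With the optimization,
// DoubleBuffer callers (is_overwrite_okay == true) pay nothing extra;
// without it, every multi-pass sort allocates a full key buffer, e.g.
// num_items = 1 << 28 with 8-byte keys costs an extra 2 GiB.
size_t extra_key_bytes_before(bool is_overwrite_okay, int num_passes,
                              size_t num_items, size_t key_size)
{
    return (is_overwrite_okay || num_passes <= 1) ? 0 : num_items * key_size;
}

size_t extra_key_bytes_after(int num_passes, size_t num_items, size_t key_size)
{
    return (num_passes <= 1) ? 0 : num_items * key_size;
}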

INPUT,
TMP_STORAGE,
OUTPUT
} input_mode = INPUT;
Collaborator

Style: This should be split into separate declarations:

enum InputMode { ... };
InputMode input_mode = INPUT;

@@ -1822,8 +1966,8 @@ struct DispatchSegmentedRadixSort :
void* allocations[2] = {};
size_t allocation_sizes[2] =
{
- (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer
- (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer
+ num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer
Collaborator

Same as above, we need to keep this optimization in place.

@alliepiper alliepiper assigned zasdfgbnm and unassigned alliepiper Sep 30, 2021
@alliepiper
Collaborator

Also -- @senior-zero is adding a new segmented sort implementation that uses iterators instead of pointers, see #357. It preserves the double buffer optimizations, so it may be a useful reference.

@zasdfgbnm
Contributor Author

@allisonvacanti It looks like in #357 iterators are only supported for offsets; the input and output keys and values still need to be pointers?

@alliepiper
Collaborator

@zasdfgbnm My mistake -- you are correct. We don't have an example that does the switching. I checked the new merge sort implementation and it also copies the keys/values unconditionally. But that was a new algorithm, so it's less concerning.

I'd still like to update this PR to only allocate the extra temporary storage when the iterators aren't pointers to avoid changing the requirements of this algorithm -- it should be possible to have special logic for the first pass that reads from the iterators instead of the temp storage buffers. Let me know if you'd like to discuss this more.
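
One way to express that suggestion (a sketch, not the merged fix): gate the extra allocation on whether the input iterator is a raw pointer, so pointer-based DoubleBuffer callers keep the old footprint.

#include <cstddef>
#include <type_traits>

// Sketch of the suggestion above, not CUB code: request the extra key
// buffer only when the input iterator is not a raw pointer (in which
// case overwriting the user's buffers is impossible anyway), or when
// the caller forbids overwriting.
template <typename KeyInputIteratorT, typename KeyT>
size_t extra_key_bytes(bool is_overwrite_okay, int num_passes, size_t num_items)
{
    constexpr bool keys_are_pointers =
        std::is_pointer<KeyInputIteratorT>::value;
    const bool can_reuse_user_buffers = keys_are_pointers && is_overwrite_okay;
    return (can_reuse_user_buffers || num_passes <= 1)
               ? 0
               : num_items * sizeof(KeyT);
}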

@alliepiper
Collaborator

I'll be starting the 1.15 RC next week, and it looks like this will take a bit more work to be ready. Bumping to 1.16 milestone -- let me know if you plan to finish this before Monday and we can keep it at 1.15.

@alliepiper alliepiper modified the milestones: 1.15.0, 1.16.0 Oct 14, 2021
@alliepiper alliepiper added helps: pytorch Helps or needed by PyTorch. P1: should have Necessary, but not critical. labels Oct 14, 2021
@zasdfgbnm
Contributor Author

1.16 is fine with me

@alliepiper alliepiper modified the milestones: 1.16.0, 1.17.0 Feb 7, 2022
@alliepiper alliepiper added P3: backlog Unprioritized and removed P1: should have Necessary, but not critical. labels Apr 6, 2022
@alliepiper alliepiper modified the milestones: 1.17.0, Backlog Apr 25, 2022
@cliffburdick

Hi @zasdfgbnm , are there any updates on this?

@zasdfgbnm
Contributor Author

@cliffburdick No, I am not working on this any more.

Labels
helps: pytorch Helps or needed by PyTorch. P3: backlog Unprioritized
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Allow iterators in cub::DeviceRadixSort
3 participants