Priority Queue #105

Status: Open. Wants to merge 56 commits into base branch dev.

Commits (56):

5ab856e - Initial priority queue commit (Sep 9, 2021)
1f2092c - Add priority queue benchmark (Sep 9, 2021)
6a9dc99 - Class comment (Sep 9, 2021)
6b263e3 - Improve comments and switch to cuco style (Sep 9, 2021)
0eaaedf - Iterators (Sep 17, 2021)
249165c - Test for iterators with thrust device_vector (Sep 17, 2021)
c28a5ad - Add allocator template parameter (Oct 19, 2021)
e8a9c1e - Allocator (andrewbriand, Oct 20, 2021)
012ebde - Accept arbitrary comparison (andrewbriand, Oct 20, 2021)
8cf681a - Accept arbitrary types instead of just pairs (andrewbriand, Oct 24, 2021)
8485bec - Remove pq_pair.h (andrewbriand, Nov 2, 2021)
da608cc - Start porting priority queue benchmark to gbenchmark (andrewbriand, Nov 2, 2021)
8a11b7f - Finish porting priority queue benchmark to gbenchmark (andrewbriand, Nov 3, 2021)
d1392b9 - Add multiple node sizes to benchmark (andrewbriand, Dec 18, 2021)
9ee6c8b - Start porting tests to Catch2 (andrewbriand, Dec 18, 2021)
e223598 - Prevent block size from being larger than node size (andrewbriand, Dec 18, 2021)
dd8c6b7 - Continue porting tests to Catch2 (andrewbriand, Dec 19, 2021)
d031519 - Make generate_element for KVPair generic (andrewbriand, Dec 19, 2021)
ba3a6fd - Finish Catch2 tests (andrewbriand, Dec 26, 2021)
16db085 - Hide kernel launch details (andrewbriand, Dec 26, 2021)
052cec0 - Clean up partial deletion code (andrewbriand, Dec 27, 2021)
a11bea5 - Correct test comparisons (andrewbriand, Dec 27, 2021)
e3c4a27 - Commenting and cleanup (andrewbriand, Dec 27, 2021)
f6fa484 - Commenting for Compare (andrewbriand, Dec 27, 2021)
599067f - Cleanup, arbitrary number of elements for device API functions (andrewbriand, Dec 27, 2021)
44db340 - Formatting (andrewbriand, Dec 27, 2021)
acfdf7e - Add missing syncs (andrewbriand, Apr 12, 2022)
d870e29 - Merge NVIDIA:dev into andrewbriand:dev (andrewbriand, Apr 14, 2022)
71775b6 - [pre-commit.ci] auto code formatting (pre-commit-ci[bot], Apr 14, 2022)
9838569 - Add copyright to priority_queue_bench.cu (andrewbriand, May 31, 2022)
aab4ba0 - Add copyright to priority queue files (andrewbriand, May 31, 2022)
0196bde - Order headers from near to far in priority queue files (andrewbriand, May 31, 2022)
4af61ca - Bug fix in priority queue test code (andrewbriand, May 31, 2022)
a1d074a - [pre-commit.ci] auto code formatting (pre-commit-ci[bot], May 31, 2022)
bf930dd - Remove unnecessary allocator (andrewbriand, May 31, 2022)
2d9bda9 - [pre-commit.ci] auto code formatting (pre-commit-ci[bot], May 31, 2022)
54dc9f3 - Add missing member docs in priority_queue.cuh (andrewbriand, Jun 11, 2022)
a5c169d - [pre-commit.ci] auto code formatting (pre-commit-ci[bot], Jun 11, 2022)
4269e9c - Add stream parameter to priority queue ctor (andrewbriand, Jun 11, 2022)
30cbf83 - Snake case in priority queue files (andrewbriand, Jun 12, 2022)
bec63f3 - Put priority queue kernels in detail namespace (andrewbriand, Jun 12, 2022)
aa12404 - generate_keys_uniform -> generate_kv_pairs_uniform (andrewbriand, Jun 13, 2022)
55cf2e6 - Remove FavorInsertionPerformance template parameter (andrewbriand, Jun 13, 2022)
f4814db - Default node size 64 -> 1024 (andrewbriand, Jun 15, 2022)
89eea18 - Avoid c-style expressions in priority queue files (andrewbriand, Jun 15, 2022)
7d47200 - Remove FavorInsertionPerformance in priority queue benchmark (andrewbriand, Jun 15, 2022)
007316a - [pre-commit.ci] auto code formatting (pre-commit-ci[bot], Jun 15, 2022)
192e263 - Snake case in priority_queue_test.cu (andrewbriand, Jun 17, 2022)
66dd359 - [pre-commit.ci] auto code formatting (pre-commit-ci[bot], Jun 17, 2022)
9da822f - kPBufferIdx -> p_buffer_idx and kRootIdx -> root_idx (andrewbriand, Jun 17, 2022)
0cfdd94 - Use const and constexpr wherever possible in priority queue files (andrewbriand, Jun 19, 2022)
828b00b - [pre-commit.ci] auto code formatting (pre-commit-ci[bot], Jun 19, 2022)
1932418 - Add missing const in priority queue (andrewbriand, Jun 19, 2022)
7c4b1f6 - Add docs for stream parameter to priority queue ctor (andrewbriand, Jun 19, 2022)
838e4ea - Add value_type to priority_queue::device_mutable_view (andrewbriand, Jun 19, 2022)
d58dd9f - [pre-commit.ci] auto code formatting (pre-commit-ci[bot], Jun 19, 2022)
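For orientation: the diff below adds a GPU priority queue with a bulk host API (priority_queue::push / pop) and a cooperative-groups device API (device_mutable_view::push / pop). A minimal host-side sketch assembled from the types and calls in this diff; it relies on the default launch parameters declared in priority_queue.cuh, which is not part of these hunks:

#include <cstdint>

#include <cuco/priority_queue.cuh>

int main() {
  constexpr size_t n = 1 << 20;
  cuco::Pair<uint32_t, uint32_t>* d_elems;
  cudaMalloc((void**)&d_elems, n * sizeof(cuco::Pair<uint32_t, uint32_t>));
  // ... fill d_elems with key/value pairs, as the benchmark below does ...

  cuco::priority_queue<uint32_t, uint32_t> pq(n);  // capacity in elements
  pq.push(d_elems, n);  // bulk insert on the default stream
  pq.pop(d_elems, n);   // bulk delete-min, written back into d_elems

  cudaFree(d_elems);
  return 0;
}
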
4 changes: 4 additions & 0 deletions benchmarks/CMakeLists.txt
@@ -50,3 +50,7 @@ ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}")
###################################################################################################
set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu")
ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}")

#TODO: Port priority_queue benchmark to google benchmark
add_executable(PRIORITY_QUEUE_BENCH "${CMAKE_CURRENT_SOURCE_DIR}/priority_queue/priority_queue_bench.cu")
target_link_libraries(PRIORITY_QUEUE_BENCH cuco)
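
With this target in place, the benchmark builds like any other cuco executable (for example via cmake --build with the PRIORITY_QUEUE_BENCH target; the exact output path depends on the local build tree) and runs with no arguments, printing the insert/delete timing table produced by priority_queue_bench.cu below.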
187 changes: 187 additions & 0 deletions benchmarks/priority_queue/priority_queue_bench.cu
@@ -0,0 +1,187 @@
#include <iostream>
#include <vector>
#include <cstdint>
#include <cstdlib>

#include <cooperative_groups.h>
#include <cuda_runtime.h>

#include <cuco/priority_queue.cuh>
#include <cuco/detail/error.hpp>

using namespace cooperative_groups;  // for thread_block and this_thread_block()
using namespace cuco;

// Bulk insert with the device API: each block pushes one node-sized chunk
// of the input per iteration, striding over the input by gridDim.x nodes
template <typename Key, typename Value>
__global__ void DeviceAPIInsert(
  typename priority_queue<Key, Value>::device_mutable_view view,
  Pair<Key, Value> *elements,
  size_t num_elements) {
  extern __shared__ int shmem[];
  thread_block g = this_thread_block();
  for (size_t i = blockIdx.x * view.get_node_size();
       i < num_elements; i += gridDim.x * view.get_node_size()) {
    view.push(g, elements + i, min(view.get_node_size(), num_elements - i),
              shmem);
  }
}

// Bulk delete with the device API: mirrors DeviceAPIInsert, with each block
// popping one node-sized chunk of elements per iteration
template <typename Key, typename Value>
__global__ void DeviceAPIDelete(
  typename priority_queue<Key, Value>::device_mutable_view view,
  Pair<Key, Value> *out,
  size_t num_elements) {
  extern __shared__ int shmem[];
  thread_block g = this_thread_block();
  for (size_t i = blockIdx.x * view.get_node_size();
       i < num_elements; i += gridDim.x * view.get_node_size()) {
    view.pop(g, out + i, min(view.get_node_size(), num_elements - i), shmem);
  }
}

// Use CUDA events to time the code in the lambda function
template <typename F>
float TimeCode(F f) {
  cudaEvent_t t1;
  CUCO_CUDA_TRY(cudaEventCreate(&t1));

  cudaEvent_t t2;
  CUCO_CUDA_TRY(cudaEventCreate(&t2));

  CUCO_CUDA_TRY(cudaEventRecord(t1));
  f();
  CUCO_CUDA_TRY(cudaEventRecord(t2));

  CUCO_CUDA_TRY(cudaEventSynchronize(t1));
  CUCO_CUDA_TRY(cudaEventSynchronize(t2));

  float result;
  CUCO_CUDA_TRY(cudaEventElapsedTime(&result, t1, t2));

  // Destroy the events so repeated timing runs do not leak them
  CUCO_CUDA_TRY(cudaEventDestroy(t1));
  CUCO_CUDA_TRY(cudaEventDestroy(t2));

  return result;
}

// Time the insertion of the num_keys elements at d_elements into pq in ms
float TimeInsert(priority_queue<uint32_t, uint32_t> &pq,
                 Pair<uint32_t, uint32_t> *d_elements,
                 size_t num_keys) {
  return TimeCode([&]() {
    pq.push(d_elements, num_keys);
  });
}

// Time the insertion of the num_keys elements at d_elements into pq
// with the device API, in ms
float TimeInsertDeviceAPI(priority_queue<uint32_t, uint32_t> &pq,
                          Pair<uint32_t, uint32_t> *d_elements,
                          size_t num_keys) {
  return TimeCode([&]() {
    DeviceAPIInsert<<<64000, 256, pq.get_shmem_size(256)>>>
      (pq.get_mutable_device_view(), d_elements, num_keys);
  });
}

// Time the deletion of num_keys elements from pq with the device API, in ms
float TimeDeleteDeviceAPI(priority_queue<uint32_t, uint32_t> &pq,
                          Pair<uint32_t, uint32_t> *d_elements,
                          size_t num_keys) {
  return TimeCode([&]() {
    DeviceAPIDelete<<<32000, 512, pq.get_shmem_size(512)>>>
      (pq.get_mutable_device_view(), d_elements, num_keys);
  });
}

// Time the deletion of num_keys elements from pq in ms
float TimeDelete(priority_queue<uint32_t, uint32_t> &pq,
                 Pair<uint32_t, uint32_t> *d_elements,
                 size_t num_keys) {
  return TimeCode([&]() {
    pq.pop(d_elements, num_keys);
  });
}

// Follow the first experiment in the paper,
// inserting 512 million 4-byte keys and then deleting them all,
// repeating in ascending, descending, and random key order
void InsertThenDelete() {
  std::cout << "==Insert then delete==" << std::endl;

  size_t num_keys = 512e6;

  std::cout << num_keys << " keys" << std::endl;

  std::cout << "Order\t\tInsertion (ms)\t\tDeletion (ms)" << std::endl;

  // Allocate GPU memory to store the keys that will be inserted
  Pair<uint32_t, uint32_t> *d_elements;
  size_t num_bytes = num_keys * sizeof(Pair<uint32_t, uint32_t>);
  CUCO_CUDA_TRY(cudaMalloc((void**)&d_elements, num_bytes));

  priority_queue<uint32_t, uint32_t> pq(num_keys);

  // Ascending
  std::vector<Pair<uint32_t, uint32_t>> ascending(num_keys);

  for (uint32_t i = 0; i < num_keys; i++) {
    ascending[i] = {i, i};
  }

  CUCO_CUDA_TRY(cudaMemcpy(d_elements, &ascending[0],
                           num_bytes, cudaMemcpyHostToDevice));

  auto time_elapsed_insert = TimeInsert(pq, d_elements, num_keys);
  auto time_elapsed_delete = TimeDelete(pq, d_elements, num_keys);

  std::cout << "Ascend\t\t" << time_elapsed_insert << "\t\t"
            << time_elapsed_delete << std::endl;

  // Descending
  std::vector<Pair<uint32_t, uint32_t>> descending(num_keys);

  for (uint32_t i = 0; i < num_keys; i++) {
    descending[num_keys - i - 1] = {i, i};
  }

  CUCO_CUDA_TRY(cudaMemcpy(d_elements, &descending[0],
                           num_bytes, cudaMemcpyHostToDevice));

  time_elapsed_insert = TimeInsert(pq, d_elements, num_keys);
  time_elapsed_delete = TimeDelete(pq, d_elements, num_keys);

  std::cout << "Descend\t\t" << time_elapsed_insert << "\t\t"
            << time_elapsed_delete << std::endl;

  // Random
  std::vector<Pair<uint32_t, uint32_t>> random(num_keys);

  for (uint32_t i = 0; i < num_keys; i++) {
    random[i] = {(uint32_t)rand(), i};
  }

  CUCO_CUDA_TRY(cudaMemcpy(d_elements, &random[0],
                           num_bytes, cudaMemcpyHostToDevice));

  time_elapsed_insert = TimeInsert(pq, d_elements, num_keys);
  time_elapsed_delete = TimeDelete(pq, d_elements, num_keys);

  std::cout << "Random\t\t" << time_elapsed_insert << "\t\t"
            << time_elapsed_delete << std::endl;

  // Random again, this time through the device API
  CUCO_CUDA_TRY(cudaMemcpy(d_elements, &random[0],
                           num_bytes, cudaMemcpyHostToDevice));

  time_elapsed_insert = TimeInsertDeviceAPI(pq, d_elements, num_keys);
  time_elapsed_delete = TimeDeleteDeviceAPI(pq, d_elements, num_keys);

  std::cout << "Random Dev. API\t\t" << time_elapsed_insert << "\t\t"
            << time_elapsed_delete << std::endl;

  CUCO_CUDA_TRY(cudaFree(d_elements));
}

int main() {
  InsertThenDelete();

  return 0;
}
21 changes: 21 additions & 0 deletions include/cuco/detail/pq_pair.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

namespace cuco {

template <typename Key, typename Value>
struct Pair {
  Key key;
  Value value;
};

/*
 * Check if two Pairs have the same key and value
 *
 * @param a The first pair
 * @param b The second pair
 * @return true if both the keys and the values compare equal
 */
template <typename Key, typename Value>
bool operator==(const Pair<Key, Value> &a, const Pair<Key, Value> &b) {
  return a.key == b.key && a.value == b.value;
}

}  // namespace cuco
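
As a quick illustration (a hypothetical snippet, not part of the diff), the struct supports aggregate initialization and the host-side equality above:

#include <cassert>

#include <cuco/detail/pq_pair.cuh>

int main() {
  cuco::Pair<int, int> a{1, 2};
  cuco::Pair<int, int> b{1, 2};
  assert(a == b);  // member-wise: both key and value must match
  return 0;
}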
158 changes: 158 additions & 0 deletions include/cuco/detail/priority_queue.inl
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#pragma once
#include <cmath>

#include <cuco/detail/priority_queue_kernels.cuh>
#include <cuco/detail/error.hpp>

namespace cuco {

template <typename Key, typename Value, bool Max>
priority_queue<Key, Value, Max>::priority_queue(size_t initial_capacity,
                                                size_t node_size) {
  node_size_ = node_size;

  // Round the requested capacity up to a whole number of nodes
  int nodes = (initial_capacity + node_size_ - 1) / node_size_;

  node_capacity_ = nodes;

  // Start index of the heap's lowest level:
  // the largest power of two that does not exceed nodes
  lowest_level_start_ = 1 << static_cast<int>(log2(nodes));

  // Allocate device variables

  CUCO_CUDA_TRY(cudaMalloc((void**)&d_size_, sizeof(int)));
  CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int)));

  CUCO_CUDA_TRY(cudaMalloc((void**)&d_p_buffer_size_, sizeof(size_t)));
  CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t)));

  // Heap nodes plus one extra node of storage, which backs the partial
  // buffer whose fill level d_p_buffer_size_ tracks
  CUCO_CUDA_TRY(cudaMalloc((void**)&d_heap_,
                           sizeof(Pair<Key, Value>)
                             * (node_capacity_ * node_size_ + node_size_)));

  CUCO_CUDA_TRY(cudaMalloc((void**)&d_locks_,
                           sizeof(int) * (node_capacity_ + 1)));
  CUCO_CUDA_TRY(cudaMemset(d_locks_, 0,
                           sizeof(int) * (node_capacity_ + 1)));

  CUCO_CUDA_TRY(cudaMalloc((void**)&d_pop_tracker_, sizeof(int)));
}
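
To make the sizing concrete (illustrative numbers, not from the diff): with initial_capacity = 1000 and node_size = 64, nodes = ceil(1000 / 64) = 16, so node_capacity_ = 16 and lowest_level_start_ = 2^floor(log2(16)) = 16; d_heap_ then holds 16 * 64 + 64 = 1088 pairs and d_locks_ holds 17 ints.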

template <typename Key, typename Value, bool Max>
priority_queue<Key, Value, Max>::~priority_queue() {
  CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_size_));
  CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_p_buffer_size_));
  CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_heap_));
  CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_locks_));
  CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_pop_tracker_));
}


template <typename Key, typename Value, bool Max>
void priority_queue<Key, Value, Max>::push(Pair<Key, Value> *elements,
                                           size_t num_elements,
                                           int block_size,
                                           int grid_size,
                                           bool warp_level,
                                           cudaStream_t stream) {
  const int kBlockSize = block_size;
  const int kNumBlocks = grid_size;

  if (!warp_level) {
    // Block-level: one cooperative group per block, shared memory
    // sized for the full block
    PushKernel<Max><<<kNumBlocks, kBlockSize,
                      get_shmem_size(kBlockSize), stream>>>
      (elements, num_elements, d_heap_, d_size_,
       node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_);
  } else {
    // Warp-level: each 32-thread warp works independently, so shared
    // memory is sized per warp and scaled by the warps per block
    PushKernelWarp<Max><<<kNumBlocks, kBlockSize,
                          get_shmem_size(32) * kBlockSize / 32, stream>>>
      (elements, num_elements, d_heap_, d_size_,
       node_size_, d_locks_, d_p_buffer_size_,
       lowest_level_start_, get_shmem_size(32));
  }

  CUCO_CUDA_TRY(cudaGetLastError());
}

template <typename Key, typename Value, bool Max>
void priority_queue<Key, Value, Max>::pop(Pair<Key, Value> *out,
                                          size_t num_elements,
                                          int block_size,
                                          int grid_size,
                                          bool warp_level,
                                          cudaStream_t stream) {
  const int kBlockSize = block_size;
  const int kNumBlocks = grid_size;

  // Reset the pop tracker on the same stream so the reset is ordered
  // before the kernel launch, and check the result like the other calls
  CUCO_CUDA_TRY(cudaMemsetAsync(d_pop_tracker_, 0, sizeof(int), stream));

  if (!warp_level) {
    PopKernel<Max><<<kNumBlocks, kBlockSize,
                     get_shmem_size(kBlockSize), stream>>>
      (out, num_elements, d_heap_, d_size_,
       node_size_, d_locks_, d_p_buffer_size_,
       d_pop_tracker_, lowest_level_start_, node_capacity_);
  } else {
    PopKernelWarp<Max><<<kNumBlocks, kBlockSize,
                         get_shmem_size(32) * kBlockSize / 32, stream>>>
      (out, num_elements, d_heap_, d_size_,
       node_size_, d_locks_, d_p_buffer_size_,
       d_pop_tracker_, lowest_level_start_,
       node_capacity_, get_shmem_size(32));
  }

  CUCO_CUDA_TRY(cudaGetLastError());
}

template <typename Key, typename Value, bool Max>
template <typename CG>
__device__ void priority_queue<Key, Value, Max>::device_mutable_view::push(
  CG const& g,
  Pair<Key, Value> *elements,
  size_t num_elements,
  void *temp_storage) {
  SharedMemoryLayout<Key, Value> shmem =
    GetSharedMemoryLayout<Key, Value>((int*)temp_storage,
                                      g.size(), node_size_);

  // num_elements is expected to be at most node_size_; callers split
  // larger batches into node-sized chunks (see the benchmark kernels)
  if (num_elements == node_size_) {
    PushSingleNode<Max>(g, elements, d_heap_, d_size_, node_size_,
                        d_locks_, lowest_level_start_, shmem);
  } else if (num_elements < node_size_) {
    PushPartialNode<Max>(g, elements, num_elements, d_heap_,
                         d_size_, node_size_, d_locks_,
                         d_p_buffer_size_, lowest_level_start_, shmem);
  }
}

template <typename Key, typename Value, bool Max>
template <typename CG>
__device__ void priority_queue<Key, Value, Max>::device_mutable_view::pop(
  CG const& g,
  Pair<Key, Value> *out,
  size_t num_elements,
  void *temp_storage) {
  int pop_tracker = 0;

  SharedMemoryLayout<Key, Value> shmem =
    GetSharedMemoryLayout<Key, Value>((int*)temp_storage,
                                      g.size(), node_size_);

  // As with push, num_elements should not exceed node_size_
  if (num_elements == node_size_) {
    PopSingleNode<Max>(g, out, d_heap_, d_size_, node_size_, d_locks_,
                       d_p_buffer_size_, &pop_tracker, lowest_level_start_,
                       node_capacity_, shmem);
  } else {
    PopPartialNode<Max>(g, out, num_elements, d_heap_, d_size_, node_size_,
                        d_locks_, d_p_buffer_size_, lowest_level_start_,
                        node_capacity_, shmem);
  }
}

}  // namespace cuco