From 5d868441b5672490d75ddfca287b75d1f60de48c Mon Sep 17 00:00:00 2001
From: Alex Yu
Date: Tue, 14 Dec 2021 16:47:00 -0800
Subject: [PATCH] Added some launch_bounds #6

---
 svox2/csrc/loss_kernel.cu |  6 +++---
 svox2/csrc/misc_kernel.cu | 35 +++++++++++++++++++----------------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/svox2/csrc/loss_kernel.cu b/svox2/csrc/loss_kernel.cu
index 0a3ca7db..14fc93dc 100644
--- a/svox2/csrc/loss_kernel.cu
+++ b/svox2/csrc/loss_kernel.cu
@@ -61,6 +61,7 @@ void calculate_ray_scale(float ndc_coeffx,
         maxz, \
         out_name)
 
+__launch_bounds__(TV_GRAD_CUDA_THREADS, MIN_BLOCKS_PER_SM)
 __global__ void tv_kernel(
         torch::PackedTensorAccessor32<int32_t, 3, torch::RestrictPtrTraits> links,
         torch::PackedTensorAccessor64<float, 2, torch::RestrictPtrTraits> data,
@@ -497,10 +498,9 @@ torch::Tensor tv(torch::Tensor links, torch::Tensor data,
     int nl = (links.size(0) - 1) * (links.size(1) - 1) * (links.size(2) - 1);
     size_t Q = nl * size_t(end_dim - start_dim);
 
-    const int cuda_n_threads = 1024;
-    const int blocks = CUDA_N_BLOCKS_NEEDED(Q, cuda_n_threads);
+    const int blocks = CUDA_N_BLOCKS_NEEDED(Q, TV_GRAD_CUDA_THREADS);
     torch::Tensor result = torch::zeros({}, data.options());
-    device::tv_kernel<<<blocks, cuda_n_threads>>>(
+    device::tv_kernel<<<blocks, TV_GRAD_CUDA_THREADS>>>(
             links.packed_accessor32<int32_t, 3, torch::RestrictPtrTraits>(),
             data.packed_accessor64<float, 2, torch::RestrictPtrTraits>(),
             start_dim,
diff --git a/svox2/csrc/misc_kernel.cu b/svox2/csrc/misc_kernel.cu
index 6031218f..7843c6d9 100644
--- a/svox2/csrc/misc_kernel.cu
+++ b/svox2/csrc/misc_kernel.cu
@@ -11,9 +11,13 @@
 #include "cubemap_util.cuh"
 
 namespace {
+
+const int MISC_CUDA_THREADS = 256;
+const int MISC_MIN_BLOCKS_PER_SM = 4;
 namespace device {
 
 // Can also implement using convs.....
+__launch_bounds__(MISC_CUDA_THREADS, MISC_MIN_BLOCKS_PER_SM)
 __global__ void dilate_kernel(
         const torch::PackedTensorAccessor32<bool, 3, torch::RestrictPtrTraits> grid,
         // Output
@@ -49,6 +53,7 @@ __global__ void dilate_kernel(
 // ** Distance transforms
 // TODO: Maybe replace this with an euclidean distance transform eg PBA
 // Actual L-infty distance transform; turns out this is slower than the geometric way
+__launch_bounds__(MISC_CUDA_THREADS, MISC_MIN_BLOCKS_PER_SM)
 __global__ void accel_linf_dist_transform_kernel(
         torch::PackedTensorAccessor32<int32_t, 3, torch::RestrictPtrTraits> grid,
         int32_t* __restrict__ tmp,
@@ -101,6 +106,7 @@
 }
 
 // Geometric L-infty distance transform-ish thing
+__launch_bounds__(MISC_CUDA_THREADS, MISC_MIN_BLOCKS_PER_SM)
 __global__ void accel_dist_set_kernel(
         const torch::PackedTensorAccessor32<int32_t, 3, torch::RestrictPtrTraits> grid,
         bool* __restrict__ tmp) {
@@ -134,6 +140,7 @@
     }
 }
 
+__launch_bounds__(MISC_CUDA_THREADS, MISC_MIN_BLOCKS_PER_SM)
 __global__ void accel_dist_prop_kernel(
         torch::PackedTensorAccessor32<int32_t, 3, torch::RestrictPtrTraits> grid,
         const bool* __restrict__ tmp) {
@@ -299,6 +306,7 @@ __device__ __inline__ void grid_trace_ray(
 //             chnl_id);
 //     }
 
+__launch_bounds__(MISC_CUDA_THREADS, MISC_MIN_BLOCKS_PER_SM)
 __global__ void grid_weight_render_kernel(
     const torch::PackedTensorAccessor32<float, 3, torch::RestrictPtrTraits>
         data,
@@ -336,10 +344,9 @@ torch::Tensor dilate(torch::Tensor grid) {
 
     int Q = grid.size(0) * grid.size(1) * grid.size(2);
 
-    const int cuda_n_threads = std::min(Q, CUDA_MAX_THREADS);
-    const int blocks = CUDA_N_BLOCKS_NEEDED(Q, cuda_n_threads);
+    const int blocks = CUDA_N_BLOCKS_NEEDED(Q, MISC_CUDA_THREADS);
     torch::Tensor result = torch::empty_like(grid);
-    device::dilate_kernel<<<blocks, cuda_n_threads>>>(
+    device::dilate_kernel<<<blocks, MISC_CUDA_THREADS>>>(
             grid.packed_accessor32<bool, 3, torch::RestrictPtrTraits>(),
             // Output
             result.packed_accessor32<bool, 3, torch::RestrictPtrTraits>());
@@ -359,8 +366,7 @@ void accel_dist_prop(torch::Tensor grid) {
 
     int Q = grid.size(0) * grid.size(1) * grid.size(2);
 
-    const int cuda_n_threads = std::min(Q, CUDA_MAX_THREADS);
-    const int blocks = CUDA_N_BLOCKS_NEEDED(Q, cuda_n_threads);
+    const int blocks = CUDA_N_BLOCKS_NEEDED(Q, MISC_CUDA_THREADS);
 
     int64_t req_size = 0;
     while (sz_x > 1 && sz_y > 1 && sz_z > 1) {
@@ -376,11 +382,11 @@ void accel_dist_prop(torch::Tensor grid) {
                                 .device(grid.device())
                                 .requires_grad(false);
     torch::Tensor tmp = torch::zeros({req_size}, tmp_options);
-    device::accel_dist_set_kernel<<<blocks, cuda_n_threads>>>(
+    device::accel_dist_set_kernel<<<blocks, MISC_CUDA_THREADS>>>(
             grid.packed_accessor32<int32_t, 3, torch::RestrictPtrTraits>(),
             tmp.data_ptr<bool>());
 
-    device::accel_dist_prop_kernel<<<blocks, cuda_n_threads>>>(
+    device::accel_dist_prop_kernel<<<blocks, MISC_CUDA_THREADS>>>(
             grid.packed_accessor32<int32_t, 3, torch::RestrictPtrTraits>(),
             tmp.data_ptr<bool>());
 
@@ -396,10 +402,9 @@ void accel_dist_prop(torch::Tensor grid) {
 //     int d1 = 3 - d0 - d2;
 //     int Q = grid.size(d0) * grid.size(d1);
 //
-//     const int cuda_n_threads = std::min(Q, CUDA_MAX_THREADS);
-//     const int blocks = CUDA_N_BLOCKS_NEEDED(Q, cuda_n_threads);
+//     const int blocks = CUDA_N_BLOCKS_NEEDED(Q, MISC_CUDA_THREADS);
 //
-//     device::accel_linf_dist_transform_kernel<<<blocks, cuda_n_threads>>>(
+//     device::accel_linf_dist_transform_kernel<<<blocks, MISC_CUDA_THREADS>>>(
 //             grid.packed_accessor32<int32_t, 3, torch::RestrictPtrTraits>(),
 //             tmp,
 //             d2);
@@ -425,10 +430,9 @@ void grid_weight_render(
     cam.check();
     const size_t Q = size_t(cam.width) * cam.height;
 
-    const int cuda_n_threads = 512;
-    const int blocks = CUDA_N_BLOCKS_NEEDED(Q, cuda_n_threads);
+    const int blocks = CUDA_N_BLOCKS_NEEDED(Q, MISC_CUDA_THREADS);
 
-    device::grid_weight_render_kernel<<<blocks, cuda_n_threads>>>(
+    device::grid_weight_render_kernel<<<blocks, MISC_CUDA_THREADS>>>(
         data.packed_accessor32<float, 3, torch::RestrictPtrTraits>(),
         cam,
         step_size,
@@ -455,10 +459,9 @@ void grid_weight_render(
 //     TORCH_CHECK(cubemap.size(1) == cubemap.size(2));
 //
 //     const size_t Q = size_t(dirs.size(0)) * cubemap.size(3);
-//     const int cuda_n_threads = 512;
-//     const int blocks = CUDA_N_BLOCKS_NEEDED(Q, cuda_n_threads);
+//     const int blocks = CUDA_N_BLOCKS_NEEDED(Q, MISC_CUDA_THREADS);
 //
-//     device::sample_cubemap_kernel<<<blocks, cuda_n_threads>>>(
+//     device::sample_cubemap_kernel<<<blocks, MISC_CUDA_THREADS>>>(
 //             cubemap.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
 //             dirs.packed_accessor32<float, 2, torch::RestrictPtrTraits>(),
 //             Q,
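
Note on the technique (illustration only, not part of the patch): __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor) promises the compiler that the kernel will never be launched with more than maxThreadsPerBlock threads per block, and asks it to cap per-thread register use so that at least minBlocksPerMultiprocessor blocks can be resident on each SM. A kernel compiled with the qualifier fails to launch if given a larger block size, which is why every launch site above also switches from a runtime-chosen cuda_n_threads (std::min(Q, CUDA_MAX_THREADS), 512, or 1024) to the same compile-time constant named in the bound. The sketch below is a minimal, self-contained CUDA program showing the pattern; scale_kernel, SKETCH_CUDA_THREADS, and SKETCH_MIN_BLOCKS_PER_SM are hypothetical stand-ins for svox2's kernels and its TV_GRAD_CUDA_THREADS / MISC_CUDA_THREADS constants, not code from the repository.

// launch_bounds_sketch.cu -- build with: nvcc -o launch_bounds_sketch launch_bounds_sketch.cu
#include <cstdio>
#include <cuda_runtime.h>

const int SKETCH_CUDA_THREADS = 256;      // stand-in for MISC_CUDA_THREADS
const int SKETCH_MIN_BLOCKS_PER_SM = 4;   // stand-in for MISC_MIN_BLOCKS_PER_SM

// Same placement as in the patch: the qualifier precedes __global__.
// It promises the block never exceeds SKETCH_CUDA_THREADS threads and
// requests >= SKETCH_MIN_BLOCKS_PER_SM resident blocks per SM.
__launch_bounds__(SKETCH_CUDA_THREADS, SKETCH_MIN_BLOCKS_PER_SM)
__global__ void scale_kernel(float* __restrict__ out,
                             const float* __restrict__ in,
                             float s, size_t Q) {
    size_t tid = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
    if (tid >= Q) return;  // tail guard, as in svox2's kernels
    out[tid] = s * in[tid];
}

int main() {
    const size_t Q = 1 << 20;
    float *in = nullptr, *out = nullptr;
    cudaMallocManaged(&in, Q * sizeof(float));
    cudaMallocManaged(&out, Q * sizeof(float));
    for (size_t i = 0; i < Q; ++i) in[i] = 1.0f;

    // Ceiling division, equivalent to svox2's CUDA_N_BLOCKS_NEEDED(Q, threads).
    // The block size passed at launch must not exceed the declared bound.
    const int blocks = int((Q + SKETCH_CUDA_THREADS - 1) / SKETCH_CUDA_THREADS);
    scale_kernel<<<blocks, SKETCH_CUDA_THREADS>>>(out, in, 2.0f, Q);
    cudaDeviceSynchronize();
    printf("out[0] = %f (expect 2.0)\n", out[0]);

    cudaFree(in);
    cudaFree(out);
    return 0;
}

The second argument is a hint rather than a hard guarantee: capping registers to raise occupancy can force spills to local memory and slow the kernel down, so a value like the 4 used here (mirroring the patch's MISC_MIN_BLOCKS_PER_SM) is a tuning knob to profile, not a free win.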