From 60d4b49cef5cf00dbafe99dcaee05478be8da729 Mon Sep 17 00:00:00 2001 From: Edwin Carlinet <edwin.carlinet@lrde.epita.fr> Date: Wed, 21 Sep 2022 11:22:22 +0200 Subject: [PATCH 1/3] Add mst impl. --- CMakeLists.txt | 35 ++--- CMakePresets.json | 33 ++++ conanfile.txt | 4 + include/mst.h | 4 + src_mst/main.cpp | 28 ++++ src_mst/mst.cu | 341 ++++++++++++++++++++++++++++++++++++++++++ src_mst/mst.cuh | 86 +++++++++++ src_mst/mst_launch.cu | 144 ++++++++++++++++++ 8 files changed, 652 insertions(+), 23 deletions(-) create mode 100644 CMakePresets.json create mode 100644 include/mst.h create mode 100644 src_mst/main.cpp create mode 100644 src_mst/mst.cu create mode 100644 src_mst/mst.cuh create mode 100644 src_mst/mst_launch.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b2fc03..4db1578 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,30 +1,10 @@ cmake_minimum_required(VERSION 3.20) project(gpu_alpha_tree LANGUAGES CXX CUDA) -include(FindCUDAToolkit) +#include(FindCUDAToolkit) include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake) conan_basic_setup(TARGETS) -include(FetchContent) -FetchContent_Declare( - GoogleTest - URL https://github.com/google/googletest/archive/refs/tags/v1.10.x.tar.gz) -FetchContent_Declare( - GoogleBenchmark - URL https://github.com/google/benchmark/archive/v1.4.1.tar.gz) -FetchContent_Declare( - CLI11 - URL https://github.com/CLIUtils/CLI11/archive/v1.8.0.tar.gz) - -if (NOT GoogleBenchmark_POPULATED) - FetchContent_Populate(GoogleBenchmark) - set(BENCHMARK_ENABLE_GTEST_TESTS OFF CACHE BOOL "From Gtest") - set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "From Gtest ") - add_subdirectory(${googlebenchmark_SOURCE_DIR} ${googlebenchmark_BINARY_DIR}) -endif () - -FetchContent_MakeAvailable(CLI11) -FetchContent_MakeAvailable(GoogleTest) enable_testing() find_package(OpenCV REQUIRED COMPONENTS core imgcodecs) @@ -70,8 +50,17 @@ set_target_properties(gpu_alpha_tree PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +add_library(gpu-mst src_mst/mst_launch.cu src_mst/mst.cu) +target_compile_options(gpu-mst PRIVATE "--expt-relaxed-constexpr") +target_link_libraries(gpu-mst PRIVATE CONAN_PKG::fmt) +target_include_directories(gpu-mst PUBLIC include) + + add_executable(main src/main.cpp) -target_link_libraries(main PRIVATE gpu_alpha_tree CLI11::CLI11 ${OpenCV_LIBS}) +target_link_libraries(main PRIVATE gpu_alpha_tree CONAN_PKG::cli11 ${OpenCV_LIBS}) + +add_executable(mst-bin src_mst/main.cpp) +target_link_libraries(mst-bin PRIVATE gpu-mst CONAN_PKG::cli11 ${OpenCV_LIBS} CONAN_PKG::fmt) add_executable(bench ${BENCHMARKS}) target_link_libraries(bench PRIVATE gpu_alpha_tree benchmark ${OpenCV_LIBS} boost_thread) @@ -79,4 +68,4 @@ target_include_directories(bench PRIVATE ${CONAN_INCLUDE_DIRS_BOOST}) target_link_directories(bench PRIVATE ${CONAN_LIB_DIRS_BOOST}) add_executable(tests ${TESTS}) -target_link_libraries(tests PRIVATE gpu_alpha_tree gtest_main) \ No newline at end of file +target_link_libraries(tests PRIVATE gpu_alpha_tree CONAN_PKG::gtest) \ No newline at end of file diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 0000000..70a7c7c --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,33 @@ +{ + "version": 2, + "configurePresets": [ + { + "name": "debug", + "description": "Sets Ninja generator, build and install directory", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_INSTALL_PREFIX": "${sourceDir}/build/install/${presetName}", + "CMAKE_CUDA_ARCHITECTURES" : "70-virtual;72-real" + }, + "environment": { + "CUDACXX": "/usr/local/cuda/bin/nvcc" + } + }, + { + "name": "release", + "description": "Sets Ninja generator, build and install directory", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_INSTALL_PREFIX": "${sourceDir}/build/install/${presetName}", + "CMAKE_CUDA_ARCHITECTURES" : "70-virtual;72-real" + }, + "environment": { + "CUDACXX": "/usr/local/cuda/bin/nvcc" + } + } + ] +} diff --git a/conanfile.txt b/conanfile.txt index e18c50b..b1f1739 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -1,5 +1,9 @@ [requires] boost/1.71.0 +gtest/[>=1.10] +benchmark/[>=1.5] +cli11/[>=2.0] +fmt/[>=7] [generators] cmake \ No newline at end of file diff --git a/include/mst.h b/include/mst.h new file mode 100644 index 0000000..a314101 --- /dev/null +++ b/include/mst.h @@ -0,0 +1,4 @@ +#pragma once +#include <cstdint> + +void mst(const uint8_t* input, int width, int height, std::ptrdiff_t stride); \ No newline at end of file diff --git a/src_mst/main.cpp b/src_mst/main.cpp new file mode 100644 index 0000000..c8a6dcb --- /dev/null +++ b/src_mst/main.cpp @@ -0,0 +1,28 @@ +#include "mst.h" + +#include <CLI/CLI.hpp> +#include <opencv4/opencv2/core.hpp> +#include <opencv4/opencv2/imgcodecs.hpp> +#include <cstdio> +#include <fmt/core.h> + +int main(int argc, char** argv) +{ + std::string image_path; + + CLI::App app("gpu_alpha_tree"); + app.add_option("-i", image_path, "Input image path")->required()->check(CLI::ExistingFile); + + CLI11_PARSE(app, argc, argv); + + auto image = cv::imread(image_path, cv::IMREAD_ANYCOLOR); + if (!((image.channels() == 1) && (image.depth() == CV_8U))) + { + fmt::print(stderr, "Input image must be 8-bit grayscale ({} channels provided).\n", image.channels()); + std::exit(1); + } + + + mst(image.data, image.rows, image.cols, image.rows * sizeof(uint8_t)); + return 0; +} \ No newline at end of file diff --git a/src_mst/mst.cu b/src_mst/mst.cu new file mode 100644 index 0000000..262981b --- /dev/null +++ b/src_mst/mst.cu @@ -0,0 +1,341 @@ +#include "mst.cuh" +#include <cstdint> +#include <cstddef> +#include <utility> +#include <cassert> +#include <cstdio> +#include <algorithm> + +#include <cuda/barrier> +#include <cuda/annotated_ptr> +#include <cooperative_groups.h> + +namespace cg = cooperative_groups; + +template <class T> +__device__ void swap(T& a, T& b) { T tmp = b; b = a; a = tmp;} + + +/// +///@brief Make union between two components (u,v) +/// +///@param L +///@param u +///@param v +///@return Return the new root of the component +__device__ int unite(int* L, int u, int v) +{ + while (1) + { + if (u > v) + swap(u,v); + if (u == v) + return u; + int old = atomicMin(&L[v], u); + if (old == v) + return u; + v = old; + } +} + +__device__ int findroot(int* L, int x) +{ + int q; + while ((q = L[x]) != x) + x = q; + return q; +} + + +__global__ void mst_init(const uint8_t* input, int width, int height, int stride, int* L, uint8_t* MST) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int u = y * stride + x; + int left = u - 1; + int right = u + 1; + int up = u - stride; + int down = u + stride; + + + // 1. Make-set + //if (x < width && y < height) + // L[u] = u; + // + //__syncthreads(); + + // 2. Select the weak edge and compute the CC + + if (x < width && y < height) + { + int m = INT_MAX; + int M; + int v; + int e; + if (y > 0 && (M = std::abs(input[u] - input[up])) < m) { e = NORTH; v = up; m = M; } + if (x > 0 && (M = std::abs(input[u] - input[left])) < m) { e = WEST; v = left; m = M; } + if (x < (width-1) && (M = std::abs(input[u] - input[right])) < m) { e = EAST; v = right; m = M; } + if (y < (height-1) && (M = std::abs(input[u] - input[down])) < m) { e = SOUTH; v = down; m = M; } + + + unite(L, v, u); // Unite + MST[u] = e; // Set the edge "e" as part of the MST + } + + __syncthreads(); + + // 3. compress + if (x < width && y < height) + L[u] = findroot(L, u); +} + + +__global__ void compress(const uint8_t* input, int width, int height, int stride, int* L, int* gCount, uint64_t* WE) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int u = y * stride + x; + + if (x >= width || y >= height) + return; + + int q = (L[u] = findroot(L, u)); + if (u == q) + { + atomicAdd(gCount, 1); + WE[u] = WEdge::zero(); + } +} + +template <int ITEMS_PER_THREAD, class T> +__forceinline__ __device__ void loadbuffer(T* dst, const T* src, int size) +{ + for (int k = 0; k < ITEMS_PER_THREAD; ++k) + { + int i = blockDim.x * k + threadIdx.x; + if (i < size) + dst[i] = src[i]; + } +} + + +template <class T, class Barrier> +__forceinline__ __device__ void loadbuffer_async( + const cg::thread_block& g, + T* dst, + const T* src, + int size, + Barrier& barrier) +{ + cuda::memcpy_async(g, + cuda::annotated_ptr<T, cuda::access_property::shared>(dst), + cuda::annotated_ptr<T, cuda::access_property::streaming>((T*)src), size * sizeof(T), barrier); +} + + + + +__global__ void mst_get_weak_edges(const uint8_t* input, int width, int height, int stride, int* L, uint64_t* WE) +{ + //using input_load_t = cub::BlockLoad<uint8_t, BLOCK_WIDTH, 2, BLOCK_LOAD_DIRECT, BLOCK_HEIGHT>; + //using L_load_t = cub::BlockLoad<int, BLOCK_WIDTH, 2, BLOCK_LOAD_DIRECT, BLOCK_HEIGHT>; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int u = y * stride + x; + + assert(BLOCK_HEIGHT == blockDim.y); + assert(BLOCK_WIDTH == blockDim.x); + + + __shared__ uint8_t s_input_data[BLOCK_HEIGHT + 2][BLOCK_WIDTH + 2]; + __shared__ int s_L_data[BLOCK_HEIGHT + 2][BLOCK_WIDTH + 2]; + + static_assert(sizeof(s_input_data) == (BLOCK_HEIGHT + 2)*(BLOCK_WIDTH + 2)); + + // Compute the bounds of the regions to copy (with the 1-pixel border) + { + int x0 = blockIdx.x * BLOCK_WIDTH - 1; + int y0 = int(blockIdx.y * blockDim.y) - 1; + int x1 = std::min(width, (int)((blockIdx.x + 1) * BLOCK_WIDTH)); + int y1 = std::min(height, (int)((blockIdx.y + 1) * blockDim.y)); + int w = x1 - x0 + 1; + int h = y1 - y0 + 1; + + if (threadIdx.y < h) + { + loadbuffer<2>(s_input_data[threadIdx.y], input + (y0 + (int)threadIdx.y) * stride + x0, w); + loadbuffer<2>(s_L_data[threadIdx.y], L + (y0 + (int)threadIdx.y) * stride + x0, w); + } + int ty2 = threadIdx.y + blockDim.y; + if (ty2 < h) + { + loadbuffer<2>(s_input_data[ty2], input + (y0 + ty2) * stride + x0, w); + loadbuffer<2>(s_L_data[ty2], L + (y0 + ty2) * stride + x0, w); + } + } + __syncthreads(); + + const uint8_t* s_input = &(s_input_data[1][1]); + int* s_L = &(s_L_data[1][1]); + + // 1. Select the weak outer-edge of the current pixel the CC + if (x < width && y < height) + { + int i = threadIdx.y * (BLOCK_WIDTH + 2) + threadIdx.x; + int left = i - 1; + int right = i + 1; + int up = i - (BLOCK_WIDTH + 2); + int down = i + (BLOCK_WIDTH + 2); + int m = INT_MAX; + int M; + Direction e; + int q = s_L[i]; + int v = s_input[i]; + if ((s_L[up] >= 0) && (s_L[up] != q) && (M = std::abs(v - s_input[up])) < m) { e = NORTH; m = M; } + if ((s_L[left] >= 0) && (s_L[left] != q) && (M = std::abs(v - s_input[left])) < m) { e = WEST; m = M; } + if ((s_L[right] >= 0) && (s_L[right] != q) && (M = std::abs(v - s_input[right])) < m) { e = EAST; m = M; } + if ((s_L[down] >= 0) && (s_L[down] != q) && (M = std::abs(v - s_input[down])) < m) { e = SOUTH; m = M; } + + //L[u] = q; + if (m == INT_MAX) + return; + + // 2. Reduce -> For-each CC get the current weakest-edge + atomicMin((unsigned long long*)&WE[q], WEdge::pack(u, e, m)); + } +} + +template <class T, size_t n> +__device__ __forceinline__ void roll(T* (&tab)[n]) +{ + auto tmp = tab[0]; + for (size_t i = 0; i < (n-1); ++i) + tab[i] = tab[i+1]; + tab[n-1] = tmp; +} + + + +__global__ void mst_get_weak_edges_wpt(const uint8_t* input, int width, int height, int stride, int* L, uint64_t* WE) +{ + constexpr int BLOCK_WIDTH = GET_EDGES_BLOCK_DIM.x; + constexpr int BLOCK_HEIGHT = GET_EDGES_BLOCK_DIM.y; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int x0 = int(blockIdx.x * BLOCK_WIDTH) - 1; + const int y0 = int(blockIdx.y * BLOCK_HEIGHT) - 1; + const int x1 = std::min(width, (int)((blockIdx.x + 1) * blockDim.x)); + const int y1 = std::min(height, (int)((blockIdx.y + 1) * BLOCK_HEIGHT)); + const int w = x1 - x0 + 1; + const int h = y1 - y0 + 1; + auto g = cg::this_thread_block(); + + __shared__ uint8_t s_input_data[4][BLOCK_WIDTH + 2]; + __shared__ int s_L_data[4][BLOCK_WIDTH + 2]; + + __shared__ uint8_t* s_input[4]; + __shared__ int* s_L[4]; + + + if (threadIdx.x == 0) + { + s_input[0] = s_input_data[0]; + s_input[1] = s_input_data[1]; + s_input[2] = s_input_data[2]; + s_input[3] = s_input_data[3]; + s_L[0] = s_L_data[0]; + s_L[1] = s_L_data[1]; + s_L[2] = s_L_data[2]; + s_L[3] = s_L_data[3]; + } + + auto swap_buffer_on_completion = []() { roll(s_L); roll(s_input); }; + __shared__ cuda::barrier<cuda::thread_scope_block> bar; + init(&bar, BLOCK_WIDTH); + + __syncthreads(); + + // Load border and the first line + { + loadbuffer_async(g, s_input[1], input + y0 * stride + x0, w, bar); + loadbuffer_async(g, s_L[1], L + y0 * stride + x0, w, bar); + loadbuffer_async(g, s_input[2], input + (y0 + 1) * stride + x0, w, bar); + loadbuffer_async(g, s_L[2], L + (y0 + 1) * stride + x0, w, bar); + loadbuffer_async(g, s_input[3], input + (y0 + 2) * stride + x0, w, bar); + loadbuffer_async(g, s_L[3], L + (y0 + 2) * stride + x0, w, bar); + } + + // 1. Select the weak outer-edge of the current pixel the CC + assert((y0 + 1) < y1); + for (int y = y0 + 1; y < y1; ++y) + { + bar.arrive_and_wait(); + if (threadIdx.x == 0) + swap_buffer_on_completion(); + __syncthreads(); + + // prefetch the line N+2 + if ((y+1) < y1) { + loadbuffer_async(g, s_input[3], input + (y+2) * stride + x0, w, bar); + loadbuffer_async(g, s_L[3], L + (y+2) * stride + x0, w, bar); + } + + if (x < width) + { + int left = threadIdx.x; + int i = threadIdx.x + 1; + int right = threadIdx.x + 2; + int m = INT_MAX; + int M; + Direction e; + int q = s_L[1][i]; + int v = s_input[1][i]; + if ((s_L[0][i] >= 0) & (s_L[0][i] != q) & (M = std::abs(v - s_input[0][i])) < m) { e = NORTH; m = M; } + if ((s_L[1][left] >= 0) & (s_L[1][left] != q) & (M = std::abs(v - s_input[1][left])) < m) { e = WEST; m = M; } + if ((s_L[1][right] >= 0) & (s_L[1][right] != q) & (M = std::abs(v - s_input[1][right])) < m) { e = EAST; m = M; } + if ((s_L[2][i] >= 0) & (s_L[2][i] != q) & (M = std::abs(v - s_input[2][i])) < m) { e = SOUTH; m = M; } + + const int u = y * stride + x; + //L[u] = q; + + // 2. Reduce -> For-each CC get the current weakest-edge + if (m != INT_MAX) + atomicMin((unsigned long long*)&WE[q], WEdge::pack(u, e, m)); + } + } +} + + +__global__ void mst_merge_components(const uint8_t* input, int width, int height, int stride, int* L, uint8_t* MST, uint64_t* WE) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int u = y * stride + x; + + if (x >= width || y >= height) + return; + + // Only the root has work + if (u != L[u]) + return; + + assert(WE[u] != WEdge::zero()); + auto e = WEdge::unpack(WE[u]); + + int offset = 0; + switch (e.dir) { + case NORTH: offset = -stride; break; + case SOUTH: offset = +stride; break; + case WEST: offset = -1; break; + case EAST: offset = +1; break; + } + + int root = u; + u = e.origin; + int v = u + offset; + //printf("CC: %i edge %i -> %i\n", root, u, v); + unite(L, root, L[v]); // Unite + MST[u] = MST[u] | e.dir; // Set the edge "e" as part of the MST (uniqueness of the writer, the root representative thread) +} + diff --git a/src_mst/mst.cuh b/src_mst/mst.cuh new file mode 100644 index 0000000..00bb2af --- /dev/null +++ b/src_mst/mst.cuh @@ -0,0 +1,86 @@ +#pragma once +#include <cstdint> + +constexpr int BLOCK_WIDTH = 32; +constexpr int BLOCK_HEIGHT = 8; + +constexpr dim3 GET_EDGES_BLOCK_DIM = dim3(128, 8); + +enum Direction : uint8_t { + NORTH = 1, + WEST= 2, + EAST = 4, + SOUTH = 8, +}; + +struct WEdge { + int origin; + Direction dir; + uint8_t weight; + + // Pack into: + // weight: 16 + // direction: 16 + // origin: 32 + __device__ static inline constexpr uint64_t pack(uint64_t origin, Direction dir, uint64_t weight) { + return (weight << 48) | ((uint64_t) dir << 32) | origin; + } + + __device__ static inline constexpr WEdge unpack(uint64_t value) { + WEdge e = { + .origin = (int)(value & 0xFFFFFFFF), + .dir = (Direction)((value >> 32) & 0xFF), + .weight = (uint8_t)(value >> 48), + }; + return e; + } + + __device__ static inline constexpr uint64_t zero() { return UINT64_MAX; } +}; + + +__global__ void mst_init(const uint8_t* input, int width, int height, int stride, int* L, uint8_t* MST); + +/// +///@brief Get the weak edges between hyper-vertices +/// +///@param input The input image +///@param width +///@param height +///@param stride +///@param L Label image for the connected component labeling +///@param WE Store the weak edge for each component in WE[L] where L is the label of the component +///@return __global__ +__global__ void mst_get_weak_edges(const uint8_t* input, int width, int height, int stride, int* L, uint64_t* WE); + + +/// +/// Like previous but with +__global__ void mst_get_weak_edges_wpt(const uint8_t* input, int width, int height, int stride, int* L, uint64_t* WE); + + +/// +///@brief +/// +///@param input +///@param width +///@param height +///@param stride +///@param L +///@param gCount +///@param WE +///@return __global__ +__global__ void compress(const uint8_t* input, int width, int height, int stride, int* L, int* gCount, uint64_t* WE); + + +/// +///@brief +/// +///@param input +///@param width +///@param height +///@param stride +///@param L +///@param WE +///@return __global__ +__global__ void mst_merge_components(const uint8_t* input, int width, int height, int stride, int* L, uint8_t* MST, uint64_t* WE); diff --git a/src_mst/mst_launch.cu b/src_mst/mst_launch.cu new file mode 100644 index 0000000..076366b --- /dev/null +++ b/src_mst/mst_launch.cu @@ -0,0 +1,144 @@ +#include "mst.cuh" +#include <cstddef> +#include <fmt/core.h> +#include <unistd.h> +#include <memory> +#include <fmt/core.h> +#include <fmt/chrono.h> +#include <chrono> + +#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__) +template <typename T> +void check(T err, const char* const func, const char* const file, + const int line) +{ + if (err != cudaSuccess) + { + fmt::print(stderr, "CUDA Runtime Error at: {} : {}\n", file, line); + fmt::print(stderr, "{} {}\n", cudaGetErrorString(err), func); + // We don't exit when we encounter CUDA errors in this example. + // std::exit(EXIT_FAILURE); + } +} + +#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__) +void checkLast(const char* const file, const int line) +{ + cudaError_t err{cudaGetLastError()}; + if (err != cudaSuccess) + { + fmt::print(stderr, "CUDA Runtime Error at: {} : {}\n", file, line); + fmt::print(stderr, "{}\n", cudaGetErrorString(err)); + // We don't exit when we encounter CUDA errors in this example. + // std::exit(EXIT_FAILURE); + } +} + +__global__ void init_L(int* L, int width, int height, int pitch) +{ + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + if (x < width && y < height) + L[y * pitch + x] = y * pitch + x; +} + +template <class T> +void export_pgm(const T* ptr, int width, int height, int pitch) +{ + auto tmp = std::make_unique<T[]>((width+2) * (height+2)); + cudaMemcpy2D(tmp.get(), (width+2) * sizeof(T), ptr - pitch - 1, pitch * sizeof(T), (width+2) * sizeof(T), (height+2), cudaMemcpyDeviceToHost); + for (int y = 0; y < (height+2); ++y) { + for (int x = 0; x < (width+2); ++x) + fmt::print("{:04} ", tmp[y * (width+2) + x]); + fmt::print("\n"); + } + fmt::print("\n"); +} + +constexpr int iceil(int a, int b) { + return (a+b-1) / b; +} + +void mst(const uint8_t* input, int width, int height, std::ptrdiff_t stride) +{ + size_t pitch; + + uint8_t* in, *padded_in; + int* L, *padded_L; + uint8_t* MST, *padded_MST; + uint64_t* tmp, *padded_tmp; + + + int bwidth = width + 2; + int bheight = height + 2; + + CHECK_CUDA_ERROR(cudaMallocPitch(&padded_in, &pitch, bwidth, bheight)); + CHECK_CUDA_ERROR(cudaMallocPitch(&padded_MST, &pitch, bwidth, bheight)); + + int fullWidth = pitch; + CHECK_CUDA_ERROR(cudaMalloc(&padded_L, fullWidth * bheight * sizeof(int))); + CHECK_CUDA_ERROR(cudaMalloc(&padded_tmp, fullWidth * bheight * sizeof(uint64_t))); + + in = padded_in + pitch + 1; + L = padded_L + pitch + 1; + MST = padded_MST + pitch + 1; + tmp = padded_tmp + pitch + 1; + + CHECK_CUDA_ERROR(cudaMemset(padded_L, 0xFF, fullWidth * bheight * sizeof(int))); + CHECK_CUDA_ERROR(cudaMemcpy2D(in, pitch, input, stride, width, height, cudaMemcpyHostToDevice)); + + int* gCounter; + CHECK_CUDA_ERROR(cudaMalloc(&gCounter, sizeof(int))); + CHECK_CUDA_ERROR(cudaMemset(gCounter, 0, sizeof(int))); + + + dim3 bsize(BLOCK_WIDTH, BLOCK_HEIGHT); + dim3 gsize(iceil(width, bsize.x), iceil(height, bsize.y)); + + auto start = std::chrono::high_resolution_clock::now(); + + + init_L<<<gsize, bsize>>>(L, width, height, pitch); + //export_pgm(L, width, height, pitch); + CHECK_LAST_CUDA_ERROR(); + mst_init<<<gsize, bsize>>>(in, width, height, pitch, L, MST); + compress<<<gsize, bsize>>>(in, width, height, pitch, L, gCounter, tmp); + //export_pgm(L, width, height, pitch); + CHECK_LAST_CUDA_ERROR(); + + int counter; + cudaMemcpy(&counter, gCounter, sizeof(int), cudaMemcpyDeviceToHost); + fmt::print("Number of CC: {}\n", counter); + + while (counter != 1) + { + mst_get_weak_edges<<<gsize, bsize>>>(in, width, height, pitch, L, tmp); + CHECK_LAST_CUDA_ERROR(); + //{ + // dim3 gsize((width + GET_EDGES_BLOCK_DIM.x - 1) / GET_EDGES_BLOCK_DIM.x, (height + GET_EDGES_BLOCK_DIM.y - 1) / GET_EDGES_BLOCK_DIM.y); + // dim3 bsize(GET_EDGES_BLOCK_DIM.x, 1); + // mst_get_weak_edges_wpt<<<gsize, bsize>>>(in, width, height, pitch, L, tmp); + //} + //export_pgm(L, width, height, pitch); + CHECK_CUDA_ERROR(cudaMemset(gCounter, 0, sizeof(int))); + + CHECK_CUDA_ERROR(cudaMemset(gCounter, 0, sizeof(int))); + mst_merge_components<<<gsize, bsize>>>(in, width, height, pitch, L, MST, tmp); + CHECK_CUDA_ERROR(cudaMemset(gCounter, 0, sizeof(int))); + compress<<<gsize, bsize>>>(in, width, height, pitch, L, gCounter, tmp); + + cudaMemcpy(&counter, gCounter, sizeof(int), cudaMemcpyDeviceToHost); + fmt::print("Number of CC: {}\n", counter); + //sleep(1); + } + + auto end = std::chrono::high_resolution_clock::now(); + auto duration = end - start; + double pix_sec = double(width * height) / std::chrono::duration_cast<std::chrono::seconds>(duration).count(); + fmt::print("MST (kernel) computed in: {} ms ({} MPix/s). \n", duration, pix_sec / 1000000); + + cudaFree(padded_MST); + cudaFree(padded_in); + cudaFree(padded_L); + cudaFree(padded_MST); +} \ No newline at end of file -- GitLab From 7b2b3f145f51071b57894d2fbaf25cc10308bac2 Mon Sep 17 00:00:00 2001 From: E carlinet <edwin.carlinet@lrde.epita.fr> Date: Wed, 21 Sep 2022 12:12:12 +0200 Subject: [PATCH 2/3] Fix duration pb --- CMakePresets.json | 4 ++-- src_mst/mst_launch.cu | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 70a7c7c..83159ea 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -12,7 +12,7 @@ "CMAKE_CUDA_ARCHITECTURES" : "70-virtual;72-real" }, "environment": { - "CUDACXX": "/usr/local/cuda/bin/nvcc" + "CUDACXX": "nvcc" } }, { @@ -26,7 +26,7 @@ "CMAKE_CUDA_ARCHITECTURES" : "70-virtual;72-real" }, "environment": { - "CUDACXX": "/usr/local/cuda/bin/nvcc" + "CUDACXX": "nvcc" } } ] diff --git a/src_mst/mst_launch.cu b/src_mst/mst_launch.cu index 076366b..9f0184f 100644 --- a/src_mst/mst_launch.cu +++ b/src_mst/mst_launch.cu @@ -133,9 +133,9 @@ void mst(const uint8_t* input, int width, int height, std::ptrdiff_t stride) } auto end = std::chrono::high_resolution_clock::now(); - auto duration = end - start; - double pix_sec = double(width * height) / std::chrono::duration_cast<std::chrono::seconds>(duration).count(); - fmt::print("MST (kernel) computed in: {} ms ({} MPix/s). \n", duration, pix_sec / 1000000); + std::chrono::duration<double> duration = end - start; + double pix_sec = double(width * height) / duration.count(); + fmt::print("MST (kernel) computed in: {} ms ({} MPix/s). \n", duration.count() * 1000, pix_sec / 1000000); cudaFree(padded_MST); cudaFree(padded_in); -- GitLab From 6935ff0411a1069c3f92449dd8c4848245d38aac Mon Sep 17 00:00:00 2001 From: E carlinet <edwin.carlinet@lrde.epita.fr> Date: Wed, 21 Sep 2022 17:04:38 +0200 Subject: [PATCH 3/3] Fix data race --- src_mst/mst.cu | 21 ++++++--------------- src_mst/mst_launch.cu | 3 ++- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src_mst/mst.cu b/src_mst/mst.cu index 262981b..7ada598 100644 --- a/src_mst/mst.cu +++ b/src_mst/mst.cu @@ -57,14 +57,7 @@ __global__ void mst_init(const uint8_t* input, int width, int height, int stride int up = u - stride; int down = u + stride; - - // 1. Make-set - //if (x < width && y < height) - // L[u] = u; - // - //__syncthreads(); - - // 2. Select the weak edge and compute the CC + // 1. Select the weak edge and compute the CC if (x < width && y < height) { @@ -81,12 +74,6 @@ __global__ void mst_init(const uint8_t* input, int width, int height, int stride unite(L, v, u); // Unite MST[u] = e; // Set the edge "e" as part of the MST } - - __syncthreads(); - - // 3. compress - if (x < width && y < height) - L[u] = findroot(L, u); } @@ -99,12 +86,16 @@ __global__ void compress(const uint8_t* input, int width, int height, int stride if (x >= width || y >= height) return; - int q = (L[u] = findroot(L, u)); + int q = L[u]; if (u == q) { atomicAdd(gCount, 1); WE[u] = WEdge::zero(); } + else + { + L[u] = findroot(L, q); + } } template <int ITEMS_PER_THREAD, class T> diff --git a/src_mst/mst_launch.cu b/src_mst/mst_launch.cu index 9f0184f..f72435d 100644 --- a/src_mst/mst_launch.cu +++ b/src_mst/mst_launch.cu @@ -107,6 +107,7 @@ void mst(const uint8_t* input, int width, int height, std::ptrdiff_t stride) CHECK_LAST_CUDA_ERROR(); int counter; + cudaDeviceSynchronize(); cudaMemcpy(&counter, gCounter, sizeof(int), cudaMemcpyDeviceToHost); fmt::print("Number of CC: {}\n", counter); @@ -140,5 +141,5 @@ void mst(const uint8_t* input, int width, int height, std::ptrdiff_t stride) cudaFree(padded_MST); cudaFree(padded_in); cudaFree(padded_L); - cudaFree(padded_MST); + cudaFree(padded_tmp); } \ No newline at end of file -- GitLab