From 60d4b49cef5cf00dbafe99dcaee05478be8da729 Mon Sep 17 00:00:00 2001
From: Edwin Carlinet <edwin.carlinet@lrde.epita.fr>
Date: Wed, 21 Sep 2022 11:22:22 +0200
Subject: [PATCH 1/3] Add mst impl.

---
 CMakeLists.txt        |  35 ++---
 CMakePresets.json     |  33 ++++
 conanfile.txt         |   4 +
 include/mst.h         |   4 +
 src_mst/main.cpp      |  28 ++++
 src_mst/mst.cu        | 341 ++++++++++++++++++++++++++++++++++++++++++
 src_mst/mst.cuh       |  86 +++++++++++
 src_mst/mst_launch.cu | 144 ++++++++++++++++++
 8 files changed, 652 insertions(+), 23 deletions(-)
 create mode 100644 CMakePresets.json
 create mode 100644 include/mst.h
 create mode 100644 src_mst/main.cpp
 create mode 100644 src_mst/mst.cu
 create mode 100644 src_mst/mst.cuh
 create mode 100644 src_mst/mst_launch.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4b2fc03..4db1578 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,30 +1,10 @@
 cmake_minimum_required(VERSION 3.20)
 project(gpu_alpha_tree LANGUAGES CXX CUDA)
 
-include(FindCUDAToolkit)
+#include(FindCUDAToolkit)
 include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
 conan_basic_setup(TARGETS)
 
-include(FetchContent)
-FetchContent_Declare(
-        GoogleTest
-        URL https://github.com/google/googletest/archive/refs/tags/v1.10.x.tar.gz)
-FetchContent_Declare(
-        GoogleBenchmark
-        URL https://github.com/google/benchmark/archive/v1.4.1.tar.gz)
-FetchContent_Declare(
-        CLI11
-        URL https://github.com/CLIUtils/CLI11/archive/v1.8.0.tar.gz)
-
-if (NOT GoogleBenchmark_POPULATED)
-    FetchContent_Populate(GoogleBenchmark)
-    set(BENCHMARK_ENABLE_GTEST_TESTS OFF CACHE BOOL "From Gtest")
-    set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "From Gtest ")
-    add_subdirectory(${googlebenchmark_SOURCE_DIR} ${googlebenchmark_BINARY_DIR})
-endif ()
-
-FetchContent_MakeAvailable(CLI11)
-FetchContent_MakeAvailable(GoogleTest)
 enable_testing()
 
 find_package(OpenCV REQUIRED COMPONENTS core imgcodecs)
@@ -70,8 +50,17 @@ set_target_properties(gpu_alpha_tree PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON)
 
 
+add_library(gpu-mst src_mst/mst_launch.cu src_mst/mst.cu)
+target_compile_options(gpu-mst PRIVATE "--expt-relaxed-constexpr")
+target_link_libraries(gpu-mst PRIVATE CONAN_PKG::fmt)
+target_include_directories(gpu-mst PUBLIC include)
+
+
 add_executable(main src/main.cpp)
-target_link_libraries(main PRIVATE gpu_alpha_tree CLI11::CLI11 ${OpenCV_LIBS})
+target_link_libraries(main PRIVATE gpu_alpha_tree CONAN_PKG::cli11 ${OpenCV_LIBS})
+
+add_executable(mst-bin src_mst/main.cpp)
+target_link_libraries(mst-bin PRIVATE gpu-mst CONAN_PKG::cli11 ${OpenCV_LIBS} CONAN_PKG::fmt)
 
 add_executable(bench ${BENCHMARKS})
 target_link_libraries(bench PRIVATE gpu_alpha_tree benchmark ${OpenCV_LIBS} boost_thread)
@@ -79,4 +68,4 @@ target_include_directories(bench PRIVATE ${CONAN_INCLUDE_DIRS_BOOST})
 target_link_directories(bench PRIVATE ${CONAN_LIB_DIRS_BOOST})
 
 add_executable(tests ${TESTS})
-target_link_libraries(tests PRIVATE gpu_alpha_tree gtest_main)
\ No newline at end of file
+target_link_libraries(tests PRIVATE gpu_alpha_tree CONAN_PKG::gtest)
\ No newline at end of file
diff --git a/CMakePresets.json b/CMakePresets.json
new file mode 100644
index 0000000..70a7c7c
--- /dev/null
+++ b/CMakePresets.json
@@ -0,0 +1,33 @@
+{
+    "version": 2,
+    "configurePresets": [
+        {
+            "name": "debug",
+            "description": "Sets Ninja generator, build and install directory",
+            "generator": "Ninja",
+            "binaryDir": "${sourceDir}/build",
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "Debug",
+                "CMAKE_INSTALL_PREFIX": "${sourceDir}/build/install/${presetName}",
+                "CMAKE_CUDA_ARCHITECTURES" : "70-virtual;72-real"
+            },
+            "environment": {
+                "CUDACXX": "/usr/local/cuda/bin/nvcc"
+            }
+        },
+        {
+            "name": "release",
+            "description": "Sets Ninja generator, build and install directory",
+            "generator": "Ninja",
+            "binaryDir": "${sourceDir}/build",
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "Release",
+                "CMAKE_INSTALL_PREFIX": "${sourceDir}/build/install/${presetName}",
+                "CMAKE_CUDA_ARCHITECTURES" : "70-virtual;72-real"
+            },
+            "environment": {
+                "CUDACXX": "/usr/local/cuda/bin/nvcc"
+            }
+        }
+    ]
+}
diff --git a/conanfile.txt b/conanfile.txt
index e18c50b..b1f1739 100644
--- a/conanfile.txt
+++ b/conanfile.txt
@@ -1,5 +1,9 @@
 [requires]
 boost/1.71.0
+gtest/[>=1.10]
+benchmark/[>=1.5]
+cli11/[>=2.0]
+fmt/[>=7]
 
 [generators]
 cmake
\ No newline at end of file
diff --git a/include/mst.h b/include/mst.h
new file mode 100644
index 0000000..a314101
--- /dev/null
+++ b/include/mst.h
@@ -0,0 +1,4 @@
+#pragma once
+#include <cstdint>
+
+void mst(const uint8_t* input, int width, int height, std::ptrdiff_t stride);
\ No newline at end of file
diff --git a/src_mst/main.cpp b/src_mst/main.cpp
new file mode 100644
index 0000000..c8a6dcb
--- /dev/null
+++ b/src_mst/main.cpp
@@ -0,0 +1,28 @@
+#include "mst.h"
+
+#include <CLI/CLI.hpp>
+#include <opencv4/opencv2/core.hpp>
+#include <opencv4/opencv2/imgcodecs.hpp>
+#include <cstdio>
+#include <fmt/core.h>
+
+int main(int argc, char** argv)
+{
+    std::string image_path;
+
+    CLI::App app("gpu_alpha_tree");
+    app.add_option("-i", image_path, "Input image path")->required()->check(CLI::ExistingFile);
+
+    CLI11_PARSE(app, argc, argv);
+
+    auto image = cv::imread(image_path, cv::IMREAD_ANYCOLOR);
+    if (!((image.channels() == 1) && (image.depth() == CV_8U)))
+    {
+        fmt::print(stderr, "Input image must be 8-bit grayscale ({} channels provided).\n", image.channels());
+        std::exit(1);
+    }
+
+
+    mst(image.data, image.rows, image.cols, image.rows * sizeof(uint8_t));
+    return 0;
+}
\ No newline at end of file
diff --git a/src_mst/mst.cu b/src_mst/mst.cu
new file mode 100644
index 0000000..262981b
--- /dev/null
+++ b/src_mst/mst.cu
@@ -0,0 +1,341 @@
+#include "mst.cuh"
+#include <cstdint>
+#include <cstddef>
+#include <utility>
+#include <cassert>
+#include <cstdio>
+#include <algorithm>
+
+#include <cuda/barrier>
+#include <cuda/annotated_ptr>
+#include <cooperative_groups.h>
+
+namespace cg = cooperative_groups;
+
+template <class T>
+__device__ void swap(T& a, T& b) { T tmp = b; b = a; a = tmp;}
+
+
+///
+///@brief Make union between two components (u,v)
+///
+///@param L 
+///@param u 
+///@param v 
+///@return Return the new root of the component
+__device__ int unite(int* L, int u, int v)
+{
+    while (1)
+    {
+        if (u > v)
+            swap(u,v);
+        if (u == v)
+            return u;
+        int old = atomicMin(&L[v], u); 
+        if (old == v)
+            return u;
+        v = old;
+    }    
+}
+
+__device__ int findroot(int* L, int x)
+{
+    int q;
+    while ((q = L[x]) != x)
+        x = q;
+    return q;
+}
+
+
+__global__ void mst_init(const uint8_t* input, int width, int height, int stride, int* L, uint8_t* MST)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int u = y * stride + x;
+    int left = u - 1;
+    int right = u + 1;
+    int up   = u - stride;
+    int down = u + stride;
+
+    
+    // 1. Make-set
+    //if (x < width && y < height)
+    //    L[u] = u;
+    //
+    //__syncthreads();
+
+    // 2. Select the weak edge and compute the CC
+
+    if (x < width && y < height)
+    {
+        int m = INT_MAX;
+        int M; 
+        int v;
+        int e;
+        if (y > 0          && (M = std::abs(input[u] - input[up])) < m)    { e = NORTH; v = up; m = M; }
+        if (x > 0          && (M = std::abs(input[u] - input[left])) < m)  { e = WEST; v = left; m = M; }
+        if (x < (width-1)  && (M = std::abs(input[u] - input[right])) < m) { e = EAST; v = right; m = M; }
+        if (y < (height-1) && (M = std::abs(input[u] - input[down])) < m)  { e = SOUTH; v = down; m = M; }
+
+
+        unite(L, v, u); // Unite
+        MST[u] = e;     // Set the edge "e" as part of the MST
+    }
+
+    __syncthreads();
+
+    // 3. compress
+    if (x < width && y < height)
+        L[u] = findroot(L, u);
+}
+
+
+__global__ void compress(const uint8_t* input, int width, int height, int stride, int* L, int* gCount, uint64_t* WE)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int u = y * stride + x;
+
+    if (x >= width || y >= height)
+        return;
+
+    int q = (L[u] = findroot(L, u));
+    if (u == q)
+    {
+        atomicAdd(gCount, 1);
+        WE[u] = WEdge::zero();
+    }
+}
+
+template <int ITEMS_PER_THREAD, class T>
+__forceinline__ __device__ void loadbuffer(T* dst, const T* src, int size)
+{
+    for (int k = 0; k < ITEMS_PER_THREAD; ++k)
+    {
+        int i = blockDim.x * k + threadIdx.x;
+        if (i < size)
+            dst[i] = src[i];
+    }
+}
+
+
+template <class T, class Barrier>
+__forceinline__ __device__ void loadbuffer_async(
+    const cg::thread_block& g,
+    T* dst,
+    const T* src,
+    int size,
+    Barrier& barrier)
+{
+    cuda::memcpy_async(g, 
+    cuda::annotated_ptr<T, cuda::access_property::shared>(dst), 
+    cuda::annotated_ptr<T, cuda::access_property::streaming>((T*)src), size * sizeof(T), barrier);
+}
+
+
+
+
+__global__ void mst_get_weak_edges(const uint8_t* input, int width, int height, int stride, int* L, uint64_t* WE)
+{
+    //using input_load_t = cub::BlockLoad<uint8_t, BLOCK_WIDTH, 2, BLOCK_LOAD_DIRECT, BLOCK_HEIGHT>;
+    //using L_load_t = cub::BlockLoad<int, BLOCK_WIDTH, 2, BLOCK_LOAD_DIRECT, BLOCK_HEIGHT>;
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int u = y * stride + x;
+
+    assert(BLOCK_HEIGHT == blockDim.y);
+    assert(BLOCK_WIDTH == blockDim.x);
+
+
+    __shared__ uint8_t s_input_data[BLOCK_HEIGHT + 2][BLOCK_WIDTH + 2];
+    __shared__ int s_L_data[BLOCK_HEIGHT + 2][BLOCK_WIDTH + 2];
+
+    static_assert(sizeof(s_input_data) == (BLOCK_HEIGHT + 2)*(BLOCK_WIDTH + 2));
+
+    // Compute the bounds of the regions to copy (with the 1-pixel border)
+    {
+        int x0 = blockIdx.x * BLOCK_WIDTH - 1;
+        int y0 = int(blockIdx.y * blockDim.y) - 1;
+        int x1 = std::min(width,  (int)((blockIdx.x + 1) * BLOCK_WIDTH));
+        int y1 = std::min(height, (int)((blockIdx.y + 1) * blockDim.y));
+        int w = x1 - x0 + 1;
+        int h = y1 - y0 + 1;
+
+        if (threadIdx.y < h)
+        {
+            loadbuffer<2>(s_input_data[threadIdx.y], input + (y0 + (int)threadIdx.y) * stride + x0, w);
+            loadbuffer<2>(s_L_data[threadIdx.y], L + (y0 + (int)threadIdx.y) * stride + x0, w);
+        }
+        int ty2 = threadIdx.y + blockDim.y;
+        if (ty2 < h)
+        {
+            loadbuffer<2>(s_input_data[ty2], input + (y0 + ty2) * stride + x0, w);
+            loadbuffer<2>(s_L_data[ty2], L + (y0 + ty2) * stride + x0, w);
+        }
+    }
+    __syncthreads();
+
+    const uint8_t* s_input = &(s_input_data[1][1]);
+    int* s_L = &(s_L_data[1][1]);
+
+    // 1. Select the weak outer-edge of the current pixel the CC
+    if (x < width && y < height)
+    {
+        int i = threadIdx.y * (BLOCK_WIDTH + 2) + threadIdx.x;
+        int left = i - 1;
+        int right = i + 1;
+        int up   = i - (BLOCK_WIDTH + 2);
+        int down = i + (BLOCK_WIDTH + 2);
+        int m = INT_MAX;
+        int M;
+        Direction e;
+        int q = s_L[i];
+        int v = s_input[i];
+        if ((s_L[up]    >= 0) && (s_L[up]    != q) && (M = std::abs(v - s_input[up])) < m)    { e = NORTH; m = M; }
+        if ((s_L[left]  >= 0) && (s_L[left]  != q) && (M = std::abs(v - s_input[left])) < m)  { e = WEST;  m = M; }
+        if ((s_L[right] >= 0) && (s_L[right] != q) && (M = std::abs(v - s_input[right])) < m) { e = EAST;  m = M; }
+        if ((s_L[down]  >= 0) && (s_L[down]  != q) && (M = std::abs(v - s_input[down])) < m)  { e = SOUTH; m = M; }
+
+        //L[u] = q;
+        if (m == INT_MAX)
+            return;
+
+        // 2. Reduce -> For-each CC get the current weakest-edge
+        atomicMin((unsigned long long*)&WE[q], WEdge::pack(u, e, m));
+    }
+}
+
+template <class T, size_t n>
+__device__ __forceinline__ void roll(T* (&tab)[n])
+{
+    auto tmp = tab[0];
+    for (size_t i = 0; i < (n-1); ++i)
+        tab[i] = tab[i+1];
+    tab[n-1] = tmp;
+}
+
+
+
+__global__ void mst_get_weak_edges_wpt(const uint8_t* input, int width, int height, int stride, int* L, uint64_t* WE)
+{
+    constexpr int BLOCK_WIDTH = GET_EDGES_BLOCK_DIM.x;
+    constexpr int BLOCK_HEIGHT = GET_EDGES_BLOCK_DIM.y;
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int x0 = int(blockIdx.x * BLOCK_WIDTH) - 1;
+    const int y0 = int(blockIdx.y * BLOCK_HEIGHT) - 1;
+    const int x1 = std::min(width,  (int)((blockIdx.x + 1) * blockDim.x));
+    const int y1 = std::min(height, (int)((blockIdx.y + 1) * BLOCK_HEIGHT));
+    const int w = x1 - x0 + 1;
+    const int h = y1 - y0 + 1;
+    auto g = cg::this_thread_block();
+
+    __shared__ uint8_t s_input_data[4][BLOCK_WIDTH + 2];
+    __shared__ int s_L_data[4][BLOCK_WIDTH + 2];
+
+    __shared__ uint8_t* s_input[4];
+    __shared__ int*     s_L[4];
+
+
+    if (threadIdx.x == 0)
+    {
+        s_input[0] = s_input_data[0];
+        s_input[1] = s_input_data[1];
+        s_input[2] = s_input_data[2];
+        s_input[3] = s_input_data[3];
+        s_L[0] = s_L_data[0];
+        s_L[1] = s_L_data[1];
+        s_L[2] = s_L_data[2];
+        s_L[3] = s_L_data[3];
+    }
+
+    auto swap_buffer_on_completion = []() { roll(s_L); roll(s_input); };
+    __shared__ cuda::barrier<cuda::thread_scope_block> bar;
+    init(&bar, BLOCK_WIDTH);
+
+    __syncthreads();
+
+    // Load border and the first line
+    {
+        loadbuffer_async(g, s_input[1], input + y0 * stride + x0, w, bar);
+        loadbuffer_async(g, s_L[1], L + y0 * stride + x0, w, bar);
+        loadbuffer_async(g, s_input[2], input + (y0 + 1) * stride + x0, w, bar);
+        loadbuffer_async(g, s_L[2], L + (y0 + 1) * stride + x0, w, bar);
+        loadbuffer_async(g, s_input[3], input + (y0 + 2) * stride + x0, w, bar);
+        loadbuffer_async(g, s_L[3], L + (y0 + 2) * stride + x0, w, bar);
+    }
+
+    // 1. Select the weak outer-edge of the current pixel the CC
+    assert((y0 + 1) < y1);
+    for (int y = y0 + 1; y < y1; ++y)
+    {
+        bar.arrive_and_wait();
+        if (threadIdx.x == 0)
+            swap_buffer_on_completion();
+        __syncthreads();
+
+        // prefetch the line N+2
+        if ((y+1) < y1) {
+            loadbuffer_async(g, s_input[3], input + (y+2) * stride + x0, w, bar);
+            loadbuffer_async(g, s_L[3], L + (y+2) * stride + x0, w, bar);
+        }
+
+        if (x < width)
+        {
+            int left = threadIdx.x;
+            int i = threadIdx.x + 1;
+            int right = threadIdx.x + 2;
+            int m = INT_MAX;
+            int M;
+            Direction e;
+            int q = s_L[1][i];
+            int v = s_input[1][i];
+            if ((s_L[0][i]    >= 0)  & (s_L[0][i]     != q) & (M = std::abs(v - s_input[0][i])) < m)      { e = NORTH; m = M; }
+            if ((s_L[1][left]  >= 0) & (s_L[1][left]  != q) & (M = std::abs(v - s_input[1][left])) < m)   { e = WEST;  m = M; }
+            if ((s_L[1][right] >= 0) & (s_L[1][right] != q) & (M = std::abs(v - s_input[1][right])) < m)  { e = EAST;  m = M; }
+            if ((s_L[2][i]  >= 0)    & (s_L[2][i]     != q) & (M = std::abs(v - s_input[2][i])) < m)      { e = SOUTH; m = M; }
+
+            const int u = y * stride + x;
+            //L[u] = q;
+
+            // 2. Reduce -> For-each CC get the current weakest-edge
+            if (m != INT_MAX)
+                atomicMin((unsigned long long*)&WE[q], WEdge::pack(u, e, m));
+        }
+    } 
+}
+
+
+__global__ void mst_merge_components(const uint8_t* input, int width, int height, int stride, int* L, uint8_t* MST, uint64_t* WE)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int u = y * stride + x;
+
+    if (x >= width || y >= height)
+        return;
+
+    // Only the root has work
+    if (u != L[u])
+        return;
+
+    assert(WE[u] != WEdge::zero());
+    auto e = WEdge::unpack(WE[u]);
+
+    int offset = 0;
+    switch (e.dir) {
+        case NORTH: offset = -stride; break;
+        case SOUTH: offset = +stride; break;
+        case WEST: offset = -1; break;
+        case EAST: offset = +1; break;
+    }
+
+    int root = u;
+    u = e.origin;
+    int v = u + offset;
+    //printf("CC: %i edge %i -> %i\n", root, u, v);
+    unite(L, root, L[v]); // Unite
+    MST[u] = MST[u] | e.dir; // Set the edge "e" as part of the MST (uniqueness of the writer, the root representative thread)
+}
+
diff --git a/src_mst/mst.cuh b/src_mst/mst.cuh
new file mode 100644
index 0000000..00bb2af
--- /dev/null
+++ b/src_mst/mst.cuh
@@ -0,0 +1,86 @@
+#pragma once
+#include <cstdint>
+
+constexpr int BLOCK_WIDTH = 32;
+constexpr int BLOCK_HEIGHT = 8;
+
+constexpr dim3 GET_EDGES_BLOCK_DIM = dim3(128, 8);
+
+enum Direction : uint8_t {
+    NORTH = 1,
+    WEST= 2,
+    EAST = 4,
+    SOUTH = 8,
+};
+
+struct WEdge {
+    int        origin;
+    Direction  dir;
+    uint8_t    weight;
+
+    // Pack into:
+    // weight: 16
+    // direction: 16
+    // origin: 32
+    __device__ static inline constexpr uint64_t pack(uint64_t origin, Direction dir, uint64_t weight) {
+        return (weight << 48) | ((uint64_t) dir << 32) | origin; 
+    }
+
+    __device__ static inline constexpr WEdge unpack(uint64_t value) {
+        WEdge e = {
+          .origin = (int)(value & 0xFFFFFFFF),
+          .dir = (Direction)((value >> 32) & 0xFF),
+          .weight = (uint8_t)(value >> 48),
+        };
+        return e;
+    }
+
+    __device__ static inline constexpr uint64_t zero() { return  UINT64_MAX; }
+};
+
+
+__global__ void mst_init(const uint8_t* input, int width, int height, int stride, int* L, uint8_t* MST);
+
+///
+///@brief Get the weak edges between hyper-vertices
+///
+///@param input The input image
+///@param width 
+///@param height 
+///@param stride 
+///@param L    Label image for the connected component labeling
+///@param WE   Store the weak edge for each component in WE[L] where L is the label of the component
+///@return __global__ 
+__global__ void mst_get_weak_edges(const uint8_t* input, int width, int height, int stride, int* L, uint64_t* WE);
+
+
+///
+/// Like previous but with 
+__global__ void mst_get_weak_edges_wpt(const uint8_t* input, int width, int height, int stride, int* L, uint64_t* WE);
+
+
+///
+///@brief 
+///
+///@param input 
+///@param width 
+///@param height 
+///@param stride 
+///@param L 
+///@param gCount 
+///@param WE 
+///@return __global__ 
+__global__ void compress(const uint8_t* input, int width, int height, int stride, int* L, int* gCount, uint64_t* WE);
+
+
+///
+///@brief 
+///
+///@param input 
+///@param width 
+///@param height 
+///@param stride 
+///@param L 
+///@param WE 
+///@return __global__ 
+__global__ void mst_merge_components(const uint8_t* input, int width, int height, int stride, int* L, uint8_t* MST, uint64_t* WE);
diff --git a/src_mst/mst_launch.cu b/src_mst/mst_launch.cu
new file mode 100644
index 0000000..076366b
--- /dev/null
+++ b/src_mst/mst_launch.cu
@@ -0,0 +1,144 @@
+#include "mst.cuh"
+#include <cstddef>
+#include <fmt/core.h>
+#include <unistd.h>
+#include <memory>
+#include <fmt/core.h>
+#include <fmt/chrono.h>
+#include <chrono>
+
+#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
+template <typename T>
+void check(T err, const char* const func, const char* const file,
+           const int line)
+{
+    if (err != cudaSuccess)
+    {
+        fmt::print(stderr, "CUDA Runtime Error at: {} : {}\n", file, line);
+        fmt::print(stderr, "{} {}\n", cudaGetErrorString(err), func);
+        // We don't exit when we encounter CUDA errors in this example.
+        // std::exit(EXIT_FAILURE);
+    }
+}
+
+#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
+void checkLast(const char* const file, const int line)
+{
+    cudaError_t err{cudaGetLastError()};
+    if (err != cudaSuccess)
+    {
+        fmt::print(stderr, "CUDA Runtime Error at: {} : {}\n", file, line);
+        fmt::print(stderr, "{}\n", cudaGetErrorString(err));
+        // We don't exit when we encounter CUDA errors in this example.
+        // std::exit(EXIT_FAILURE);
+    }
+}
+
+__global__ void init_L(int* L, int width, int height, int pitch)
+{
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
+    if (x < width && y < height)
+        L[y * pitch + x] = y * pitch + x;
+}
+
+template <class T>
+void export_pgm(const T* ptr, int width, int height, int pitch)
+{
+    auto tmp =  std::make_unique<T[]>((width+2) * (height+2));
+    cudaMemcpy2D(tmp.get(), (width+2) * sizeof(T), ptr - pitch - 1, pitch * sizeof(T), (width+2) * sizeof(T), (height+2), cudaMemcpyDeviceToHost);
+    for (int y = 0; y < (height+2); ++y) {
+        for (int x = 0; x < (width+2); ++x)
+            fmt::print("{:04} ", tmp[y * (width+2) + x]);
+        fmt::print("\n");
+    }
+    fmt::print("\n");
+}
+
+constexpr int iceil(int a, int b) {
+    return (a+b-1) / b;
+}
+
+void mst(const uint8_t* input, int width, int height, std::ptrdiff_t stride)
+{
+    size_t pitch;
+
+    uint8_t* in, *padded_in;
+    int* L, *padded_L;
+    uint8_t* MST, *padded_MST;
+    uint64_t* tmp, *padded_tmp;
+    
+
+    int bwidth = width + 2;
+    int bheight = height + 2;
+    
+    CHECK_CUDA_ERROR(cudaMallocPitch(&padded_in, &pitch, bwidth, bheight));
+    CHECK_CUDA_ERROR(cudaMallocPitch(&padded_MST, &pitch, bwidth, bheight));
+
+    int fullWidth = pitch;
+    CHECK_CUDA_ERROR(cudaMalloc(&padded_L, fullWidth * bheight * sizeof(int)));
+    CHECK_CUDA_ERROR(cudaMalloc(&padded_tmp, fullWidth * bheight * sizeof(uint64_t)));
+
+    in = padded_in + pitch + 1;
+    L = padded_L + pitch + 1;
+    MST = padded_MST + pitch + 1;
+    tmp = padded_tmp + pitch + 1;
+
+    CHECK_CUDA_ERROR(cudaMemset(padded_L, 0xFF, fullWidth * bheight * sizeof(int)));
+    CHECK_CUDA_ERROR(cudaMemcpy2D(in, pitch, input, stride, width, height, cudaMemcpyHostToDevice));
+
+    int* gCounter;
+    CHECK_CUDA_ERROR(cudaMalloc(&gCounter, sizeof(int)));
+    CHECK_CUDA_ERROR(cudaMemset(gCounter, 0, sizeof(int)));
+
+
+    dim3 bsize(BLOCK_WIDTH, BLOCK_HEIGHT);
+    dim3 gsize(iceil(width, bsize.x), iceil(height, bsize.y));
+
+    auto start = std::chrono::high_resolution_clock::now();
+
+
+    init_L<<<gsize, bsize>>>(L, width, height, pitch);
+    //export_pgm(L, width, height, pitch);
+    CHECK_LAST_CUDA_ERROR();
+    mst_init<<<gsize, bsize>>>(in, width, height, pitch, L, MST);
+    compress<<<gsize, bsize>>>(in, width, height, pitch, L, gCounter, tmp);
+    //export_pgm(L, width, height, pitch);
+    CHECK_LAST_CUDA_ERROR();
+
+    int counter;
+    cudaMemcpy(&counter, gCounter, sizeof(int), cudaMemcpyDeviceToHost);
+    fmt::print("Number of CC: {}\n", counter);
+
+    while (counter != 1)
+    {
+        mst_get_weak_edges<<<gsize, bsize>>>(in, width, height, pitch, L, tmp);
+        CHECK_LAST_CUDA_ERROR();
+        //{
+        //    dim3 gsize((width + GET_EDGES_BLOCK_DIM.x - 1) / GET_EDGES_BLOCK_DIM.x, (height + GET_EDGES_BLOCK_DIM.y - 1) / GET_EDGES_BLOCK_DIM.y);
+        //    dim3 bsize(GET_EDGES_BLOCK_DIM.x, 1);
+        //    mst_get_weak_edges_wpt<<<gsize, bsize>>>(in, width, height, pitch, L, tmp);
+        //}
+        //export_pgm(L, width, height, pitch);
+        CHECK_CUDA_ERROR(cudaMemset(gCounter, 0, sizeof(int)));
+
+        CHECK_CUDA_ERROR(cudaMemset(gCounter, 0, sizeof(int)));
+        mst_merge_components<<<gsize, bsize>>>(in, width, height, pitch, L, MST, tmp);
+        CHECK_CUDA_ERROR(cudaMemset(gCounter, 0, sizeof(int)));
+        compress<<<gsize, bsize>>>(in, width, height, pitch, L, gCounter, tmp);
+
+        cudaMemcpy(&counter, gCounter, sizeof(int), cudaMemcpyDeviceToHost);
+        fmt::print("Number of CC: {}\n", counter);
+        //sleep(1);
+    }
+
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = end - start;
+    double pix_sec = double(width * height) / std::chrono::duration_cast<std::chrono::seconds>(duration).count();
+    fmt::print("MST (kernel) computed in: {} ms ({} MPix/s). \n", duration, pix_sec / 1000000);
+
+    cudaFree(padded_MST);
+    cudaFree(padded_in);
+    cudaFree(padded_L);
+    cudaFree(padded_MST);
+}
\ No newline at end of file
-- 
GitLab


From 7b2b3f145f51071b57894d2fbaf25cc10308bac2 Mon Sep 17 00:00:00 2001
From: E carlinet <edwin.carlinet@lrde.epita.fr>
Date: Wed, 21 Sep 2022 12:12:12 +0200
Subject: [PATCH 2/3] Fix duration pb

---
 CMakePresets.json     | 4 ++--
 src_mst/mst_launch.cu | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index 70a7c7c..83159ea 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -12,7 +12,7 @@
                 "CMAKE_CUDA_ARCHITECTURES" : "70-virtual;72-real"
             },
             "environment": {
-                "CUDACXX": "/usr/local/cuda/bin/nvcc"
+                "CUDACXX": "nvcc"
             }
         },
         {
@@ -26,7 +26,7 @@
                 "CMAKE_CUDA_ARCHITECTURES" : "70-virtual;72-real"
             },
             "environment": {
-                "CUDACXX": "/usr/local/cuda/bin/nvcc"
+                "CUDACXX": "nvcc"
             }
         }
     ]
diff --git a/src_mst/mst_launch.cu b/src_mst/mst_launch.cu
index 076366b..9f0184f 100644
--- a/src_mst/mst_launch.cu
+++ b/src_mst/mst_launch.cu
@@ -133,9 +133,9 @@ void mst(const uint8_t* input, int width, int height, std::ptrdiff_t stride)
     }
 
     auto end = std::chrono::high_resolution_clock::now();
-    auto duration = end - start;
-    double pix_sec = double(width * height) / std::chrono::duration_cast<std::chrono::seconds>(duration).count();
-    fmt::print("MST (kernel) computed in: {} ms ({} MPix/s). \n", duration, pix_sec / 1000000);
+    std::chrono::duration<double> duration = end - start;
+    double pix_sec = double(width * height) / duration.count();
+    fmt::print("MST (kernel) computed in: {} ms ({} MPix/s). \n", duration.count() * 1000, pix_sec / 1000000);
 
     cudaFree(padded_MST);
     cudaFree(padded_in);
-- 
GitLab


From 6935ff0411a1069c3f92449dd8c4848245d38aac Mon Sep 17 00:00:00 2001
From: E carlinet <edwin.carlinet@lrde.epita.fr>
Date: Wed, 21 Sep 2022 17:04:38 +0200
Subject: [PATCH 3/3] Fix data race

---
 src_mst/mst.cu        | 21 ++++++---------------
 src_mst/mst_launch.cu |  3 ++-
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/src_mst/mst.cu b/src_mst/mst.cu
index 262981b..7ada598 100644
--- a/src_mst/mst.cu
+++ b/src_mst/mst.cu
@@ -57,14 +57,7 @@ __global__ void mst_init(const uint8_t* input, int width, int height, int stride
     int up   = u - stride;
     int down = u + stride;
 
-    
-    // 1. Make-set
-    //if (x < width && y < height)
-    //    L[u] = u;
-    //
-    //__syncthreads();
-
-    // 2. Select the weak edge and compute the CC
+    // 1. Select the weak edge and compute the CC
 
     if (x < width && y < height)
     {
@@ -81,12 +74,6 @@ __global__ void mst_init(const uint8_t* input, int width, int height, int stride
         unite(L, v, u); // Unite
         MST[u] = e;     // Set the edge "e" as part of the MST
     }
-
-    __syncthreads();
-
-    // 3. compress
-    if (x < width && y < height)
-        L[u] = findroot(L, u);
 }
 
 
@@ -99,12 +86,16 @@ __global__ void compress(const uint8_t* input, int width, int height, int stride
     if (x >= width || y >= height)
         return;
 
-    int q = (L[u] = findroot(L, u));
+    int q = L[u];
     if (u == q)
     {
         atomicAdd(gCount, 1);
         WE[u] = WEdge::zero();
     }
+    else
+    {
+        L[u] = findroot(L, q);
+    }
 }
 
 template <int ITEMS_PER_THREAD, class T>
diff --git a/src_mst/mst_launch.cu b/src_mst/mst_launch.cu
index 9f0184f..f72435d 100644
--- a/src_mst/mst_launch.cu
+++ b/src_mst/mst_launch.cu
@@ -107,6 +107,7 @@ void mst(const uint8_t* input, int width, int height, std::ptrdiff_t stride)
     CHECK_LAST_CUDA_ERROR();
 
     int counter;
+    cudaDeviceSynchronize();
     cudaMemcpy(&counter, gCounter, sizeof(int), cudaMemcpyDeviceToHost);
     fmt::print("Number of CC: {}\n", counter);
 
@@ -140,5 +141,5 @@ void mst(const uint8_t* input, int width, int height, std::ptrdiff_t stride)
     cudaFree(padded_MST);
     cudaFree(padded_in);
     cudaFree(padded_L);
-    cudaFree(padded_MST);
+    cudaFree(padded_tmp);
 }
\ No newline at end of file
-- 
GitLab