Commit 063c97ac authored by Edwin Carlinet's avatar Edwin Carlinet
Browse files

Fix parallel dilation.

* Fix data race in worker copy
* Fix tile ROI problems
parent ef08025f
......@@ -160,8 +160,8 @@ distcheck-linux-coverage:
<<: *distcheck-linux-base
image: ${FEDORA_32}
after_script:
- mkdir coverage
- gcovr --root pylene build -s --xml cobertura --html-details coverage/index.html
- mkdir coverage
- cd build && gcovr --root .. --filter '\.\./pylene' . -s --xml ../cobertura --html-details ../coverage/index.html
variables:
PYLENE_CONFIGURATION: "Debug"
CXX: "g++"
......
#include <benchmark/benchmark.h>
#include <mln/bp/transpose.hpp>
#include <mln/bp/alloc.hpp>
/// Benchmark fixture that owns two square 2D buffers of ``T``: an input buffer
/// filled with a running index and an output buffer used by the out-of-place
/// benchmarks. Both are allocated with ``mln::bp::aligned_alloc_2d`` and released
/// with ``mln::bp::aligned_free_2d``.
template <class T>
class BMPrimitives : public benchmark::Fixture
{
public:
  /// Allocates both 256 x 256 buffers and writes ``row * 256 + col`` (converted
  /// to ``T``) into every cell of the input buffer.
  BMPrimitives()
  {
    m_dim    = 256;
    // Both allocations write the pitch into m_stride; the second call overwrites the first.
    // NOTE(review): presumably identical since both requests have the same geometry - confirm.
    m_buffer = static_cast<T*>(mln::bp::aligned_alloc_2d(m_dim, m_dim, sizeof(T), m_stride));
    m_out    = static_cast<T*>(mln::bp::aligned_alloc_2d(m_dim, m_dim, sizeof(T), m_stride));

    for (int row = 0; row < m_dim; ++row)
    {
      T* const  line  = mln::bp::ptr_offset(m_buffer, row * m_stride);
      const int first = row * m_dim;
      for (int col = 0; col < m_dim; ++col)
        line[col] = first + col;
    }
  }

  /// Releases the two buffers allocated by the constructor.
  ~BMPrimitives() override
  {
    mln::bp::aligned_free_2d(m_buffer);
    mln::bp::aligned_free_2d(m_out);
  }

  /// Reports to the benchmark state the bytes touched by all iterations of a
  /// ``dim x dim`` run (``sizeof(T)`` bytes per cell).
  void SetBytesProcessed(benchmark::State& st, int dim) const
  {
    const auto total_bytes = sizeof(T) * st.iterations() * dim * dim;
    st.SetBytesProcessed(total_bytes);
  }

protected:
  T*             m_buffer; ///< Input buffer (also the target of the in-place benchmarks)
  T*             m_out;    ///< Output buffer for the out-of-place benchmark
  int            m_dim;    ///< Side length (in elements) of the allocated buffers
  std::ptrdiff_t m_stride; ///< Pitch written by ``aligned_alloc_2d`` (passed as a byte stride)
};
/// Reference implementation against which the optimized transpositions are benchmarked.
namespace baseline
{
/// Forwards to ``mln::bp::impl::transpose_inplace_naive`` for an ``n x n`` matrix.
/// ``gnu::noinline`` keeps the call out-of-line in the benchmark loop.
template <class T>
[[gnu::noinline]]
void transpose(T* matrix, int n, std::ptrdiff_t stride)
{
mln::bp::impl::transpose_inplace_naive(matrix, n, stride);
}
}
/// Pixel made of three ``uint8_t`` channels, used as a non-power-of-two sized
/// element type in the benchmarks.
struct rgb8
{
  /// Leaves the channels uninitialized.
  rgb8() = default;

  /// Broadcasts \p x, truncated to 8 bits, to the three channels.
  /// Deliberately implicit: the fixture fills its buffer by assigning ``int`` values.
  rgb8(int x)
  {
    const auto v = static_cast<uint8_t>(x);
    r = v;
    g = v;
    b = v;
  }

  /// Channel-wise equality.
  bool operator==(rgb8 other) const { return r == other.r && g == other.g && b == other.b; }

  /// Converts to the value of the red channel.
  operator int() const { return r; }

  uint8_t r, g, b;
};
/// Pixel made of four ``uint8_t`` channels, used as a 4-byte element type in the
/// benchmarks.
struct rgba8
{
  /// Broadcasts \p x, truncated to 8 bits, to the four channels.
  /// Deliberately implicit: the fixture fills its buffer by assigning ``int`` values.
  rgba8(int x)
  {
    const auto v = static_cast<uint8_t>(x);
    r = v;
    g = v;
    b = v;
    a = v;
  }

  /// Channel-wise equality.
  bool operator==(rgba8 other) const
  {
    return r == other.r && g == other.g && b == other.b && a == other.a;
  }

  /// Converts to the value of the red channel.
  operator int() const { return r; }

  uint8_t r, g, b, a;
};
// Defines and registers the fixture benchmark ``transpose_inplace_baseline_<TYPE>``.
// Each iteration calls ``baseline::transpose`` in place on the fixture's input buffer,
// treated as a ``dim x dim`` matrix with ``dim`` taken from the benchmark argument
// (64, 128, 131 and 256). The processed byte count is reported after the loop.
#define BENCHMARK_TEMPLATE_INPLACE_BASELINE(TYPE) \
BENCHMARK_TEMPLATE_DEFINE_F(BMPrimitives, transpose_inplace_baseline_##TYPE, TYPE)(benchmark::State & st) \
{ \
int dim = st.range(0); \
for (auto _ : st) \
baseline::transpose(this->m_buffer, dim, this->m_stride); \
this->SetBytesProcessed(st, dim); \
} \
BENCHMARK_REGISTER_F(BMPrimitives, transpose_inplace_baseline_##TYPE)->Arg(64)->Arg(128)->Arg(131)->Arg(256);
// Defines and registers the fixture benchmark ``transpose_inplace_optimized_<TYPE>``.
// Each iteration calls ``mln::bp::transpose_inplace`` on the fixture's input buffer,
// treated as a ``dim x dim`` matrix with ``dim`` taken from the benchmark argument
// (64, 128, 131 and 256). The processed byte count is reported after the loop.
#define BENCHMARK_TEMPLATE_INPLACE_OPTIMIZED(TYPE) \
BENCHMARK_TEMPLATE_DEFINE_F(BMPrimitives, transpose_inplace_optimized_##TYPE, TYPE)(benchmark::State & st) \
{ \
int dim = st.range(0); \
for (auto _ : st) \
mln::bp::transpose_inplace(this->m_buffer, dim, this->m_stride); \
this->SetBytesProcessed(st, dim); \
} \
BENCHMARK_REGISTER_F(BMPrimitives, transpose_inplace_optimized_##TYPE)->Arg(64)->Arg(128)->Arg(131)->Arg(256);
// Defines and registers the fixture benchmark ``transpose_optimized_<TYPE>``.
// Each iteration calls the out-of-place ``mln::bp::transpose`` from the fixture's input
// buffer into its output buffer for a ``dim x dim`` matrix, with ``dim`` taken from the
// benchmark argument (64, 128, 131 and 256). Both buffers use the same stride.
#define BENCHMARK_TEMPLATE_OPTIMIZED(TYPE) \
BENCHMARK_TEMPLATE_DEFINE_F(BMPrimitives, transpose_optimized_##TYPE, TYPE)(benchmark::State & st) \
{ \
int dim = st.range(0); \
for (auto _ : st) \
mln::bp::transpose(this->m_buffer, this->m_out, dim, dim, this->m_stride, this->m_stride); \
this->SetBytesProcessed(st, dim); \
} \
BENCHMARK_REGISTER_F(BMPrimitives, transpose_optimized_##TYPE)->Arg(64)->Arg(128)->Arg(131)->Arg(256);
// Instantiate every benchmark family for the same set of element types:
// 1-, 2-, 4- and 8-byte unsigned integers, float, double, and the rgb8 / rgba8 pixel structs.

// In-place transposition, naive reference implementation.
BENCHMARK_TEMPLATE_INPLACE_BASELINE(uint8_t);
BENCHMARK_TEMPLATE_INPLACE_BASELINE(uint16_t);
BENCHMARK_TEMPLATE_INPLACE_BASELINE(uint32_t);
BENCHMARK_TEMPLATE_INPLACE_BASELINE(uint64_t);
BENCHMARK_TEMPLATE_INPLACE_BASELINE(float);
BENCHMARK_TEMPLATE_INPLACE_BASELINE(double);
BENCHMARK_TEMPLATE_INPLACE_BASELINE(rgb8);
BENCHMARK_TEMPLATE_INPLACE_BASELINE(rgba8);
// In-place transposition through mln::bp::transpose_inplace.
BENCHMARK_TEMPLATE_INPLACE_OPTIMIZED(uint8_t);
BENCHMARK_TEMPLATE_INPLACE_OPTIMIZED(uint16_t);
BENCHMARK_TEMPLATE_INPLACE_OPTIMIZED(uint32_t);
BENCHMARK_TEMPLATE_INPLACE_OPTIMIZED(uint64_t);
BENCHMARK_TEMPLATE_INPLACE_OPTIMIZED(float);
BENCHMARK_TEMPLATE_INPLACE_OPTIMIZED(double);
BENCHMARK_TEMPLATE_INPLACE_OPTIMIZED(rgb8);
BENCHMARK_TEMPLATE_INPLACE_OPTIMIZED(rgba8);
// Out-of-place transposition through mln::bp::transpose.
BENCHMARK_TEMPLATE_OPTIMIZED(uint8_t);
BENCHMARK_TEMPLATE_OPTIMIZED(uint16_t);
BENCHMARK_TEMPLATE_OPTIMIZED(uint32_t);
BENCHMARK_TEMPLATE_OPTIMIZED(uint64_t);
BENCHMARK_TEMPLATE_OPTIMIZED(float);
BENCHMARK_TEMPLATE_OPTIMIZED(double);
BENCHMARK_TEMPLATE_OPTIMIZED(rgb8);
BENCHMARK_TEMPLATE_OPTIMIZED(rgba8);
......@@ -63,6 +63,6 @@ add_benchmark(BMMorphoBase BMMorphoBase.cpp)
add_benchmark(BMMorphers BMMorphers.cpp BMMorphers_main.cpp)
add_benchmark(BMReference_Linear BMReference_Linear.cpp BMReference_Linear_Reversed.cpp BMReference_Linear_main.cpp)
add_benchmark(BMReference_Neighborhood BMReference_Neighborhood_main.cpp)
add_benchmark(BMBufferPrimitives BMBufferPrimitives.cpp)
ExternalData_Add_Target(fetch-external-data)
......@@ -49,6 +49,21 @@ target_link_libraries(Pylene PRIVATE FreeImage::FreeImage)
#file(GLOB_RECURSE sources "include/mln/*.hpp")
#target_sources(Pylene INTERFACE $<BUILD_INTERFACE:${sources}>)
# Object library holding the compiled buffer-primitive sources (transpose, swap, alloc).
add_library(Pylene-bp OBJECT
src/bp/transpose.cpp
src/bp/swap.cpp
src/bp/alloc.cpp
)
# Public headers: the source-tree include directory at build time, "include" once installed.
target_include_directories(Pylene-bp PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
# xsimd and fmt are linked privately, i.e. they are not propagated to consumers of Pylene-bp.
target_link_libraries(Pylene-bp PRIVATE xsimd::xsimd fmt::fmt)
# C++20 is required, and the requirement is propagated to consumers.
target_compile_features(Pylene-bp PUBLIC cxx_std_20)
# The main Pylene target links the buffer primitives privately.
target_link_libraries(Pylene PRIVATE Pylene-bp)
target_sources(Pylene PRIVATE
src/accu/cvxhull.cpp
src/core/image_format.cpp
......@@ -82,6 +97,7 @@ target_sources(Pylene PRIVATE
target_compile_features(Pylene PUBLIC cxx_std_20)
if (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.0)
target_compile_options(Pylene-bp PUBLIC -fconcepts)
target_compile_options(Pylene PUBLIC -fconcepts)
endif()
......@@ -93,7 +109,7 @@ source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR}/include/mln FILES ${sources})
add_library(Pylene::Pylene ALIAS Pylene)
include(GNUInstallDirs)
install(TARGETS Pylene
install(TARGETS Pylene Pylene-bp
EXPORT PyleneTargets)
write_basic_package_version_file(
......
#pragma once
#include <cstddef>
#include <memory>
#include <mln/bp/utils.hpp>
namespace mln::bp
{
/// \brief Return a 2D-buffer with lines aligned on 32 bytes boundaries.
///
/// The pitch is the number of bytes between two consecutive lines. The pointer has to be freed
/// with ``mln::bp::aligned_free_2d``.
///
/// The typed overload default-constructs ``width`` elements on each of the ``height`` lines;
/// the untyped overload takes the element size \p esize in bytes and returns raw storage.
///
/// \rst
/// :param width: Width of the 2D-buffer
/// :param height: Height of the 2D-buffer
/// :param esize: Size in bytes of one element (untyped overload only)
/// :param out pitch: The number of bytes between two lines
/// :return: Pointer to the allocated buffer
/// \endrst
template <class T>
[[gnu::assume_aligned(32)]]
T* aligned_alloc_2d(int width, int height, std::ptrdiff_t& pitch);
[[gnu::assume_aligned(32)]]
void* aligned_alloc_2d(int width, int height, std::size_t esize, std::ptrdiff_t& pitch);
/// \brief Frees a 2D buffer allocated with ``mln::bp::aligned_alloc_2d``
///
/// This overload only releases the storage; it does not run any element destructor.
///
/// \rst
/// :param ptr: Pointer to the 2D-buffer
/// \endrst
void aligned_free_2d(void* ptr);
/// \brief Destroys the elements of a typed 2D buffer, then frees it
///
/// \rst
/// :param ptr: Pointer to the 2D-buffer
/// :param width: Number of elements destroyed on each line
/// :param height: Number of lines
/// :param pitch: The number of bytes between two lines
/// \endrst
template <class T>
void aligned_free_2d(T* ptr, int width, int height, std::ptrdiff_t pitch);
/******************************************/
/**** Implementation ****/
/******************************************/
template <class T>
[[gnu::assume_aligned(32)]]
T* aligned_alloc_2d(int width, int height, std::ptrdiff_t& pitch)
{
T* buffer = (T*)aligned_alloc_2d(width, height, sizeof(T), pitch);
std::ptrdiff_t s = pitch;
for (int y = 0; y < height; y++)
std::uninitialized_default_construct_n(mln::bp::ptr_offset(buffer, s * y), width);
return buffer;
}
/// Typed release: runs the destructor of the ``width`` elements of each of the
/// ``height`` lines (lines are ``pitch`` bytes apart), then hands the storage
/// back through the untyped ``aligned_free_2d``.
template <class T>
inline void aligned_free_2d(T* ptr, int width, int height, std::ptrdiff_t pitch)
{
  int row = 0;
  while (row < height)
  {
    auto line = mln::bp::ptr_offset(ptr, pitch * row);
    std::destroy_n(line, width);
    ++row;
  }
  aligned_free_2d((void*)ptr);
}
}
#pragma once
#include <cstddef>
#include <algorithm>
#include <mln/bp/utils.hpp>
namespace mln::bp
{
/// \brief Copy a 2D-buffer into another one
///
/// \rst
/// :param src: Buffer source
/// :param dst: Buffer destination
/// :param width: Width of the 2D-buffers
/// :param height: Height of the 2D-buffers
/// :param src_stride: Stride of the source buffer (in bytes)
/// :param dst_stride: Stride of the destination buffer (in bytes)
/// \endrst
template <typename T>
void copy(const T* __restrict src, T* dst, int width, int height, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride);
/// \brief Copy a 2D-buffer whose dimensions are bounded at compile-time
///
/// Same parameters as above; ``WIDTH`` and ``HEIGHT`` are upper bounds that
/// \p width and \p height are assumed not to exceed.
template <int WIDTH, int HEIGHT, typename T>
void copy(const T* src, T* dst, int width, int height, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride);
/******************************************/
/**** Implementation ****/
/******************************************/
/// Line-by-line copy of a ``w x h`` block from \p src to \p dst, whose lines are
/// respectively ``src_stride`` and ``dst_stride`` bytes apart.
/// ``w`` and ``h`` are assumed (``mln_assume``) not to exceed ``WIDTH`` and ``HEIGHT``.
template <int WIDTH, int HEIGHT, typename T>
inline void copy(const T* __restrict src, T* __restrict dst, int w, int h, std::ptrdiff_t src_stride,
                 std::ptrdiff_t dst_stride)
{
  mln_assume(w <= WIDTH);
  mln_assume(h <= HEIGHT);

  for (int row = 0; row < h; ++row)
  {
    auto in_line  = mln::bp::ptr_offset(src, row * src_stride);
    auto out_line = mln::bp::ptr_offset(dst, row * dst_stride);
    std::copy_n(in_line, w, out_line);
  }
}
}
#pragma once
#include <cstddef>
#include <type_traits>
#include <algorithm>
#include <cstring>
#include <mln/bp/utils.hpp>
namespace mln::bp
{
/// \brief Swap the content of two 2D-buffers
///
/// \rst
/// :param a: Buffer A
/// :param b: Buffer B
/// :param width: Width of the 2D-buffers
/// :param height: Height of the 2D-buffers
/// :param astride: Stride of the buffer A (in bytes)
/// :param bstride: Stride of the buffer B (in bytes)
/// \endrst
template <class T>
void swap(T* __restrict a, T* __restrict b, int width, int height, std::ptrdiff_t astride,
std::ptrdiff_t bstride) noexcept(std::is_nothrow_swappable_v<T>);
/// \brief Swap two 2D-buffers with bounds known at compile-time (mostly used for small blocks)
///
/// ``WIDTH`` and ``HEIGHT`` are upper bounds that \p width and \p height are assumed not to
/// exceed. \p tmp is a scratch buffer that receives one line (``width * sizeof(T)`` bytes)
/// at a time.
template <int WIDTH, int HEIGHT, class T>
void swap(T* __restrict a, T* __restrict b, void* __restrict tmp, int width, int height, std::ptrdiff_t a_stride,
std::ptrdiff_t b_stride);
/******************************************/
/**** Implementation ****/
/******************************************/
namespace impl
{
/// Element-wise swap of two ``width x height`` blocks, one line at a time.
/// Lines of \p a and \p b are ``a_stride`` and ``b_stride`` bytes apart.
template <class T>
inline void swap_generic(T* __restrict a, T* __restrict b, int width, int height, std::ptrdiff_t a_stride,
                         std::ptrdiff_t b_stride) noexcept(std::is_nothrow_swappable_v<T>)
{
  for (int row = 0; row < height; ++row)
  {
    auto a_line     = ptr_offset(a, row * a_stride);
    auto b_line     = ptr_offset(b, row * b_stride);
    auto a_line_end = a_line + width;
    std::swap_ranges(a_line, a_line_end, b_line);
  }
}
/// Exchanges the first \p n bytes of \p a and \p b, using \p tmp as scratch space.
/// \p tmp must be able to hold \p n bytes; the three regions are declared non-overlapping
/// (``__restrict``).
[[gnu::always_inline]]
inline void swap_buffer(void* __restrict a, void* __restrict b, void* __restrict tmp, std::size_t n)
{
std::memcpy(tmp, a, n); // save a
std::memcpy(a, b, n);   // a <- b
std::memcpy(b, tmp, n); // b <- saved a
}
/// Byte-level swap of two 2D regions; only declared here.
/// Called by the trivially-copyable ``swap`` overload with ``len = width * sizeof(T)`` and
/// ``h = height``.
/// NOTE(review): the definition is presumably in src/bp/swap.cpp - confirm.
[[gnu::noinline]]
void swap_raw(std::byte* __restrict a, std::byte* __restrict b, std::size_t len, int h, std::ptrdiff_t a_stride, std::ptrdiff_t b_stride) noexcept;
} // namespace impl
// Generic implementation: forwards to the element-wise impl::swap_generic.
// Selected when the more constrained (trivially copyable) overload does not apply.
template <class T>
void swap(T* __restrict a, T* __restrict b, int width, int height, //
std::ptrdiff_t a_stride, std::ptrdiff_t b_stride) noexcept(std::is_nothrow_swappable_v<T>)
{
impl::swap_generic(a, b, width, height, a_stride, b_stride);
}
// Overload for trivially copyable types: each line is swapped as raw bytes
// (width * sizeof(T) bytes per line) through impl::swap_raw.
// Declared noexcept because it only calls the noexcept impl::swap_raw, in line with the
// constrained transpose overloads.
template <class T>
requires std::is_trivially_copyable_v<T>
inline void swap(T* __restrict a, T* __restrict b, int width, int height, std::ptrdiff_t a_stride,
                 std::ptrdiff_t b_stride) noexcept
{
  impl::swap_raw(reinterpret_cast<std::byte*>(a), reinterpret_cast<std::byte*>(b), width * sizeof(T), height,
                 a_stride, b_stride);
}
/// Swap of two small 2D blocks, one line at a time, through the scratch buffer \p tmp.
/// Each line is exchanged as ``width * sizeof(T)`` raw bytes, so \p tmp must be able to
/// hold that many bytes. ``width`` and ``height`` are assumed (``mln_assume``) not to
/// exceed ``WIDTH`` and ``HEIGHT``.
template <int WIDTH, int HEIGHT, class T>
inline void swap(T* __restrict a, T* __restrict b, void* __restrict tmp, int width, int height,
                 std::ptrdiff_t a_stride, std::ptrdiff_t b_stride)
{
  mln_assume(width <= WIDTH);
  mln_assume(height <= HEIGHT);

  const std::size_t line_bytes = width * sizeof(T);
  for (int row = 0; row < height; ++row)
  {
    std::byte* const a_line = (std::byte*)a + row * a_stride;
    std::byte* const b_line = (std::byte*)b + row * b_stride;
    impl::swap_buffer(a_line, b_line, tmp, line_bytes);
  }
}
}
#pragma once
#include <cstdint>
#include <cstddef>
#include <type_traits>
#include <utility>
#include <mln/bp/utils.hpp>
namespace mln::bp
{
/// \brief Transpose in place a 2D array of size n × n
///
/// The lines must be correctly aligned on 32 bytes boundaries
/// (consider padding the data if this is not the case)
///
/// The implementation is optimized with simd instructions when possible
///
/// \param buffer      Pointer to the first element of the array
/// \param n           Number of lines and columns
/// \param byte_stride Number of bytes between two consecutive lines
template <class T>
void transpose_inplace(T* buffer, int n, std::ptrdiff_t byte_stride) noexcept(std::is_nothrow_swappable_v<T>);
/// \brief Transpose the data from \p src into \p dst
///
/// \param src        Source buffer
/// \param dst        Destination buffer
/// \param width      Width passed on to the implementation
/// \param height     Height passed on to the implementation
/// \param src_stride Number of bytes between two lines of \p src
/// \param dst_stride Number of bytes between two lines of \p dst
///
/// NOTE(review): the naive implementation documents (width, height) as the dimensions of the
/// **output** buffer - confirm that this holds for every overload.
template <class T>
void transpose(const T* src, T* dst, int width, int height, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept(std::is_nothrow_swappable_v<T>);
// In-place transposition overloads for the built-in arithmetic types taking an explicit
// block size. They are only declared here.
// NOTE(review): block_size is presumably the edge length of the tiles processed at once and
// the definitions presumably live in src/bp/transpose.cpp - confirm.
void transpose_inplace(uint8_t* buffer, int n, std::ptrdiff_t byte_stride, int block_size);
void transpose_inplace(int8_t* buffer, int n, std::ptrdiff_t byte_stride, int block_size);
void transpose_inplace(uint16_t* buffer, int n, std::ptrdiff_t byte_stride, int block_size);
void transpose_inplace(int16_t* buffer, int n, std::ptrdiff_t byte_stride, int block_size);
void transpose_inplace(uint32_t* buffer, int n, std::ptrdiff_t byte_stride, int block_size);
void transpose_inplace(int32_t* buffer, int n, std::ptrdiff_t byte_stride, int block_size);
void transpose_inplace(uint64_t* buffer, int n, std::ptrdiff_t byte_stride, int block_size);
void transpose_inplace(int64_t* buffer, int n, std::ptrdiff_t byte_stride, int block_size);
void transpose_inplace(float* buffer, int n, std::ptrdiff_t byte_stride, int block_size);
void transpose_inplace(double* buffer, int n, std::ptrdiff_t byte_stride, int block_size);
} // namespace mln::bp
/******************************************/
/**** Implementation ****/
/******************************************/
namespace mln::bp
{
namespace impl
{
/// Naive in-place transposition of an ``n x n`` array: every element strictly below the
/// diagonal is swapped with its mirror above the diagonal.
///
/// \param buffer      Pointer to the first element of the array
/// \param n           Number of lines and columns (nothing is done when ``n <= 1``)
/// \param byte_stride Number of bytes between two consecutive lines
template <class T>
void transpose_inplace_naive(T* buffer, int n, std::ptrdiff_t byte_stride) noexcept(std::is_nothrow_swappable_v<T>)
{
  // The exception specification is expressed with is_nothrow_swappable, which looks up swap
  // through ADL; call swap the same way so that the two always agree.
  using std::swap;

  std::byte* const base = reinterpret_cast<std::byte*>(buffer);
  for (int y = 1; y < n; ++y)
  {
    T* const         row = reinterpret_cast<T*>(base + y * byte_stride); // element (y, 0)
    std::byte* const col = base + y * sizeof(T);                          // element (0, y)
    for (int x = 0; x < y; ++x)
      swap(row[x], *reinterpret_cast<T*>(col + x * byte_stride));        // (y, x) <-> (x, y)
  }
}
// \brief Same as before but between two different locations: the content of \p in and the
//        transposed content of \p out are exchanged (the two locations should not overlap)
// \param (width, height) Dimensions of the input buffer (the output buffer is supposed to be
//        height x width)
// \param in_byte_stride  Number of bytes between two lines of \p in
// \param out_byte_stride Number of bytes between two lines of \p out
template <class T>
void transpose_swap_naive(T* __restrict in, T* __restrict out, int width, int height, //
                          std::ptrdiff_t in_byte_stride,
                          std::ptrdiff_t out_byte_stride) noexcept(std::is_nothrow_swappable_v<T>)
{
  // The exception specification is expressed with is_nothrow_swappable, which looks up swap
  // through ADL; call swap the same way so that the two always agree.
  using std::swap;

  for (int y = 0; y < height; ++y)
  {
    T* in_row  = ptr_offset(in, y * in_byte_stride); // line y of the input
    T* out_col = out + y;                            // column y of the output
    for (int x = 0; x < width; ++x)
      swap(*ptr_offset(out_col, x * out_byte_stride), in_row[x]);
  }
}
// \brief Copy an input buffer into an output buffer, transposing it on the way
// \param (width, height) Dimensions of the **output** buffer (the input buffer is supposed to
//        be height x width)
// \param src_stride Number of bytes between two lines of \p src
// \param dst_stride Number of bytes between two lines of \p dst
// NOTE(review): the exception specification is expressed in terms of swappability although
// the body only copy-assigns elements - confirm that this is intended.
template <class T>
inline void transpose_naive(const T* __restrict src, T* __restrict dst, //
                            int width, int height,                     //
                            std::ptrdiff_t src_stride,
                            std::ptrdiff_t dst_stride) noexcept(std::is_nothrow_swappable_v<T>)
{
  for (int out_y = 0; out_y < height; ++out_y)
  {
    // Line out_y of the output is read from column out_y of the input.
    T*       out_line  = mln::bp::ptr_offset(dst, out_y * dst_stride);
    const T* in_column = src + out_y;
    for (int out_x = 0; out_x < width; ++out_x)
    {
      const T* in_cell = mln::bp::ptr_offset(in_column, out_x * src_stride);
      out_line[out_x]  = *in_cell;
    }
  }
}
} // namespace impl
// Generic fallback: forwards to the naive element-wise implementation.
// Used when neither a non-template overload nor the size-dispatching overload applies.
template <class T>
void transpose_inplace(T* buffer, int n, std::ptrdiff_t byte_stride) noexcept(std::is_nothrow_swappable_v<T>)
{
impl::transpose_inplace_naive(buffer, n, byte_stride);
}
// Non-template, noexcept in-place transposition overloads for the built-in arithmetic types.
// They are only declared here; the unsigned integer ones are the targets of the
// size-dispatching template below.
// NOTE(review): presumably the simd-optimized versions defined in src/bp/transpose.cpp - confirm.
void transpose_inplace(uint8_t* buffer, int n, std::ptrdiff_t stride) noexcept;
void transpose_inplace(int8_t* buffer, int n, std::ptrdiff_t stride) noexcept;
void transpose_inplace(uint16_t* buffer, int n, std::ptrdiff_t stride) noexcept;
void transpose_inplace(int16_t* buffer, int n, std::ptrdiff_t stride) noexcept;
void transpose_inplace(uint32_t* buffer, int n, std::ptrdiff_t stride) noexcept;
void transpose_inplace(int32_t* buffer, int n, std::ptrdiff_t stride) noexcept;
void transpose_inplace(uint64_t* buffer, int n, std::ptrdiff_t stride) noexcept;
void transpose_inplace(int64_t* buffer, int n, std::ptrdiff_t stride) noexcept;
void transpose_inplace(float* buffer, int n, std::ptrdiff_t stride) noexcept;
void transpose_inplace(double* buffer, int n, std::ptrdiff_t stride) noexcept;
// Overload for trivially copyable types of 1, 2, 4 or 8 bytes: the buffer is reinterpreted
// as the unsigned integer type of the same size and handed to the matching non-template
// overload. The requires-clause guarantees that exactly one branch applies.
template <class T>
requires(std::is_trivially_copyable_v<T> && (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8)) //
void transpose_inplace(T* buffer, int n, std::ptrdiff_t byte_stride) noexcept
{
  if constexpr (sizeof(T) == 1)
    transpose_inplace((uint8_t*)buffer, n, byte_stride);
  else if constexpr (sizeof(T) == 2)
    transpose_inplace((uint16_t*)buffer, n, byte_stride);
  else if constexpr (sizeof(T) == 4)
    transpose_inplace((uint32_t*)buffer, n, byte_stride);
  else if constexpr (sizeof(T) == 8)
    transpose_inplace((uint64_t*)buffer, n, byte_stride);
}
// Generic fallback of the out-of-place transposition: forwards to the naive element-wise
// implementation. ``gnu::noinline`` keeps it out-of-line.
template <class T>
[[gnu::noinline]] void transpose(const T* __restrict src, T* __restrict dst, int w, int h, //
std::ptrdiff_t src_stride,
std::ptrdiff_t dst_stride) noexcept(std::is_nothrow_swappable_v<T>)
{
impl::transpose_naive(src, dst, w, h, src_stride, dst_stride);
}
// Non-template, noexcept out-of-place transposition overloads for the built-in arithmetic
// types. They are only declared here; the size-dispatching template below forwards to the
// unsigned integer ones.
// NOTE(review): presumably the simd-optimized versions defined in src/bp/transpose.cpp - confirm.
void transpose(const uint8_t* src, uint8_t* dst, int w, int h, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept;
void transpose(const int8_t* src, int8_t* dst, int w, int h, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept;
void transpose(const uint16_t* src, uint16_t* dst, int w, int h, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept;
void transpose(const int16_t* src, int16_t* dst, int w, int h, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept;
void transpose(const uint32_t* src, uint32_t* dst, int w, int h, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept;
void transpose(const int32_t* src, int32_t* dst, int w, int h, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept;
void transpose(const uint64_t* src, uint64_t* dst, int w, int h, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept;
void transpose(const int64_t* src, int64_t* dst, int w, int h, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept;
void transpose(const float* src, float* dst, int w, int h, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept;
void transpose(const double* src, double* dst, int w, int h, std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept;
template <class T>
requires(std::is_trivially_copyable_v<T> && (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8)) //
void transpose(const T* __restrict src, T* __restrict dst, int width, int height, //
std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride) noexcept
{
switch (sizeof(T))
{
case 1:
transpose((uint8_t*)src, (uint8_t*)dst, width, height, src_stride, dst_stride);
break;
case 2:
transpose((uint16_t*)src, (uint16_t*)dst, width, height, src_stride, dst_stride);
break;
case 4: