Commit cd4d99fa authored by Edwin Carlinet's avatar Edwin Carlinet
Browse files

Use collaborative loading.

parent 5c43839b
Pipeline #16180 passed with stages
in 11 minutes and 23 seconds
...@@ -133,16 +133,14 @@ namespace mln::morpho::details ...@@ -133,16 +133,14 @@ namespace mln::morpho::details
{ {
mln_entering("Running specialization for vertical dilation over 2d buffer with arithmetic types"); mln_entering("Running specialization for vertical dilation over 2d buffer with arithmetic types");
mln::morpho::experimental::details::vertical_running_max_algo_t<T, BinaryFunction> alg(sup); mln::morpho::experimental::details::running_max_2d<T>(in, out, sup, roi, k, /* use_extension = */ true, /* vertical = */ true);
alg.running_max_v2d(in, out, roi, k, true);
return; return;
} }
else if (line.is_horizontal()) else if (line.is_horizontal())
{ {
mln_entering("Running specialization for horizontal dilation over 2d buffer with arithmetic types"); mln_entering("Running specialization for horizontal dilation over 2d buffer with arithmetic types");
mln::morpho::experimental::details::vertical_running_max_algo_t<T, BinaryFunction> alg(sup); mln::morpho::experimental::details::running_max_2d<T>(in, out, sup, roi, k, /* use_extension = */ true, /* vertical = */ false);
alg.running_max_h2d(in, out, roi, k, true);
return; return;
} }
......
...@@ -9,34 +9,55 @@ ...@@ -9,34 +9,55 @@
namespace mln::morpho::experimental::details namespace mln::morpho::experimental::details
{ {
class vertical_running_max_algo_base_t template <class T, class I, class J, class BinaryFunction>
void running_max_2d(I& input, J& output, BinaryFunction sup, mln::experimental::box2d roi, int k, bool use_extension, bool vertical);
/******************************************/
/**** Implementation ****/
/******************************************/
class TileLoaderBase
{ {
// Load tile from memory (roi is in the vertical layout coordinates system)
virtual void load_tile(std::byte* out, std::ptrdiff_t byte_stride, mln::experimental::box2d roi) = 0;
};
// Abstract sink of output tiles for the running max algorithm.
// The algorithm only holds a non-owning pointer to the writer and calls
// write_tile() on it, so the interface has to be publicly accessible.
class TileWriterBase
{
public:
  // Polymorphic base: keep destruction through a base pointer well-defined
  virtual ~TileWriterBase() = default;

  // Copy a tile to output (coordinates and size are in the vertical layout coordinates system)
  //   in          : source buffer holding the computed tile
  //   byte_stride : stride of `in`, expressed in bytes
  //   roi         : region to write
  virtual void write_tile(const std::byte* in, std::ptrdiff_t byte_stride, mln::experimental::box2d roi) = 0;
};
class vertical_running_max_algo_base_t
{
private:
// Accumulate the supremum column-wise (eq to the python A.cumsum(axis=0)) // Accumulate the supremum column-wise (eq to the python A.cumsum(axis=0))
virtual void partial_sum_block2d(const std::byte* __restrict in, std::byte* __restrict out, int width, int height, virtual void partial_sum_block2d(const std::byte* __restrict in, std::byte* __restrict out, int width, int height,
std::ptrdiff_t in_byte_stride, std::ptrdiff_t out_byte_stride) = 0; std::ptrdiff_t in_byte_stride, std::ptrdiff_t out_byte_stride) = 0;
// Apply PW OUT[x] = SUP(A[x], B[x]) // Apply PW OUT[x] = SUP(A[x], B[x])
virtual void apply_sup(std::byte* __restrict A, std::byte* __restrict B, std::byte* __restrict OUT, std::size_t n) = 0; virtual void apply_sup(std::byte* __restrict A, std::byte* __restrict B, std::byte* __restrict OUT, int width) = 0;
virtual int get_block_width() const = 0; virtual int get_block_width() const = 0;
virtual std::size_t get_sample_size() const = 0; virtual std::size_t get_sample_size() const = 0;
TileLoaderBase* m_tile_loader = nullptr;
TileWriterBase* m_tile_writer = nullptr;
public: public:
// Apply the running max algorithm over a block // Apply the running max algorithm over a block
// Memory has already been allocated // Memory has already been allocated
void running_max_block2d(std::byte* f, std::byte* g, std::byte* h, int width, int height, std::ptrdiff_t f_byte_stride, void running_max_block2d(std::byte* f, std::byte* g, std::byte* h, std::ptrdiff_t f_byte_stride,
std::ptrdiff_t g_byte_stride, std::ptrdiff_t h_byte_stride, int k, bool use_extension); std::ptrdiff_t g_byte_stride, std::ptrdiff_t h_byte_stride, mln::experimental::box2d roi,
int k, bool use_extension);
// Running the 2D dilation vertically using tiling // Apply the running max algorithm over a roi using tiling
template <class I, class T> void execute(mln::experimental::box2d roi, int k, bool use_extension, bool vertical = true);
void running_max_v2d(I& in, mln::experimental::image2d<T>& out, mln::experimental::box2d roi, int k, bool use_extension);
// Running the 2D dilation vertically using tiling void set_tile_reader(TileLoaderBase* r) { m_tile_loader = r; }
template <class I, class T> void set_tile_writer(TileWriterBase* w) { m_tile_writer = w; }
void running_max_h2d(I& in, mln::experimental::image2d<T>& out, mln::experimental::box2d roi, int k, bool use_extension);
}; };
...@@ -50,26 +71,57 @@ namespace mln::morpho::experimental::details ...@@ -50,26 +71,57 @@ namespace mln::morpho::experimental::details
static_assert(::ranges::regular_invocable<BinaryFunction, simd_t, simd_t>); static_assert(::ranges::regular_invocable<BinaryFunction, simd_t, simd_t>);
static_assert(::ranges::regular_invocable<BinaryFunction, T, T>); static_assert(::ranges::regular_invocable<BinaryFunction, T, T>);
void apply_sup(std::byte* __restrict A, std::byte* __restrict B, std::byte* __restrict OUT, std::size_t n) final; void apply_sup(std::byte* __restrict A, std::byte* __restrict B, std::byte* __restrict OUT, int n) final;
void partial_sum_block2d(const std::byte* __restrict in, std::byte* __restrict out, int width, int height, void partial_sum_block2d(const std::byte* __restrict in, std::byte* __restrict out, int width, int height,
std::ptrdiff_t in_byte_stride, std::ptrdiff_t out_byte_stride) final; std::ptrdiff_t in_byte_stride, std::ptrdiff_t out_byte_stride) final;
int get_block_width() const final { return BLOCK_WIDTH; } int get_block_width() const final { return BLOCK_WIDTH; }
std::size_t get_sample_size() const final { return sizeof(T); } std::size_t get_sample_size() const final { return sizeof(T); }
BinaryFunction m_sup;
public: public:
vertical_running_max_algo_t(BinaryFunction sup) vertical_running_max_algo_t(BinaryFunction sup)
: m_sup{std::move(sup)} : m_sup{std::move(sup)}
{ {
} }
protected:
BinaryFunction m_sup;
}; };
template <class T, class BinaryFunction> template <class T, class BinaryFunction>
void vertical_running_max_algo_t<T, BinaryFunction>::apply_sup(std::byte* __restrict A, std::byte* __restrict B, std::byte* __restrict out, std::size_t n) void vertical_running_max_algo_t<T, BinaryFunction>::apply_sup(std::byte* __restrict A_, std::byte* __restrict B_, std::byte* __restrict out_, int width)
{ {
std::transform((T*)A, (T*)A + n, (T*)B, (T*)out, m_sup); const T* A = (T*)A_;
const T* B = (T*)B_;
T* out = (T*)out_;
const int K = width / WARP_SIZE;
const int rem = width % WARP_SIZE;
assert(width <= BLOCK_WIDTH);
for (int k = 0; k < K; k++)
{
simd_t a;
simd_t b;
simd_t c;
a.copy_from(A + k * WARP_SIZE, std::experimental::element_aligned);
b.copy_from(B + k * WARP_SIZE, std::experimental::element_aligned);
c = m_sup(a, b);
c.copy_to(out + k * WARP_SIZE, std::experimental::element_aligned);
}
if (rem > 0)
{
A += K * WARP_SIZE;
B += K * WARP_SIZE;
out += K * WARP_SIZE;
std::transform(A, A + rem, B, out, m_sup);
}
} }
...@@ -80,7 +132,6 @@ namespace mln::morpho::experimental::details ...@@ -80,7 +132,6 @@ namespace mln::morpho::experimental::details
constexpr int MAX_WARP_COUNT = BLOCK_WIDTH / WARP_SIZE; constexpr int MAX_WARP_COUNT = BLOCK_WIDTH / WARP_SIZE;
const int K = width / WARP_SIZE; const int K = width / WARP_SIZE;
const int rem = width % WARP_SIZE; const int rem = width % WARP_SIZE;
...@@ -139,8 +190,8 @@ namespace mln::morpho::experimental::details ...@@ -139,8 +190,8 @@ namespace mln::morpho::experimental::details
template <class I, class T> template <class I, class T>
[[gnu::noinline]] void copy_block(I& in, mln::experimental::box2d roi, T* __restrict out, std::ptrdiff_t out_stride) [[gnu::noinline]] void copy_block(I& in, mln::experimental::box2d roi, T* __restrict out, std::ptrdiff_t out_stride)
{ {
const int x0 = roi.tl().x(); const int x0 = roi.x();
const int y0 = roi.tl().y(); const int y0 = roi.y();
for (int y = 0; y < roi.height(); ++y) for (int y = 0; y < roi.height(); ++y)
{ {
...@@ -153,8 +204,8 @@ namespace mln::morpho::experimental::details ...@@ -153,8 +204,8 @@ namespace mln::morpho::experimental::details
template <class I, class T> template <class I, class T>
[[gnu::noinline]] void copy_block(T* __restrict in, std::ptrdiff_t istride, mln::experimental::box2d roi, I& out) [[gnu::noinline]] void copy_block(T* __restrict in, std::ptrdiff_t istride, mln::experimental::box2d roi, I& out)
{ {
const int x0 = roi.tl().x(); const int x0 = roi.x();
const int y0 = roi.tl().y(); const int y0 = roi.y();
for (int y = 0; y < roi.height(); ++y) for (int y = 0; y < roi.height(); ++y)
{ {
...@@ -167,8 +218,8 @@ namespace mln::morpho::experimental::details ...@@ -167,8 +218,8 @@ namespace mln::morpho::experimental::details
template <class I, class T> template <class I, class T>
[[gnu::noinline]] void transpose_block2d(I& in, mln::experimental::box2d input_roi, T* __restrict out, std::ptrdiff_t out_stride) [[gnu::noinline]] void transpose_block2d(I& in, mln::experimental::box2d input_roi, T* __restrict out, std::ptrdiff_t out_stride)
{ {
const int x0 = input_roi.tl().x(); const int x0 = input_roi.x();
const int y0 = input_roi.tl().y(); const int y0 = input_roi.y();
for (int y = 0; y < input_roi.height(); ++y) for (int y = 0; y < input_roi.height(); ++y)
...@@ -179,106 +230,75 @@ namespace mln::morpho::experimental::details ...@@ -179,106 +230,75 @@ namespace mln::morpho::experimental::details
template <class I, class T> template <class I, class T>
[[gnu::noinline]] void transpose_block2d(T* __restrict in, std::ptrdiff_t istride, mln::experimental::box2d output_roi, I& out) [[gnu::noinline]] void transpose_block2d(T* __restrict in, std::ptrdiff_t istride, mln::experimental::box2d output_roi, I& out)
{ {
const int x0 = output_roi.tl().x(); const int x0 = output_roi.x();
const int y0 = output_roi.tl().y(); const int y0 = output_roi.y();
for (int y = 0; y < output_roi.height(); ++y) for (int y = 0; y < output_roi.height(); ++y)
for (int x = 0; x < output_roi.width(); ++x) for (int x = 0; x < output_roi.width(); ++x)
out.at({x0 + x, y0 + y}) = *(in + x * istride + y); out.at({x0 + x, y0 + y}) = *(in + x * istride + y);
} }
template <class I, class T> template <class I, class T>
void vertical_running_max_algo_base_t::running_max_v2d(I& in, mln::experimental::image2d<T>& out, mln::experimental::box2d roi, int k, bool use_extension) class TileLoader : public TileLoaderBase
{ {
int kBlockWidth = this->get_block_width(); public:
auto sz = this->get_sample_size(); // Load tile from memory (roi is in the vertical layout coordinates system)
std::ptrdiff_t kBlockByteSize = sz * kBlockWidth; void load_tile(std::byte* out, std::ptrdiff_t byte_stride, mln::experimental::box2d roi) override
const int x0 = roi.tl().x();
const int y0 = roi.tl().y();
const int y1 = roi.br().y();
const int width = roi.width();
const int height = roi.height();
std::byte* f = (std::byte*) std::malloc(sz * kBlockWidth * (height + 2 * k));
std::byte* g = (std::byte*) std::malloc(sz * kBlockWidth * (height + 2 * k));
std::byte* h = (std::byte*) std::malloc(sz * kBlockWidth * (height + 2 * k));
for (int x = 0; x < width; x += kBlockWidth)
{ {
int w = std::min(kBlockWidth, width - x); if (m_vertical)
copy_block(*m_input, roi, (T*)out, byte_stride / sizeof(T));
else
// Copy the block
{ {
mln::experimental::box2d region = {{x + x0, y0 - k}, {x + x0 + w, y1 + k}}; mln::experimental::box2d region(roi.y(), roi.x(), roi.height(), roi.width());
copy_block(in, region, (T*)f, kBlockWidth); transpose_block2d(*m_input, region, (T*)out, byte_stride / sizeof(T));
}
this->running_max_block2d(f + k * kBlockByteSize, //
g + k * kBlockByteSize, //
h + k * kBlockByteSize, //
w, height, kBlockByteSize, kBlockByteSize, kBlockByteSize, k, use_extension);
// Copy back
{
mln::experimental::box2d region = {{x + x0, y0}, {x + x0 + w, y1}};
copy_block((T*)f + kBlockWidth * k, kBlockWidth, region, out);
} }
} }
std::free(f); TileLoader(I& input, bool vertical) : m_input(&input), m_vertical{vertical} {}
std::free(g);
std::free(h);
}
private:
I* m_input;
bool m_vertical;
};
template <class I, class T> template <class I, class T>
void vertical_running_max_algo_base_t::running_max_h2d(I& in, mln::experimental::image2d<T>& out, mln::experimental::box2d roi, int k, bool use_extension) class TileWriter : public TileWriterBase
{ {
int kBlockWidth = this->get_block_width(); public:
auto sz = this->get_sample_size(); // Copy a line to output (coordinates and size are in the vertical layout coordinates system)
std::ptrdiff_t kBlockByteSize = sz * kBlockWidth; void write_tile(const std::byte* in, std::ptrdiff_t byte_stride, mln::experimental::box2d roi) override
const int x0 = roi.tl().x();
const int y0 = roi.tl().y();
const int x1 = roi.br().x();
const int width = roi.width();
const int height = roi.height();
std::byte* f = (std::byte*) std::malloc(sz * kBlockWidth * (width + 2 * k));
std::byte* g = (std::byte*) std::malloc(sz * kBlockWidth * (width + 2 * k));
std::byte* h = (std::byte*) std::malloc(sz * kBlockWidth * (width + 2 * k));
for (int y = 0; y < height; y += kBlockWidth)
{ {
int H = std::min(kBlockWidth, height - y); if (m_vertical)
copy_block((const T*)in, byte_stride / sizeof(T), roi, *m_output);
// Copy the block else
{ {
mln::experimental::box2d region = {{x0 - k, y0 + y}, {x1 + k, y0 + y + H}}; mln::experimental::box2d region(roi.y(), roi.x(), roi.height(), roi.width());
transpose_block2d(in, region, (T*)f, kBlockWidth); transpose_block2d((const T*)in, byte_stride / sizeof(T), region, *m_output);
} }
}
this->running_max_block2d(f + k * kBlockByteSize, // TileWriter(I& output, bool vertical) : m_output(&output), m_vertical{vertical} {}
g + k * kBlockByteSize, //
h + k * kBlockByteSize, // private:
H, width, kBlockByteSize, kBlockByteSize, kBlockByteSize, k, use_extension); I* m_output;
bool m_vertical;
};
// Copy back
{
mln::experimental::box2d region = {{x0, y0 + y}, {x1, y0 + y + H}};
transpose_block2d((T*)f + kBlockWidth * k, kBlockWidth, region, out);
}
}
std::free(f); template <class T, class I, class J, class BinaryFunction>
std::free(g); void running_max_2d(I& input, J& output, BinaryFunction sup, mln::experimental::box2d roi, int k, bool use_extension, bool vertical)
std::free(h); {
TileLoader<I, T> r(input, vertical);
TileWriter<J, T> w(output, vertical);
vertical_running_max_algo_t<T, BinaryFunction> alg(sup);
alg.set_tile_reader(&r);
alg.set_tile_writer(&w);
alg.execute(roi, k, use_extension, vertical);
} }
} // namespace mln::morpho::details } // namespace mln::morpho::details
...@@ -6,14 +6,17 @@ namespace mln::morpho::experimental::details ...@@ -6,14 +6,17 @@ namespace mln::morpho::experimental::details
{ {
void vertical_running_max_algo_base_t::running_max_block2d(std::byte* __restrict f, std::byte* __restrict g, std::byte* __restrict h, void vertical_running_max_algo_base_t::running_max_block2d(std::byte* __restrict f, std::byte* __restrict g,
int width, int height, std::ptrdiff_t f_byte_stride, std::byte* __restrict h, std::ptrdiff_t f_byte_stride,
std::ptrdiff_t g_byte_stride, std::ptrdiff_t h_byte_stride, std::ptrdiff_t g_byte_stride, std::ptrdiff_t h_byte_stride,
int k, bool use_extension) mln::experimental::box2d roi, int k, bool use_extension)
{ {
assert(width <= this->get_block_width()); int x0 = roi.x();
int y0 = roi.y();
int width = roi.width();
int height = roi.height();
assert(width <= this->get_block_width());
const int alpha = 2 * k + 1; const int alpha = 2 * k + 1;
...@@ -25,6 +28,13 @@ namespace mln::morpho::experimental::details ...@@ -25,6 +28,13 @@ namespace mln::morpho::experimental::details
{ {
int chunk_size = std::min(rem, alpha); int chunk_size = std::min(rem, alpha);
// Copy the chunk into the tile
if (m_tile_loader)
{
mln::experimental::box2d region(x0, y0 + chunk_start, width, chunk_size);
m_tile_loader->load_tile(f + chunk_start * f_byte_stride, f_byte_stride, region);
}
// Forward pass // Forward pass
// Compute g[x] = Max f(y), y ∈ [α * ⌊x / α⌋ : x] // Compute g[x] = Max f(y), y ∈ [α * ⌊x / α⌋ : x]
this->partial_sum_block2d(f + chunk_start * f_byte_stride, // this->partial_sum_block2d(f + chunk_start * f_byte_stride, //
...@@ -44,15 +54,62 @@ namespace mln::morpho::experimental::details ...@@ -44,15 +54,62 @@ namespace mln::morpho::experimental::details
// out[x] = Max (Max f[x-k:b), Max f[b:x+k]) with b = α.⌈(x-k)/α⌉ = α.⌊(x+k)/α⌋ // out[x] = Max (Max f[x-k:b), Max f[b:x+k]) with b = α.⌈(x-k)/α⌉ = α.⌊(x+k)/α⌋
// = Max( h[x-k], g[x+k] ) // = Max( h[x-k], g[x+k] )
{ {
for (int i = 0; i < height; ++i) const int kBlockHeight = 16;
for (int y = 0; y < height; y += kBlockHeight)
{ {
this->apply_sup(h + (i - k) * h_byte_stride, // int hroi = std::min(kBlockHeight, height - y);
g + (i + k) * g_byte_stride, // for (int i = 0; i < hroi; ++i)
f + i * f_byte_stride, static_cast<std::size_t>(width)); {
this->apply_sup(h + (y + i - k) * h_byte_stride, //
g + (y + i + k) * g_byte_stride, //
f + (y + i) * f_byte_stride, width);
}
// Write the tile
if (m_tile_writer)
{
mln::experimental::box2d region(x0, y0 + y, width, hroi);
m_tile_writer->write_tile(f + y * f_byte_stride, f_byte_stride, region);
}
} }
} }
} }
// Apply the running max algorithm over `roi`, processing it tile by tile.
//
//   roi           : region to process, given in image coordinates
//   k             : forwarded to running_max_block2d; each scratch plane gets
//                   k extra rows before and after the `height` rows of the tile
//   use_extension : forwarded unchanged to running_max_block2d
//   vertical      : when false, the x/y origin and the width/height of `roi`
//                   are swapped before tiling
//
// Each tile is at most get_block_width() samples wide. The three scratch planes
// are allocated once and reused for every tile. Allocation failure is reported
// by the exception thrown by `new[]` instead of dereferencing a null pointer,
// and the planes are released on every exit path.
void vertical_running_max_algo_base_t::execute(mln::experimental::box2d roi, int k, bool use_extension, bool vertical)
{
  // Sole owner of one scratch plane; frees it when leaving the scope,
  // including when a kernel, tile loader or tile writer throws.
  struct scratch_plane
  {
    explicit scratch_plane(std::size_t byte_count)
      : data{new std::byte[byte_count]}
    {
    }
    ~scratch_plane() { delete[] data; }
    scratch_plane(const scratch_plane&) = delete;
    scratch_plane& operator=(const scratch_plane&) = delete;

    std::byte* const data;
  };

  const int            kBlockWidth    = this->get_block_width();
  const std::size_t    sz             = this->get_sample_size();
  const std::ptrdiff_t kBlockByteSize = sz * kBlockWidth;

  // Express the region in the layout used by the block algorithm
  const int x0     = vertical ? roi.x() : roi.y();
  const int y0     = vertical ? roi.y() : roi.x();
  const int width  = vertical ? roi.width() : roi.height();
  const int height = vertical ? roi.height() : roi.width();

  // One row is kBlockByteSize bytes; k rows of margin on each side of the tile
  const std::size_t plane_bytes = static_cast<std::size_t>(kBlockByteSize * (height + 2 * k));

  scratch_plane f(plane_bytes);
  scratch_plane g(plane_bytes);
  scratch_plane h(plane_bytes);

  // Skip the leading margin so that row 0 is the first row of the tile
  std::byte* const f_shifted = f.data + k * kBlockByteSize;
  std::byte* const g_shifted = g.data + k * kBlockByteSize;
  std::byte* const h_shifted = h.data + k * kBlockByteSize;

  for (int x = 0; x < width; x += kBlockWidth)
  {
    // The last tile may be narrower than a full block
    const int w = std::min(kBlockWidth, width - x);

    mln::experimental::box2d region(x0 + x, y0, w, height);
    this->running_max_block2d(f_shifted, g_shifted, h_shifted,                //
                              kBlockByteSize, kBlockByteSize, kBlockByteSize, //
                              region, k, use_extension);
  }
}
} // namespace mln::morpho::experimental::details } // namespace mln::morpho::experimental::details
...@@ -114,11 +114,12 @@ public: ...@@ -114,11 +114,12 @@ public:
sup_t sup = {m_sup, m_sup_vec}; sup_t sup = {m_sup, m_sup_vec};
mln::morpho::experimental::details::vertical_running_max_algo_t<int, sup_t> algo(sup); mln::morpho::experimental::details::vertical_running_max_algo_t<int, sup_t> algo(sup);
mln::experimental::box2d roi(width, height);