Commit cd4d99fa authored by Edwin Carlinet's avatar Edwin Carlinet
Browse files

Use collaborative loading.

parent 5c43839b
Pipeline #16180 passed with stages
in 11 minutes and 23 seconds
......@@ -133,16 +133,14 @@ namespace mln::morpho::details
{
mln_entering("Running specialization for vertical dilation over 2d buffer with arithmetic types");
mln::morpho::experimental::details::vertical_running_max_algo_t<T, BinaryFunction> alg(sup);
alg.running_max_v2d(in, out, roi, k, true);
mln::morpho::experimental::details::running_max_2d<T>(in, out, sup, roi, k, /* use_extension = */ true, /* vertical = */ true);
return;
}
else if (line.is_horizontal())
{
mln_entering("Running specialization for horizontal dilation over 2d buffer with arithmetic types");
mln::morpho::experimental::details::vertical_running_max_algo_t<T, BinaryFunction> alg(sup);
alg.running_max_h2d(in, out, roi, k, true);
mln::morpho::experimental::details::running_max_2d<T>(in, out, sup, roi, k, /* use_extension = */ true, /* vertical = */ false);
return;
}
......
......@@ -9,34 +9,55 @@
namespace mln::morpho::experimental::details
{
class vertical_running_max_algo_base_t
template <class T, class I, class J, class BinaryFunction>
void running_max_2d(I& input, J& output, BinaryFunction sup, mln::experimental::box2d roi, int k, bool use_extension, bool vertical);
/******************************************/
/**** Implementation ****/
/******************************************/
// Interface used by the running-max driver to pull input samples into a tile buffer.
// The driver only holds a pointer to this base type, so the entry point must be
// publicly callable and the type safely destructible through that pointer.
class TileLoaderBase
{
public:
  virtual ~TileLoaderBase() = default;

  // Load tile from memory (roi is in the vertical layout coordinates system)
  //   out         first byte of the destination tile
  //   byte_stride distance in bytes between two consecutive rows of the tile
  //   roi         region to fetch
  virtual void load_tile(std::byte* out, std::ptrdiff_t byte_stride, mln::experimental::box2d roi) = 0;
};
// Interface used by the running-max driver to push computed rows of a tile to the output.
// The driver only holds a pointer to this base type, so the entry point must be
// publicly callable and the type safely destructible through that pointer.
class TileWriterBase
{
public:
  virtual ~TileWriterBase() = default;

  // Copy a block of rows to the output (coordinates and size are in the vertical layout coordinates system)
  //   in          first byte of the source rows
  //   byte_stride distance in bytes between two consecutive source rows
  //   roi         destination region
  virtual void write_tile(const std::byte* in, std::ptrdiff_t byte_stride, mln::experimental::box2d roi) = 0;
};
// Type-erased part of the tiled running-max algorithm. Everything that depends on
// the sample type is reached through the private virtual hooks below, which
// vertical_running_max_algo_t<T, BinaryFunction> implements; buffers are therefore
// handled as raw bytes with explicit byte strides.
// NOTE(review): this declaration lists both the std::size_t and the int flavour of
// apply_sup, both flavours of running_max_block2d, and running_max_v2d/h2d next to
// execute(). That looks like leftovers from the change that introduced execute() —
// confirm which declarations are still meant to exist.
class vertical_running_max_algo_base_t
{
private:
// Accumulate the supremum column-wise (eq to the python A.cumsum(axis=0))
// `in` and `out` are blocks of `height` rows of `width` samples, with their own byte strides.
virtual void partial_sum_block2d(const std::byte* __restrict in, std::byte* __restrict out, int width, int height,
std::ptrdiff_t in_byte_stride, std::ptrdiff_t out_byte_stride) = 0;
// Apply pointwise: OUT[x] = SUP(A[x], B[x]) over one row of samples
virtual void apply_sup(std::byte* __restrict A, std::byte* __restrict B, std::byte* __restrict OUT, std::size_t n) = 0;
virtual void apply_sup(std::byte* __restrict A, std::byte* __restrict B, std::byte* __restrict OUT, int width) = 0;
// Width of a tile, in samples (execute() advances by this many columns per tile)
virtual int get_block_width() const = 0;
// Size of one sample, in bytes (used to turn the block width into a byte stride)
virtual std::size_t get_sample_size() const = 0;
// Optional collaborators, not owned: running_max_block2d only calls them when they are non-null
TileLoaderBase* m_tile_loader = nullptr;
TileWriterBase* m_tile_writer = nullptr;
public:
// Apply the running max algorithm over a block
// Memory has already been allocated
// f, g, h are the three working buffers, each with its own byte stride; k is the
// radius of the line (the window spans 2 * k + 1 samples).
void running_max_block2d(std::byte* f, std::byte* g, std::byte* h, int width, int height, std::ptrdiff_t f_byte_stride,
std::ptrdiff_t g_byte_stride, std::ptrdiff_t h_byte_stride, int k, bool use_extension);
// Same, with the position and size of the block given by `roi`; the position is
// what gets forwarded to the tile loader / writer when they are set.
void running_max_block2d(std::byte* f, std::byte* g, std::byte* h, std::ptrdiff_t f_byte_stride,
std::ptrdiff_t g_byte_stride, std::ptrdiff_t h_byte_stride, mln::experimental::box2d roi,
int k, bool use_extension);
// Running the 2D dilation vertically using tiling
template <class I, class T>
void running_max_v2d(I& in, mln::experimental::image2d<T>& out, mln::experimental::box2d roi, int k, bool use_extension);
// Apply the running max algorithm over a roi using tiling
// When `vertical` is false the x/y axes of `roi` are swapped before tiling.
void execute(mln::experimental::box2d roi, int k, bool use_extension, bool vertical = true);
// Running the 2D dilation horizontally using tiling
template <class I, class T>
void running_max_h2d(I& in, mln::experimental::image2d<T>& out, mln::experimental::box2d roi, int k, bool use_extension);
// Register the object that fills tiles / receives results. The pointers are stored
// as-is, so the pointed-to objects must outlive the calls that use them.
void set_tile_reader(TileLoaderBase* r) { m_tile_loader = r; }
void set_tile_writer(TileWriterBase* w) { m_tile_writer = w; }
};
......@@ -50,26 +71,57 @@ namespace mln::morpho::experimental::details
static_assert(::ranges::regular_invocable<BinaryFunction, simd_t, simd_t>);
static_assert(::ranges::regular_invocable<BinaryFunction, T, T>);
void apply_sup(std::byte* __restrict A, std::byte* __restrict B, std::byte* __restrict OUT, std::size_t n) final;
void apply_sup(std::byte* __restrict A, std::byte* __restrict B, std::byte* __restrict OUT, int n) final;
void partial_sum_block2d(const std::byte* __restrict in, std::byte* __restrict out, int width, int height,
std::ptrdiff_t in_byte_stride, std::ptrdiff_t out_byte_stride) final;
// Width of a tile in samples, fixed at compile time by BLOCK_WIDTH.
int get_block_width() const final { return BLOCK_WIDTH; }
// Size in bytes of one sample of type T.
std::size_t get_sample_size() const final { return sizeof(T); }
BinaryFunction m_sup;
public:
// Build the algorithm around the supremum functor `sup`, which is taken by value
// and moved into m_sup. The static_asserts of this class require it to be
// callable on both (T, T) and (simd_t, simd_t).
vertical_running_max_algo_t(BinaryFunction sup)
: m_sup{std::move(sup)}
{
}
protected:
BinaryFunction m_sup;
};
// Pointwise supremum of two rows: out[i] = m_sup(A[i], B[i]) for i in [0, width).
//
// The byte pointers are viewed as arrays of T. Complete groups of WARP_SIZE
// samples are combined through the simd_t overload of m_sup; the remaining
// (width % WARP_SIZE) samples are combined one by one through the scalar overload.
//
//   A_, B_  input rows
//   out_    destination row
//   width   number of samples in each row; must not exceed BLOCK_WIDTH (asserted)
template <class T, class BinaryFunction>
void vertical_running_max_algo_t<T, BinaryFunction>::apply_sup(std::byte* __restrict A_, std::byte* __restrict B_, std::byte* __restrict out_, int width)
{
  assert(width <= BLOCK_WIDTH);

  const T* A   = reinterpret_cast<const T*>(A_);
  const T* B   = reinterpret_cast<const T*>(B_);
  T*       out = reinterpret_cast<T*>(out_);

  const int K   = width / WARP_SIZE; // number of complete SIMD groups
  const int rem = width % WARP_SIZE; // samples left for the scalar tail

  for (int k = 0; k < K; k++)
  {
    simd_t a;
    simd_t b;
    a.copy_from(A + k * WARP_SIZE, std::experimental::element_aligned);
    b.copy_from(B + k * WARP_SIZE, std::experimental::element_aligned);

    simd_t c = m_sup(a, b);
    c.copy_to(out + k * WARP_SIZE, std::experimental::element_aligned);
  }

  if (rem > 0)
  {
    // Scalar tail, starting right after the last complete SIMD group
    A += K * WARP_SIZE;
    B += K * WARP_SIZE;
    out += K * WARP_SIZE;
    std::transform(A, A + rem, B, out, m_sup);
  }
}
......@@ -80,7 +132,6 @@ namespace mln::morpho::experimental::details
constexpr int MAX_WARP_COUNT = BLOCK_WIDTH / WARP_SIZE;
const int K = width / WARP_SIZE;
const int rem = width % WARP_SIZE;
......@@ -139,8 +190,8 @@ namespace mln::morpho::experimental::details
template <class I, class T>
[[gnu::noinline]] void copy_block(I& in, mln::experimental::box2d roi, T* __restrict out, std::ptrdiff_t out_stride)
{
const int x0 = roi.tl().x();
const int y0 = roi.tl().y();
const int x0 = roi.x();
const int y0 = roi.y();
for (int y = 0; y < roi.height(); ++y)
{
......@@ -153,8 +204,8 @@ namespace mln::morpho::experimental::details
template <class I, class T>
[[gnu::noinline]] void copy_block(T* __restrict in, std::ptrdiff_t istride, mln::experimental::box2d roi, I& out)
{
const int x0 = roi.tl().x();
const int y0 = roi.tl().y();
const int x0 = roi.x();
const int y0 = roi.y();
for (int y = 0; y < roi.height(); ++y)
{
......@@ -167,8 +218,8 @@ namespace mln::morpho::experimental::details
template <class I, class T>
[[gnu::noinline]] void transpose_block2d(I& in, mln::experimental::box2d input_roi, T* __restrict out, std::ptrdiff_t out_stride)
{
const int x0 = input_roi.tl().x();
const int y0 = input_roi.tl().y();
const int x0 = input_roi.x();
const int y0 = input_roi.y();
for (int y = 0; y < input_roi.height(); ++y)
......@@ -179,8 +230,8 @@ namespace mln::morpho::experimental::details
template <class I, class T>
[[gnu::noinline]] void transpose_block2d(T* __restrict in, std::ptrdiff_t istride, mln::experimental::box2d output_roi, I& out)
{
const int x0 = output_roi.tl().x();
const int y0 = output_roi.tl().y();
const int x0 = output_roi.x();
const int y0 = output_roi.y();
for (int y = 0; y < output_roi.height(); ++y)
for (int x = 0; x < output_roi.width(); ++x)
......@@ -189,96 +240,65 @@ namespace mln::morpho::experimental::details
template <class I, class T>
void vertical_running_max_algo_base_t::running_max_v2d(I& in, mln::experimental::image2d<T>& out, mln::experimental::box2d roi, int k, bool use_extension)
class TileLoader : public TileLoaderBase
{
int kBlockWidth = this->get_block_width();
auto sz = this->get_sample_size();
std::ptrdiff_t kBlockByteSize = sz * kBlockWidth;
const int x0 = roi.tl().x();
const int y0 = roi.tl().y();
const int y1 = roi.br().y();
const int width = roi.width();
const int height = roi.height();
std::byte* f = (std::byte*) std::malloc(sz * kBlockWidth * (height + 2 * k));
std::byte* g = (std::byte*) std::malloc(sz * kBlockWidth * (height + 2 * k));
std::byte* h = (std::byte*) std::malloc(sz * kBlockWidth * (height + 2 * k));
for (int x = 0; x < width; x += kBlockWidth)
{
int w = std::min(kBlockWidth, width - x);
// Copy the block
public:
// Load tile from memory (roi is in the vertical layout coordinates system)
void load_tile(std::byte* out, std::ptrdiff_t byte_stride, mln::experimental::box2d roi) override
{
mln::experimental::box2d region = {{x + x0, y0 - k}, {x + x0 + w, y1 + k}};
copy_block(in, region, (T*)f, kBlockWidth);
}
this->running_max_block2d(f + k * kBlockByteSize, //
g + k * kBlockByteSize, //
h + k * kBlockByteSize, //
w, height, kBlockByteSize, kBlockByteSize, kBlockByteSize, k, use_extension);
// Copy back
if (m_vertical)
copy_block(*m_input, roi, (T*)out, byte_stride / sizeof(T));
else
{
mln::experimental::box2d region = {{x + x0, y0}, {x + x0 + w, y1}};
copy_block((T*)f + kBlockWidth * k, kBlockWidth, region, out);
mln::experimental::box2d region(roi.y(), roi.x(), roi.height(), roi.width());
transpose_block2d(*m_input, region, (T*)out, byte_stride / sizeof(T));
}
}
std::free(f);
std::free(g);
std::free(h);
}
TileLoader(I& input, bool vertical) : m_input(&input), m_vertical{vertical} {}
private:
I* m_input;
bool m_vertical;
};
template <class I, class T>
void vertical_running_max_algo_base_t::running_max_h2d(I& in, mln::experimental::image2d<T>& out, mln::experimental::box2d roi, int k, bool use_extension)
class TileWriter : public TileWriterBase
{
int kBlockWidth = this->get_block_width();
auto sz = this->get_sample_size();
std::ptrdiff_t kBlockByteSize = sz * kBlockWidth;
const int x0 = roi.tl().x();
const int y0 = roi.tl().y();
const int x1 = roi.br().x();
const int width = roi.width();
const int height = roi.height();
std::byte* f = (std::byte*) std::malloc(sz * kBlockWidth * (width + 2 * k));
std::byte* g = (std::byte*) std::malloc(sz * kBlockWidth * (width + 2 * k));
std::byte* h = (std::byte*) std::malloc(sz * kBlockWidth * (width + 2 * k));
for (int y = 0; y < height; y += kBlockWidth)
public:
// Copy a line to output (coordinates and size are in the vertical layout coordinates system)
void write_tile(const std::byte* in, std::ptrdiff_t byte_stride, mln::experimental::box2d roi) override
{
int H = std::min(kBlockWidth, height - y);
// Copy the block
if (m_vertical)
copy_block((const T*)in, byte_stride / sizeof(T), roi, *m_output);
else
{
mln::experimental::box2d region = {{x0 - k, y0 + y}, {x1 + k, y0 + y + H}};
transpose_block2d(in, region, (T*)f, kBlockWidth);
mln::experimental::box2d region(roi.y(), roi.x(), roi.height(), roi.width());
transpose_block2d((const T*)in, byte_stride / sizeof(T), region, *m_output);
}
}
TileWriter(I& output, bool vertical) : m_output(&output), m_vertical{vertical} {}
private:
I* m_output;
bool m_vertical;
};
this->running_max_block2d(f + k * kBlockByteSize, //
g + k * kBlockByteSize, //
h + k * kBlockByteSize, //
H, width, kBlockByteSize, kBlockByteSize, kBlockByteSize, k, use_extension);
// Copy back
template <class T, class I, class J, class BinaryFunction>
void running_max_2d(I& input, J& output, BinaryFunction sup, mln::experimental::box2d roi, int k, bool use_extension, bool vertical)
{
mln::experimental::box2d region = {{x0, y0 + y}, {x1, y0 + y + H}};
transpose_block2d((T*)f + kBlockWidth * k, kBlockWidth, region, out);
}
}
TileLoader<I, T> r(input, vertical);
TileWriter<J, T> w(output, vertical);
std::free(f);
std::free(g);
std::free(h);
vertical_running_max_algo_t<T, BinaryFunction> alg(sup);
alg.set_tile_reader(&r);
alg.set_tile_writer(&w);
alg.execute(roi, k, use_extension, vertical);
}
} // namespace mln::morpho::details
......@@ -6,14 +6,17 @@ namespace mln::morpho::experimental::details
{
void vertical_running_max_algo_base_t::running_max_block2d(std::byte* __restrict f, std::byte* __restrict g, std::byte* __restrict h,
int width, int height, std::ptrdiff_t f_byte_stride,
void vertical_running_max_algo_base_t::running_max_block2d(std::byte* __restrict f, std::byte* __restrict g,
std::byte* __restrict h, std::ptrdiff_t f_byte_stride,
std::ptrdiff_t g_byte_stride, std::ptrdiff_t h_byte_stride,
int k, bool use_extension)
mln::experimental::box2d roi, int k, bool use_extension)
{
assert(width <= this->get_block_width());
int x0 = roi.x();
int y0 = roi.y();
int width = roi.width();
int height = roi.height();
assert(width <= this->get_block_width());
const int alpha = 2 * k + 1;
......@@ -25,6 +28,13 @@ namespace mln::morpho::experimental::details
{
int chunk_size = std::min(rem, alpha);
// Load the chunk into the tile
if (m_tile_loader)
{
mln::experimental::box2d region(x0, y0 + chunk_start, width, chunk_size);
m_tile_loader->load_tile(f + chunk_start * f_byte_stride, f_byte_stride, region);
}
// Forward pass
// Compute g[x] = Max f(y), y ∈ [α * ⌊x / α⌋ : x]
this->partial_sum_block2d(f + chunk_start * f_byte_stride, //
......@@ -44,15 +54,62 @@ namespace mln::morpho::experimental::details
// out[x] = Max (Max f[x-k:b), Max f[b:x+k]) with b = α.⌈(x-k)/α⌉ = α.⌊(x+k)/α⌋
// = Max( h[x-k], g[x+k] )
{
for (int i = 0; i < height; ++i)
const int kBlockHeight = 16;
for (int y = 0; y < height; y += kBlockHeight)
{
this->apply_sup(h + (i - k) * h_byte_stride, //
g + (i + k) * g_byte_stride, //
f + i * f_byte_stride, static_cast<std::size_t>(width));
int hroi = std::min(kBlockHeight, height - y);
for (int i = 0; i < hroi; ++i)
{
this->apply_sup(h + (y + i - k) * h_byte_stride, //
g + (y + i + k) * g_byte_stride, //
f + (y + i) * f_byte_stride, width);
}
// Write the tile
if (m_tile_writer)
{
mln::experimental::box2d region(x0, y0 + y, width, hroi);
m_tile_writer->write_tile(f + y * f_byte_stride, f_byte_stride, region);
}
}
}
}
// Apply the running max over `roi`, one tile of at most get_block_width() columns at a time.
//
//   roi           region to process
//   k             radius of the line; every scratch buffer carries k extra rows
//                 before and after the rows of the region
//   use_extension forwarded unchanged to running_max_block2d
//   vertical      when false, the x/y origin and the width/height of `roi` are
//                 swapped so that the tiling always runs in the vertical layout
//
// Input and output go through the tile loader / writer registered on this object.
void vertical_running_max_algo_base_t::execute(mln::experimental::box2d roi, int k, bool use_extension, bool vertical)
{
  // Owner of one scratch buffer. The storage is released on every exit path, and
  // new[] reports an allocation failure by throwing instead of handing back a
  // null pointer that the pointer arithmetic below would then offset.
  struct scratch_buffer
  {
    explicit scratch_buffer(std::size_t byte_count) : bytes(new std::byte[byte_count]) {}
    ~scratch_buffer() { delete[] bytes; }
    scratch_buffer(const scratch_buffer&) = delete;
    scratch_buffer& operator=(const scratch_buffer&) = delete;

    std::byte* const bytes;
  };

  const int            kBlockWidth    = this->get_block_width();
  const std::size_t    sz             = this->get_sample_size();
  const std::ptrdiff_t kBlockByteSize = static_cast<std::ptrdiff_t>(sz * kBlockWidth);

  const int x0     = vertical ? roi.x() : roi.y();
  const int y0     = vertical ? roi.y() : roi.x();
  const int width  = vertical ? roi.width() : roi.height();
  const int height = vertical ? roi.height() : roi.width();

  // One tile-wide buffer per working array, with k rows of margin on both ends
  const std::size_t buffer_bytes = static_cast<std::size_t>(kBlockByteSize * (height + 2 * k));
  scratch_buffer    f(buffer_bytes);
  scratch_buffer    g(buffer_bytes);
  scratch_buffer    h(buffer_bytes);

  // Row 0 of the tile starts after the leading margin
  std::byte* const f_shifted = f.bytes + k * kBlockByteSize;
  std::byte* const g_shifted = g.bytes + k * kBlockByteSize;
  std::byte* const h_shifted = h.bytes + k * kBlockByteSize;

  for (int x = 0; x < width; x += kBlockWidth)
  {
    const int w = std::min(kBlockWidth, width - x); // the last tile may be narrower

    mln::experimental::box2d region(x0 + x, y0, w, height);
    this->running_max_block2d(f_shifted, g_shifted, h_shifted, //
                              kBlockByteSize, kBlockByteSize, kBlockByteSize, region, k, use_extension);
  }
}
} // namespace mln::morpho::experimental::details
......@@ -115,10 +115,11 @@ public:
sup_t sup = {m_sup, m_sup_vec};
mln::morpho::experimental::details::vertical_running_max_algo_t<int, sup_t> algo(sup);
mln::experimental::box2d roi(width, height);
algo.running_max_block2d((std::byte*)(f.data() + radius * stride), //
(std::byte*)(g.data() + radius * stride), //
(std::byte*)(h.data() + radius * stride), //
width, height, stride * sizeof(int), stride * sizeof(int), stride * sizeof(int), radius,
stride * sizeof(int), stride * sizeof(int), stride * sizeof(int), roi, radius,
true);
compare_span2d(gref.data(), g.data(), width, height + 2 * radius);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment