...
 
Commits (3)
...@@ -74,16 +74,22 @@ ...@@ -74,16 +74,22 @@
C++ scientific library.} C++ scientific library.}
} }
@misc{pybind11, @inproceedings{expsimd,
author = {Wenzel Jakob and Jason Rhinelander and Dean Moldovan}, author = {Malossi, A. Cristiano I. and Ineichen, Yves and Bekas, Costas and Curioni, Alessandro},
year = {2017}, year = {2015},
note = {https://github.com/pybind/pybind11}, month = {01},
title = {pybind11 -- Seamless operability between C++11 and Python} pages = {},
} title = {Fast Exponential Computation on SIMD Architectures},
doi = {10.13140/2.1.4362.3207}
@misc{niebler1999boost,
title={Boost},
author={Niebler, Eric},
year={1999}
} }
@article{tiling_performances,
author = {Wittenbrink, Craig and Somani, Arun},
year = {1993},
month = {01},
pages = {12-22},
title = {Cache tiling for high performance morphological image processing},
volume = {7},
journal = {Machine Vision and Applications},
doi = {10.1007/BF01212412}
}
\ No newline at end of file
...@@ -2,7 +2,8 @@ ...@@ -2,7 +2,8 @@
\usepackage[utf8]{inputenc} \usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc} \usepackage[T1]{fontenc}
\usepackage[french]{babel} \usepackage[english]{babel}
\usepackage[]{wrapfig}
\usetheme{Warsaw} \usetheme{Warsaw}
...@@ -30,49 +31,95 @@ ...@@ -30,49 +31,95 @@
\maketitle \maketitle
\begin{frame} \begin{frame}
\frametitle{Rappel: la situation} \frametitle{The context}
\structure{Point de départ.} Milena, une bibliothèque \emph{générique et performante} \structure{The library} \\
de traitement d'image codée en C++. \textit{Pylene} is \emph{generic} image processing library written in C++
\footnote{\tiny Practical Genericity : Writing Image Processing Algorithms Both Reusable \footnote{\tiny Practical Genericity : Writing Image Processing Algorithms Both Reusable
and Efficient.R. Levillain et al., \textit{ICPR'14}.} and Efficient.R. Levillain et al., \textit{ICPR'14}.}
\footnote{\tiny An Image Processing Library in Modern C++: Getting Simplicity and Efficiency \footnote{\tiny An Image Processing Library in Modern C++: Getting Simplicity and Efficiency
with Generic Programming. M. Roynard, E. Carlinet, T. Géraud, \textit{RRPR'18}.}\\[1pt] with Generic Programming. M. Roynard, E. Carlinet, T. Géraud, \textit{RRPR'18}.} \\
It contains a wide variety of algorithms, grouped in three main categories:
\begin{itemize}
\item Point-wise (PW) algorithms
\item Local algorithms
\item Global algorithms
\end{itemize}
\end{frame}
\structure{Objectif.} Faciliter l'usage d'Olena à travers une interface \begin{frame}
Python.\\[1pt] \frametitle{The context - bis}
\structure{Performances} \\
Pylene as a library is already relatively fast but we know that it can be faster than even
OpenCV if we add parallelism and/or other speedup mechanics.
\footnote{\tiny Paper being worked on by M.Roynard \& E.Carlinet}
(The following screenshot shows results using tiling \footnote{\tiny Cache tiling for high performance morphological image processing. Wittenbrink, A. et al \textit{Machine Vision and Applications, 1993}.} with SIMD)
\includegraphics[width=6cm]{figs/bench_pln_vs_cv.png}
\end{frame}
\structure{Difficultés.} \begin{frame}
\frametitle{Regarding performances - SIMD}
{\emph{SIMD}, or \emph{vectorization}, is the process of treating variables as part of a vector of data.
Instead of running instructions one by one, we run them all at the same time (limited by architectures).}
\\
\includegraphics[width=7cm]{figs/simd.png}
\end{frame}
\begin{frame}
\frametitle{Regarding performances - Tiling}
{\emph{Tiling} is a technique in image processing where you process an image by pieces, rather than trying to fit it all in memory.}
\begin{wrapfigure}{r}{9cm}
\includegraphics[width=9cm]{figs/tiling.png}
\end{wrapfigure}
\end{frame}
\begin{frame}
\frametitle{Objective and problems}
\structure{Objective: improving our performances} \\
Improve the performances of the library through the aforementioned means.
The current short- and long-term points of interest:
\begin{itemize}
\item Short term: work on PW algorithms
\item Longer term: work on local algorithms
\end{itemize}
\structure{Problems}
\begin{itemize} \begin{itemize}
\item Appeler du code \textit{statique} (templates) depuis un \item SIMD not always available
environnement \textit{dynamique}\\ \item \textbf{Scalability and genericity have to be maintained}
\item Compatibilité avec Numpy
\end{itemize} \end{itemize}
\end{frame} \end{frame}
\begin{frame}
\frametitle{The solutions}
\begin{itemize}
\item \underline{SIMD}: T.B.D., but it doesn't look like a lost cause \footnote{\tiny Fast Exponential Computation on SIMD Architectures. Malossi, A et al. (2015)}
\item \underline{Scalable and generic parallel coding}: Having meta classes that are simd-compatible acting as bases for the algorithms
\end{itemize}
\end{frame}
\begin{frame}[fragile] \begin{frame}[fragile]
\frametitle{La progression} \frametitle{Work done so far}
\begin{itemize} \begin{itemize}
\item Implémentation d'un container, \textit{any\_ref}, inspiré de Boost \item Reading documentation
\footnote{\tiny Boost. E. Niebler et al., \textit{1999}.} \item Understanding our code
\item Implémentation d'une méthode de conversion d'un type à un autre dans les \textit{value\_set} \item Trying to emulate code already written for PW algorithms
\end{itemize} \end{itemize}
On obtient ainsi des résultats très concrets, comme le montre la présentation qui suit!
\end{frame} \end{frame}
\begin{frame} \begin{frame}
\frametitle{Les prochains pas} \frametitle{Next steps}
\begin{itemize} \begin{itemize}
\item Rajouter les différentes fonctions non implémentées qui servent à traiter des images 2D \item Discussion with Edwin regarding implementation details
\item Permettre l'utilisation de coercision en tandem avec de la répartition dynamique \item Probably more papers to read
\item Utiliser la compilation à la volée quand/si possible
\end{itemize} \end{itemize}
\end{frame} \end{frame}
\begin{frame} \begin{frame}
\frametitle{Bibliographie} \frametitle{Bibliography}
\scriptsize \scriptsize
\nocite{levillain.14.ciarp, roynard.18.rrpr, pybind11, niebler1999boost} \nocite{levillain.14.ciarp, roynard.18.rrpr, expsimd,tiling_performances}
\bibliography{biblio} \bibliography{biblio}
\bibliographystyle{apalike} \bibliographystyle{apalike}
\end{frame} \end{frame}
......
*.pdf
*.log
*.aux
*.bbl
*.blg
*.fdb_latexmk
*.fls
*.nav
*.out
*.snm
*.synctex.gz
*.toc
*.vrb
tmp*
share/
# Build rules are pulled in from the share/ directory.
# NOTE(review): presumably share/make/tex.mk supplies the rule that builds
# slides.pdf and reads TEXI2PDFFLAGS -- confirm against that file.
include share/make/share.mk
include share/make/tex.mk

# Extra option appended to the flags variable.
TEXI2PDFFLAGS += --shell-escape

# Neither target names a file it creates, so keep make from being fooled by
# a stray file called "all" or "clean".
.PHONY: all clean

all: slides.pdf

clean:
	${RM} slides.pdf* tmp*
@InProceedings{ levillain.14.ciarp,
author = {Roland Levillain and Thierry G\'eraud and Laurent Najman
and Edwin Carlinet},
title = {Practical Genericity: Writing Image Processing Algorithms
Both Reusable and Efficient},
booktitle = {Progress in Pattern Recognition, Image Analysis, Computer
Vision, and Applications -- Proceedings of the 19th
Iberoamerican Congress on Pattern Recognition (CIARP)},
address = {Puerto Vallarta, Mexico},
month = nov,
year = {2014},
pages = {70--79},
editor = {Eduardo Bayro and Edwin Hancock},
publisher = {Springer-Verlag},
series = {Lecture Notes in Computer Science},
volume = {8827},
lrdeprojects = {Olena},
abstract = {An important topic for the image processing and pattern
recognition community is the construction of open source
and efficient libraries. An increasing number of software
frameworks are said to be generic: they allow users to
write reusable algorithms compatible with many input image
types. However, this design choice is often made at the
expense of performance. We present an approach to preserve
efficiency in a generic image processing framework, by
leveraging data types features. Variants of generic
algorithms taking advantage of image types properties can
be defined, offering an adjustable trade-off between
genericity and efficiency. Our experiments show that these
generic optimizations can match dedicated code in terms of
execution times, and even sometimes perform better than
routines optimized by hand. Digital Topology software
should reflect the generality of the underlying
mathematics: mapping the latter to the former requires
genericity. By designing generic solutions, one can
effectively reuse digital topology data structures and
algorithms. We propose an image processing framework
focused on the Generic Programming paradigm in which an
algorithm on the paper can be turned into a single code,
written once and usable with various input types. This
approach enables users to design and implement new methods
at a lower cost, try cross-domain experiments and help
generalize results.},
keywords = {Generic Programming, Image Processing, Performance,
Olena},
lrdepaper = {http://www.lrde.epita.fr/dload/papers/levillain.14.ciarp.pdf},
lrdeslides = {http://www.lrde.epita.fr/dload/papers/levillain.14.ciarp.slides.pdf},
lrdenewsdate = {2014-09-10}
}
@InProceedings{ roynard.18.rrpr,
title = {An Image Processing Library in Modern {C++}: Getting
Simplicity and Efficiency with Generic Programming},
author = {Micha\"el Roynard and Edwin Carlinet and Thierry G\'eraud},
booktitle = {Proceedings of the 2nd Workshop on Reproducible Research
in Pattern Recognition (RRPR)},
year = {2018},
abstract = {As there are as many clients as many usages of an Image
Processing library, each one may expect different services
from it. Some clients may look for efficient and
production-quality algorithms, some may look for a large
tool set, while others may look for extensibility and
genericity to inter-operate with their own code base... but
in most cases, they want a simple-to-use and stable
product. For a C++ Image Processing library designer, it is
difficult to conciliate genericity, efficiency and
simplicity at the same time. Modern C++ (post 2011) brings
new features for library developers that will help
designing a software solution combining those three points.
In this paper, we develop a method using these facilities
to abstract the library components and augment the
genericity of the algorithms. Furthermore, this method is
not specific to image processing; it can be applied to any
C++ scientific library.}
}
NOTE(review): this inproceedings record has no booktitle, which standard
BibTeX styles usually report as a missing field; add the venue once it has
been confirmed. The empty pages field was dropped (an empty field is
equivalent to a missing one) and the month now uses the standard macro,
like the other records in this file.
@inproceedings{expsimd,
author = {Malossi, A. Cristiano I. and Ineichen, Yves and Bekas, Costas and Curioni, Alessandro},
title = {Fast Exponential Computation on SIMD Architectures},
year = {2015},
month = jan,
doi = {10.13140/2.1.4362.3207}
}
Journal article cited on the slides for the cache-tiling technique.
Page ranges use an en-dash (two hyphens) and the month uses the standard
three-letter macro, like the other records in this file.
@article{tiling_performances,
author = {Wittenbrink, Craig and Somani, Arun},
title = {Cache tiling for high performance morphological image processing},
journal = {Machine Vision and Applications},
volume = {7},
pages = {12--22},
year = {1993},
month = jan,
doi = {10.1007/BF01212412}
}
\ No newline at end of file
\documentclass[bigger]{beamer}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[english]{babel}
\usepackage[]{wrapfig}
\usepackage{tikz}
\usetikzlibrary{positioning,calc,shapes,arrows}
\usetheme{Warsaw}
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{footline} [frame number]
% Redefine \maketitle so that it emits a complete title frame: the standard
% beamer title page followed by a right-aligned logo (figs/logo, 3cm wide).
% The trailing % on each line keeps end-of-line spaces out of the macro body.
\renewcommand{\maketitle}
{%
\begin{frame}
\titlepage%
% Negative skip: moves the logo 10mm up, towards the title block.
\vspace{-10mm}%
\begin{flushright}%
\includegraphics[width=3cm]{figs/logo}%
\end{flushright}%
\end{frame}%
}
% Title-page metadata. The bracketed optional arguments are the short forms
% that beamer substitutes where a compact version is needed.
\date[25-03-2020]{Lightning Talk \#2: March 25, 2020}
\author{Celian \textsc{Gossec}}
\title[Parallelism in Pylene]{Introducing parallelism in a generic image
processing framework for fun \& performances}
\institute[LRDE]{LRDE\\\textit{Laboratoire de Recherche et Développement de l'EPITA}}
% Redefines \footnotesize for the whole deck as 6pt type on an 8pt baseline;
% the class-name labels in the design-pattern diagram rely on it.
\renewcommand{\footnotesize}{\fontsize{6pt}{8pt}\selectfont}
\begin{document}
\maketitle
\begin{frame}
\frametitle{The context}
% Introduces the library and its three algorithm families. The two footnotes
% point at the papers stored in biblio.bib as levillain.14.ciarp and
% roynard.18.rrpr; the venue abbreviations follow those records.
\structure{The library} \\
\textit{Pylene} is a \emph{generic} image processing library written in C++
\footnote{\tiny Practical Genericity: Writing Image Processing Algorithms Both Reusable
and Efficient. R.~Levillain et al., \textit{CIARP'14}.}
\footnote{\tiny An Image Processing Library in Modern C++: Getting Simplicity and Efficiency
with Generic Programming. M.~Roynard, E.~Carlinet, T.~Géraud, \textit{RRPR'18}.} \\
It contains a wide variety of algorithms, grouped in three main categories:
\begin{itemize}
\item Point-wise (PW) algorithms
\item Local algorithms
\item Global algorithms
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{The context - bis}
% Motivation slide: existing benchmark evidence that tiling plus SIMD pays off.
% The tiling footnote credits both authors of the tiling_performances record
% in biblio.bib.
\structure{Performances} \\
Pylene as a library is already relatively fast, but we know that it can be even faster than
OpenCV if we add parallelism and/or other speedup mechanics.
\footnote{\tiny Paper being worked on by M.~Roynard \& E.~Carlinet}
(Michaël and Edwin worked on benchmarks that showed a speedup coefficient of up to 4x using tiling
\footnote{\tiny Cache tiling for high performance morphological image processing. C.~Wittenbrink, A.~Somani, \textit{Machine Vision and Applications, 1993}.}
and SIMD)
\end{frame}
\begin{frame}
\frametitle{Regarding performances - SIMD}
% Definition of SIMD, then the illustration on the following line.
% Source is laid out one clause per line; the typeset paragraph is unchanged.
\emph{SIMD}, or \emph{vectorization},
is the process of treating variables as part of a vector of data.
Instead of running instructions one by one,
we run them all at the same time (limited by architectures).
\\
\includegraphics[width=7cm]{figs/simd.png}
\end{frame}
\begin{frame}
\frametitle{Regarding performances - Tiling}
% Text and illustration sit side by side in beamer's own columns.
% The previous wrapfigure came after the paragraph, so there was no text left
% for it to wrap, and wrapfig is not reliable inside beamer frames. Widths are
% relative to the text area so the picture cannot overflow the slide.
\begin{columns}[c]
\begin{column}{0.4\textwidth}
\emph{Tiling} is a technique in image processing where you process an image by pieces, rather than trying to fit it all in memory.
\end{column}
\begin{column}{0.55\textwidth}
\includegraphics[width=\linewidth]{figs/tiling.png}
\end{column}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Objective and problems}
% Two parts: the goal with its short- and long-term scope, then the one
% constraint every solution has to respect.
\structure{Objective: improving our performances} \\
Improve the performances of the library through the aforementioned means.
The current short- and long-term points of interest:
% PW stands for point-wise, as introduced on the first context slide.
\begin{itemize}
\item Short term: work on PW algorithms
\item Longer term: work on local algorithms
\end{itemize}
\structure{Our main problem}
\begin{itemize}
\item \textbf{Scalability and genericity have to be maintained}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{The work done}
% Progress summary. Items are punctuated uniformly (no trailing period) and
% use the same "point-wise" spelling as the first context slide.
\begin{itemize}
\item Designing a durable code pattern that would work with every point-wise algorithm
\item Partial implementation of the chosen design pattern as a proof of concept (POC)
\item Adding benchmarks and tests for what has been implemented
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{The design pattern}
% Diagram layout: "Header" column on the left, "Source code" column on the
% right, separated by a vertical rule. The two concrete canvases on the left
% point at the abstract base on the right with inheritance arrows.
\begin{tikzpicture}
% Styles are declared with \tikzset; \tikzstyle is deprecated in PGF/TikZ.
\tikzset{
land/.style={draw},
obj/.style={draw, fill=red!20},
inheritance/.style={->, >=open triangle 90, thick},
line/.style={-, thick},
class/.style={draw, fill=red!30, rectangle split, rectangle split parts=2},
}
\draw node (header) {\tiny Header} -- ++(6,0) node (src) {\tiny Source code};
\coordinate (midBar) at ($(header)!0.4!(src)$); % 40% of the way from header to src
\draw (midBar) -- ++(0,-5);
\node[obj] [below = 0.4cm of header] (includehpp) {\tiny \#include <mln/algorithms.hpp> };
\node[obj] [below = 0.1cm of src] (includecpp1) {\tiny \#include <mln/parallel\_pointwise.hpp>};
\node[obj] [below = 0.7cm of src] (includecpp2) {\tiny \#include <tbb.h>};
% NOTE(review): operator() receives `tile` but the body shown passes
% this->gDomain to execTile -- confirm this matches the real code.
\node (canvas) [class, align=left] [below = 1.3cm of src]
{
\footnotesize ParallelPointwiseBase
\nodepart{second} \tiny operator()(mln::box2d tile) \{ this->execTile(this->gDomain); \} \\
\tiny virtual void execTile(t) const = 0; \\
\tiny virtual mln::box2d gDomain() const = 0;
};
\node (foreach) [class, align=left, anchor=north] [below = 1.3cm of header]
{
\footnotesize ForEachParallel<Img, Func>
\nodepart{second} \tiny box2d gDomain() \{ return Img.dom; \} \\
\tiny void execTile(t) \{ for\_each(t.elm, fun); \}
};
\node (transform) [class, align=left, anchor=north] [below = 3.4cm of header]
{
\footnotesize TransformParallel<Img, Img, Func>
\nodepart{second} \tiny box2d gDomain() \{ return Img.dom; \} \\
\tiny void execTile(t) \{ transform(t.elm, out.elm, fun); \}
};
% A plain * is used for the pointer: \* is LaTeX's discretionary
% multiplication sign and prints nothing unless a line break falls on it.
\node[draw, fill=red!50] (exec) [below = 4cm of src, align=left]
{
\tiny parallel\_call(Base* canvas) \{ \\
\tiny parallel\_for(canvas->size, *canvas) \\
\tiny \}
};
\draw[inheritance] (foreach) -- (canvas);
\draw[inheritance] (transform) -- (canvas);
\end{tikzpicture}
\begin{itemize}
\item soft dependency on tbb
\item parallel\_for calls operator(), allows for algorithm-specific optimizations
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Performances obtained}
% Two benchmarks, each with a one-line description, its result and its plot.
% The footnote text is attached to the first result; the second result reuses
% the same mark through the current counter value instead of a hard-coded
% number, so adding or removing footnotes earlier in the deck cannot break it.
% Plots are bounded in both width and height so that the pair fits on one
% slide whatever their aspect ratio.
For each pixel, increment value by 1 \\
Speedup negligible / no speedup (4.4G/s vs 4G/s)%
\footnote{\tiny All benchmarks were run on my lousy laptop}

\includegraphics[width=\linewidth,height=0.28\textheight,keepaspectratio]{figs/bench_easy.png}

For each pixel, apply a gamma correction \\
(pixel = pixel**(1/2.2)) \\
Speedup 5x (175M/s vs 36M/s)%
\footnotemark[\value{footnote}]

\includegraphics[width=\linewidth,height=0.28\textheight,keepaspectratio]{figs/bench_hard.png}
\end{frame}
\begin{frame}
\frametitle{Next steps}
% Outlook: finish the point-wise (PW) work, then design the equivalent
% pattern for local algorithms.
\begin{itemize}
\item Finishing implementation of PW algorithms
\item Thinking about the design pattern for the next step: local algorithms
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Bibliography}
% Reference list set in \scriptsize. The slides cite their sources through
% hand-written footnotes rather than \cite, so \nocite is what pulls these
% four keys into the list; the data comes from the biblio database and is
% formatted with the apalike style.
\scriptsize
\nocite{levillain.14.ciarp, roynard.18.rrpr, expsimd, tiling_performances}
\bibliography{biblio}
\bibliographystyle{apalike}
\end{frame}
\end{document}