Commit 40d51800 authored by Guillaume Lazzara's avatar Guillaume Lazzara
Browse files

Add a specific toolchain for Nepomuk integration.

	* convert/from_qimage.hh: New. Convert a QImage to an image2d.

	* core/line_info.hh: Add a new member has_text.

	* text/recognition.hh: Remove an invalid precondition.

	* toolchain/nepomuk/text_extraction.hh: New. Specific toolchain
	for Nepomuk.

	* tests/Makefile.am: Add toolchain/* subdirs.

	* tests/toolchain/Makefile.am,
	* tests/toolchain/nepomuk/Makefile.am: New.

	* tests/img/wildly.pbm: New. New test image.

	* tests/toolchain/nepomuk/text_extraction.cc: New. New test.
parent cbbdf4e0
2010-06-03 Guillaume Lazzara <z@lrde.epita.fr>
Add a specific toolchain for Nepomuk integration.
* convert/from_qimage.hh: New. Convert a QImage to an image2d.
* core/line_info.hh: Add a new member has_text.
* text/recognition.hh: Remove an invalid precondition.
* toolchain/nepomuk/text_extraction.hh: New. Specific toolchain
for Nepomuk.
* tests/Makefile.am: Add toolchain/* subdirs.
* tests/toolchain/Makefile.am,
* tests/toolchain/nepomuk/Makefile.am: New.
* tests/img/wildly.pbm: New. New test image.
* tests/toolchain/nepomuk/text_extraction.cc: New. New test.
2010-05-25 Guillaume Lazzara <z@lrde.epita.fr>
Cleanup sample tools.
......
// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
//
// This file is part of Olena.
//
// Olena is free software: you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation, version 2 of the License.
//
// Olena is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Olena. If not, see <http://www.gnu.org/licenses/>.
//
// As a special exception, you may use this file as part of a free
// software project without restriction. Specifically, if other files
// instantiate templates or use macros or inline functions from this
// file, or you compile this file and link it with other files to produce
// an executable, this file does not by itself cause the resulting
// executable to be covered by the GNU General Public License. This
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
#ifndef SCRIBO_CONVERT_FROM_QIMAGE_HH
# define SCRIBO_CONVERT_FROM_QIMAGE_HH
/// \file
///
/// Convert a QImage to a Milena image2d.
# include <cstring>
# include <QtGui/QImage>
# include <mln/core/image/image2d.hh>
# include <mln/value/qt/rgb32.hh>
# if QT_VERSION < 0x040000
# error "Qt library too old. You need at least Qt 4.x."
# endif // ! QT_VERSION
namespace scribo
{
namespace convert
{
/*! \brief Convert a QImage to mln::image2d.

    \param[in] ima A QImage. Prefer using QImage::Format_RGB32
    image format to avoid conversions.

    \return A 2D image in Milena's format, whose values are of type
    mln::value::qt::rgb32.
*/
mln::image2d<mln::value::qt::rgb32>
from_qimage(const QImage& ima);
# ifndef MLN_INCLUDE_ONLY
// Convert \p ima to a Milena image holding mln::value::qt::rgb32 values.
//
// The pixels are copied, so the result does not share memory with
// \p ima.  Images that are not already in QImage::Format_RGB32 are
// converted to that format first.
//
// This function is defined in a header, hence `inline': without it,
// every translation unit including this file would emit its own
// definition and the program would fail to link.
inline
mln::image2d<mln::value::qt::rgb32>
from_qimage(const QImage& ima)
{
  mln_precondition(!ima.isNull());

  // Keep the working image const: the const overload of bits() does
  // not detach, so no deep copy of the (implicitly shared) pixel data
  // is made when no format conversion is required.
  const QImage tmp = (ima.format() == QImage::Format_RGB32)
    ? ima
    : ima.convertToFormat(QImage::Format_RGB32);

  const int
    nrows = tmp.height(),
    ncols = tmp.width();

  // Third argument is 0 (presumably the border thickness), so that
  // the buffer is expected to hold exactly nrows * ncols contiguous
  // values.
  mln::image2d<mln::value::qt::rgb32> output(nrows, ncols, 0);

  // Each pixel is copied as 4 bytes; the scan lines of a
  // Format_RGB32 image are therefore contiguous in memory
  // (bytes per line == 4 * width) and can be copied in one go.
  const unsigned bytes_per_pixel = 4;
  std::memcpy(output.buffer(),
              tmp.bits(),
              output.nelements() * bytes_per_pixel);

  return output;
}
# endif // ! MLN_INCLUDE_ONLY
} // end of namespace scribo::convert
} // end of namespace scribo
#endif // ! SCRIBO_CONVERT_FROM_QIMAGE_HH
......@@ -125,6 +125,7 @@ namespace scribo
bool indented() const;
bool has_text() const;
const std::string& text() const;
void update_text(const std::string& str);
......@@ -586,6 +587,13 @@ namespace scribo
return indented_;
}
template <typename L>
bool
line_info<L>::has_text() const
{
  // A line has text as soon as its stored text string is not empty.
  const bool no_text = text_.empty();
  return !no_text;
}
template <typename L>
const std::string&
line_info<L>::text() const
......
......@@ -25,6 +25,7 @@ SUBDIRS = \
preprocessing \
table \
text \
toolchain \
unit_test
# Regen files recursively.
......
# Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE).
#
# This file is part of Olena.
#
# Olena is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, version 2 of the License.
#
# Olena is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Olena. If not, see <http://www.gnu.org/licenses/>.
#
## Process this file through Automake to create Makefile.in.
include $(top_srcdir)/scribo/tests/tests.mk

# Sub-directories holding the tests of each specific toolchain.
SUBDIRS = nepomuk
\ No newline at end of file
# Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE).
#
# This file is part of Olena.
#
# Olena is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, version 2 of the License.
#
# Olena is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Olena. If not, see <http://www.gnu.org/licenses/>.
#
## Process this file through Automake to create Makefile.in.
include $(top_srcdir)/scribo/tests/tests.mk

check_PROGRAMS =

# The Nepomuk toolchain test is only built when both Qt and Tesseract
# have been detected.
if HAVE_QT
if HAVE_TESSERACT

check_PROGRAMS += text_extraction
text_extraction_SOURCES = text_extraction.cc
text_extraction_CXXFLAGS = $(QT_CXXFLAGS) $(AM_CXXFLAGS)
text_extraction_CPPFLAGS = $(QT_CPPFLAGS) $(AM_CPPFLAGS) \
                           $(TESSERACT_CPPFLAGS) \
                           $(TIFF_CPPFLAGS)

# A per-target _LDFLAGS replaces AM_LDFLAGS, so inherit from
# AM_LDFLAGS here.  The user's LDFLAGS is always appended to the link
# command by Automake and must not be repeated.
text_extraction_LDFLAGS = $(QT_LDFLAGS) $(AM_LDFLAGS) \
                          $(TESSERACT_LDFLAGS) \
                          $(TIFF_LDFLAGS)

# Libraries go in _LDADD so that they appear after the object files
# on the link command line.
text_extraction_LDADD = $(QT_LIBS) $(LDADD) -lpthread

endif HAVE_TESSERACT
endif HAVE_QT

TESTS = $(check_PROGRAMS)
// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
//
// This file is part of Olena.
//
// Olena is free software: you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation, version 2 of the License.
//
// Olena is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Olena. If not, see <http://www.gnu.org/licenses/>.
//
// As a special exception, you may use this file as part of a free
// software project without restriction. Specifically, if other files
// instantiate templates or use macros or inline functions from this
// file, or you compile this file and link it with other files to produce
// an executable, this file does not by itself cause the resulting
// executable to be covered by the GNU General Public License. This
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
/// \file
///
/// Test of scribo::toolchain::nepomuk::text_extraction
#include <QtGui/QImage>
#include <QtCore>
#include <scribo/toolchain/nepomuk/text_extraction.hh>
#include <scribo/tests/data.hh>
// Run the Nepomuk text extraction toolchain on a reference image and
// check that exactly one word, "Wildly", is recognized.
//
// Returns 0 on success and 1 on failure.
int main()
{
  QImage ima(SCRIBO_IMG_DIR "/wildly.pbm");

  // A null image means the reference file could not be loaded; fail
  // explicitly instead of feeding an invalid image to the toolchain.
  if (ima.isNull())
    return 1;

  QSet<QString> words = scribo::toolchain::nepomuk::text_extraction(ima);

  const bool ok = words.size() == 1 && words.contains("Wildly");
  mln_assertion(ok);

  // Also report the verdict through the exit status, so that the test
  // still checks something if assertions are disabled at build time.
  return ok ? 0 : 1;
}
......@@ -104,8 +104,6 @@ namespace scribo
{
trace::entering("scribo::text::recognition");
mln_precondition(lines.is_valid());
// Initialize Tesseract.
TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL);
......
// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
//
// This file is part of Olena.
//
// Olena is free software: you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation, version 2 of the License.
//
// Olena is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Olena. If not, see <http://www.gnu.org/licenses/>.
//
// As a special exception, you may use this file as part of a free
// software project without restriction. Specifically, if other files
// instantiate templates or use macros or inline functions from this
// file, or you compile this file and link it with other files to produce
// an executable, this file does not by itself cause the resulting
// executable to be covered by the GNU General Public License. This
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
#ifndef SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
# define SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
/// \file
///
/// Extract text from a document.
# include <QtCore/QString>
# include <QtCore/QStringList>
# include <QtGui/QImage>
# include <mln/core/image/image2d.hh>
# include <mln/data/transform.hh>
# include <mln/logical/not.hh>
# include <mln/value/qt/rgb32.hh>
# include <mln/fun/v2v/qt_rgb_to_int_u.hh>
# include <scribo/convert/from_qimage.hh>
# include <scribo/binarization/sauvola_ms.hh>
# include <scribo/preprocessing/deskew.hh>
# include <scribo/toolchain/text_in_doc.hh>
namespace scribo
{
namespace toolchain
{
namespace nepomuk
{
/*! \brief Extract text from a document.

    This is a convenient routine to be used in Nepomuk.

    \param[in] input A document image. It must not be null.

    \return A set of recognized words.
*/
QSet<QString>
text_extraction(const QImage& input);
# ifndef MLN_INCLUDE_ONLY
// Extract the set of words recognized in the document image \p input.
//
// The image is converted to Milena's format, turned into a gray-level
// image, deskewed and binarized.  The document toolchain is then run
// twice, on the binarized image and on its negation, and the words
// found by both runs are merged into a single set.
//
// \pre \p input is not a null image.
//
// This function is defined in a header, hence `inline': without it,
// every translation unit including this file would emit its own
// definition and the program would fail to link.
inline
QSet<QString>
text_extraction(const QImage& input)
{
  trace::entering("scribo::toolchain::nepomuk::text_extraction");

  mln_precondition(!input.isNull());

  typedef image2d<scribo::def::lbl_type> L;

  // Convert image to Milena's format.
  mln::image2d<mln::value::qt::rgb32>
    input_mln = scribo::convert::from_qimage(input);

  image2d<bool> input_bin;

  // Preprocess
  {
    // Convert to Gray level image.
    image2d<value::int_u8>
      input_gl = data::transform(input_mln,
                                 mln::fun::v2v::qt_rgb_to_int_u<8>());

    // Deskew if needed.
    input_gl = preprocessing::deskew(input_gl);

    // Binarize foreground to use it in the processing chain.
    input_bin = scribo::binarization::sauvola_ms(input_gl, 101, 3);
  }

  line_set<L> lines_bg, lines_fg;

  // Process
  {
    // Run document toolchain.
    lines_bg = scribo::toolchain::text_in_doc(input_bin, false, false);

    // Negate document.
    logical::not_inplace(input_bin);

    // Run document toolchain.
    lines_fg = scribo::toolchain::text_in_doc(input_bin, false, false);
  }

  QSet<QString> output;

  // Construct output
  {
    const QLatin1Char space(' ');

    // Concatenate the text of all the lines, separated by spaces.
    // Line texts are decoded as UTF-8 with QString::fromUtf8, which
    // avoids dereferencing a QTextCodec pointer that may be null.
    QString all_text;

    for_all_lines(l, lines_bg)
      if (lines_bg(l).has_text())
      {
        all_text += space;
        all_text += QString::fromUtf8(lines_bg(l).text().c_str());
      }

    for_all_lines(l, lines_fg)
      if (lines_fg(l).has_text())
      {
        all_text += space;
        all_text += QString::fromUtf8(lines_fg(l).text().c_str());
      }

    // Split into words; building a set removes the duplicates.
    QStringList list = all_text.split(space, QString::SkipEmptyParts);
    output = QSet<QString>::fromList(list);
  }

  trace::exiting("scribo::toolchain::nepomuk::text_extraction");
  return output;
}
# endif // ! MLN_INCLUDE_ONLY
} // end of namespace scribo::toolchain::nepomuk
} // end of namespace scribo::toolchain
} // end of namespace scribo
#endif // ! SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment