Commit ff1e5aae authored by Guillaume Lazzara
Browse files

Prepare DIA tools for Nuxeo/XWiki.

	* scribo/src/Makefile.am: Add a new target.

	* scribo/src/text_in_article_preprocess.cc: New.

	* scribo/src/text_in_article_pbm.cc: Add optional cropping and
	make the debug output optional.

	* scribo/text/clean.hh: Improve cleanup.

	* scribo/text/recognition.hh: Remove last '\n' in Tesseract's
	output.
parent c7cb9f3c
2010-05-04 Guillaume Lazzara <z@lrde.epita.fr>
Prepare DIA tools for Nuxeo/XWiki.
* scribo/src/Makefile.am: Add a new target.
* scribo/src/text_in_article_preprocess.cc: New.
* scribo/src/text_in_article_pbm.cc: Add optional cropping and
make the debug output optional.
* scribo/text/clean.hh: Improve cleanup.
* scribo/text/recognition.hh: Remove last '\n' in Tesseract's
output.
2010-04-30 Guillaume Lazzara <z@lrde.epita.fr>
Improve OCR recognition.
......
......@@ -82,6 +82,15 @@ if HAVE_TIFF
text_in_article_pbm_SOURCES = text_in_article_pbm.cc
bin_PROGRAMS += text_in_article_preprocess
text_in_article_preprocess_CPPFLAGS = $(AM_CPPFLAGS) \
`Magick++-config --cppflags`
text_in_article_preprocess_LDFLAGS = $(AM_LDFLAGS) \
-lpthread `Magick++-config --libs`
text_in_article_preprocess_SOURCES = text_in_article_preprocess.cc
bin_PROGRAMS += text_in_photo_fast
text_in_photo_fast_SOURCES = text_in_photo_fast.cc
text_in_photo_fast_CPPFLAGS = $(AM_CPPFLAGS) \
......
......@@ -40,6 +40,8 @@
#include <mln/value/rgb8.hh>
#include <mln/value/label_16.hh>
#include <mln/data/paste_without_localization.hh>
#include <scribo/core/line_set.hh>
#include <scribo/primitive/extract/components.hh>
......@@ -85,6 +87,10 @@ const char *args_desc[][2] =
for the background." },
{ "out.txt", "Text output" },
{ "denoise", "1 enables denoising, 0 disables it. (enabled by default)" },
{ "pmin_row", "Row index of the top left corner of the Region of interest." },
{ "pmin_col", "Col index of the top left corner of the Region of interest." },
{ "pmax_row", "Row index of the bottom right corner of the Region of interest." },
{ "pmax_col", "Col index of the bottom right corner of the Region of interest." },
{ "debug_dir", "Output directory for debug image" },
{0, 0}
};
......@@ -95,14 +101,20 @@ int main(int argc, char* argv[])
using namespace scribo;
using namespace mln;
if (argc != 3 && argc != 4 && argc != 5)
if (argc != 3 && argc != 4 && argc != 5 && argc != 8 && argc != 9)
return scribo::debug::usage(argv,
"Find text lines using left/right validation and display x-height in a binarized article.",
"input.pbm out.txt <denoise: 0|1> <debug_dir>",
"input.pbm out.txt <denoise: 0|1> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <debug_dir>",
args_desc);
if (argc == 5)
scribo::make::internal::debug_filename_prefix = argv[4];
bool debug = false;
// Enable debug output.
if (argc == 5 || argc == 9)
{
scribo::make::internal::debug_filename_prefix = argv[argc - 1];
debug = true;
}
trace::entering("main");
......@@ -110,6 +122,27 @@ int main(int argc, char* argv[])
image2d<bool> input;
mln::io::pbm::load(input, argv[1]);
// Optional Cropping
if (argc >= 8)
{
def::coord
minr = atoi(argv[4]),
minc = atoi(argv[5]),
maxr = atoi(argv[6]),
maxc = atoi(argv[7]);
box2d roi = mln::make::box2d(minr, minc, maxr, maxc);
image2d<bool> tmp(maxr - minr + 1, maxc - minc + 1);
data::paste_without_localization(input | roi, tmp);
input = tmp;
if (debug)
mln::io::pbm::save(input,
scribo::make::debug_filename("input_cropped.pbm"));
}
typedef value::label_16 V;
typedef image2d<V> L;
......@@ -129,7 +162,9 @@ int main(int argc, char* argv[])
// whitespaces += separators;
mln::io::pbm::save(separators, "vseparators.pbm");
if (debug)
mln::io::pbm::save(separators,
scribo::make::debug_filename("vseparators.pbm"));
// mln::io::pbm::save(whitespaces, "separators.pbm");
// mln::io::pbm::save(input_cleaned, "input_no_separators.pbm");
......@@ -214,18 +249,20 @@ int main(int argc, char* argv[])
//===== DEBUG =====
// Bboxes image.
scribo::debug::save_bboxes_image(input, lines,
scribo::make::debug_filename("step1_bboxes.ppm"));
if (debug)
{
// Bboxes enlarged
mln::io::ppm::save(scribo::debug::bboxes_enlarged_image(input, lines),
scribo::make::debug_filename("step1_bboxes_enlarged.ppm"));
// Bboxes image.
scribo::debug::save_bboxes_image(input, lines,
scribo::make::debug_filename("step1_bboxes.ppm"));
// Looks like a text line
mln::io::ppm::save(scribo::debug::looks_like_a_text_line_image(input, lines),
scribo::make::debug_filename("step1_looks_like_a_text_line.ppm"));
// Bboxes enlarged
mln::io::ppm::save(scribo::debug::bboxes_enlarged_image(input, lines),
scribo::make::debug_filename("step1_bboxes_enlarged.ppm"));
// Looks like a text line
mln::io::ppm::save(scribo::debug::looks_like_a_text_line_image(input, lines),
scribo::make::debug_filename("step1_looks_like_a_text_line.ppm"));
// // Bboxes + line infos
// {
......@@ -261,10 +298,11 @@ int main(int argc, char* argv[])
// }
// mean and base lines.
mln::io::ppm::save(scribo::debug::mean_and_base_lines_image(input, lines),
scribo::make::debug_filename("step1_x_height.ppm"));
// mean and base lines.
mln::io::ppm::save(scribo::debug::mean_and_base_lines_image(input, lines),
scribo::make::debug_filename("step1_x_height.ppm"));
}
//===== END OF DEBUG =====
......@@ -277,56 +315,62 @@ int main(int argc, char* argv[])
//===== DEBUG =====
// mean and base lines.
mln::io::ppm::save(scribo::debug::mean_and_base_lines_image(input, lines),
scribo::make::debug_filename("step2_x_height.ppm"));
if (debug)
{
// Looks like a text line
mln::io::ppm::save(scribo::debug::looks_like_a_text_line_image(input, lines),
scribo::make::debug_filename("step2_looks_like_a_text_line.ppm"));
// mean and base lines.
mln::io::ppm::save(scribo::debug::mean_and_base_lines_image(input, lines),
scribo::make::debug_filename("step2_x_height.ppm"));
// Bboxes image.
scribo::debug::save_bboxes_image(input, lines,
scribo::make::debug_filename("step2_bboxes.ppm"));
// Looks like a text line
mln::io::ppm::save(scribo::debug::looks_like_a_text_line_image(input, lines),
scribo::make::debug_filename("step2_looks_like_a_text_line.ppm"));
// Bboxes image.
scribo::debug::save_bboxes_image(input, lines,
scribo::make::debug_filename("step2_bboxes.ppm"));
{
std::ofstream file(scribo::make::debug_filename("step2_bboxes_100p.txt").c_str());
// std::ofstream file_50p(scribo::make::debug_filename("step2_bboxes_50p.txt").c_str());
for_all_lines(l, lines)
if (lines(l).tag() != line::Merged
&& lines(l).tag() != line::Ignored
&& lines(l).tag() != line::Pathological)
{
file << lines(l).bbox().pmin().row() << " "
<< lines(l).bbox().pmin().col() << " "
<< lines(l).bbox().pmax().row() << " "
<< lines(l).bbox().pmax().col() << " "
<< lines(l).card() << " "
<< lines(l).baseline() << " "
<< lines(l).x_height() << " "
<< lines(l).meanline() << " "
<< lines(l).d_height() << " "
<< lines(l).a_height() << " "
<< lines(l).char_space() << " "
<< lines(l).char_width() << std::endl;
// file_50p << lines(l).bbox().pmin().row() / 2 << " "
// << lines(l).bbox().pmin().col() / 2 << " "
// << lines(l).bbox().pmax().row() / 2 << " "
// << lines(l).bbox().pmax().col() / 2 << std::endl;
}
file.close();
// file_50p.close();
}
}
//===== END OF DEBUG =====
// {
// std::ofstream file(scribo::make::debug_filename("step2_bboxes_100p.txt").c_str());
// // std::ofstream file_50p(scribo::make::debug_filename("step2_bboxes_50p.txt").c_str());
// for_all_lines(l, lines)
// if (lines(l).tag() != line::Merged
// && lines(l).tag() != line::Ignored
// && lines(l).tag() != line::Pathological)
// {
// file << lines(l).bbox().pmin().row() << " "
// << lines(l).bbox().pmin().col() << " "
// << lines(l).bbox().pmax().row() << " "
// << lines(l).bbox().pmax().col() << " "
// << lines(l).card() << " "
// << lines(l).baseline() << " "
// << lines(l).x_height() << " "
// << lines(l).meanline() << " "
// << lines(l).d_height() << " "
// << lines(l).a_height() << " "
// << lines(l).char_space() << " "
// << lines(l).char_width() << std::endl;
// // file_50p << lines(l).bbox().pmin().row() / 2 << " "
// // << lines(l).bbox().pmin().col() / 2 << " "
// // << lines(l).bbox().pmax().row() / 2 << " "
// // << lines(l).bbox().pmax().col() / 2 << std::endl;
// }
// file.close();
// // file_50p.close();
// }
//===== END OF DEBUG =====
scribo::io::xml::save_text_lines(argv[1], lines, "out.xml");
scribo::io::xml::save_text_lines(argv[1], lines, "out.xml");
......
// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
//
// This file is part of Olena.
//
// Olena is free software: you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation, version 2 of the License.
//
// Olena is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Olena. If not, see <http://www.gnu.org/licenses/>.
//
// As a special exception, you may use this file as part of a free
// software project without restriction. Specifically, if other files
// instantiate templates or use macros or inline functions from this
// file, or you compile this file and link it with other files to produce
// an executable, this file does not by itself cause the resulting
// executable to be covered by the GNU General Public License. This
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
#include <libgen.h>
#include <iostream>
#include <mln/core/image/image2d.hh>
#include <mln/io/magick/load.hh>
#include <mln/io/pbm/save.hh>
#include <mln/value/rgb8.hh>
#include <mln/logical/not.hh>
#include <scribo/binarization/sauvola_ms.hh>
#include <scribo/debug/usage.hh>
#include <scribo/preprocessing/split_bg_fg.hh>
/// Command-line help table: each row is { argument name, description }.
/// It is handed to scribo::debug::usage() by main() when the argument
/// count is invalid.  The final {0, 0} row presumably marks the end of
/// the table for usage() -- confirm against scribo/debug/usage.hh.
const char *args_desc[][2] =
{
  { "input.*", "An image." },
  // The second argument is the binarized image written by main(), not a
  // text file: the previous description was left over from another tool.
  { "output.pbm", "The binarized output image (PBM)." },
  { "enable fg/bg", "If set to 1 enables foreground extraction. (disabled by default)" },
  { "lambda", "Lambda used in remove fg/bg (Automaticaly deduced by default)." },
  {0, 0}
};
/// Preprocess a color document into a binary image.
///
/// Command line: input.* output.pbm <enable fg/bg> <lambda>
///   - argv[1]: input image, loaded through io::magick::load.
///   - argv[2]: path of the PBM image written at the end.
///   - argv[3]: optional; foreground extraction runs only when it is 1.
///   - argv[4]: optional; lambda forwarded to preprocessing::split_bg_fg.
///              When absent, 1.2 * (nrows + ncols) of the input is used.
///
/// Returns the value of scribo::debug::usage() on a bad argument count,
/// 0 otherwise.
int main(int argc, char* argv[])
{
  using namespace scribo;
  using namespace mln;

  if (argc != 3 && argc != 4 && argc != 5)
    return scribo::debug::usage(argv,
                                "Find text in a color document.",
                                "input.* output.pbm <enable fg/bg> <lambda>",
                                args_desc);

  image2d<value::rgb8> input_rgb;
  io::magick::load(input_rgb, argv[1]);

  // Lambda for the fg/bg split: user-provided, or derived from the
  // image dimensions.
  unsigned lambda;
  if (argc == 5)
    lambda = atoi(argv[4]);
  else
    lambda = static_cast<unsigned>(1.2 * (input_rgb.nrows() + input_rgb.ncols()));

  // Extract foreground.
  // The test used to be `argc == 4`, which meant that passing <lambda>
  // (argc == 5) silently disabled the extraction and the lambda parsed
  // above was never used.  Accept both argument counts.
  if (argc >= 4 && atoi(argv[3]) == 1)
  {
    std::cout << "Extracting foreground..." << std::endl;
    input_rgb = preprocessing::split_bg_fg(input_rgb, lambda, 32).second();
  }

  // Binarize foreground to use it in the processing chain.
  std::cout << "Binarizing foreground..." << std::endl;
  image2d<bool> input = scribo::binarization::sauvola_ms(input_rgb, 101, 3);

  // Invert the binarization result before saving it.
  // NOTE(review): presumably so that objects are set to 'true' -- confirm.
  logical::not_inplace(input);

  mln::io::pbm::save(input, argv[2]);

  return 0;
}
// Copyright (C) 2009 EPITA Research and Development Laboratory (LRDE)
// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory
// (LRDE)
//
// This file is part of Olena.
//
......@@ -113,14 +114,14 @@ namespace scribo
if (fact < 1)
{
std::cout << "Upsampling..." << " - "
<< std::ceil(fact) << std::endl;
while (fact < 1)
<< fact << std::endl;
while (fact < 0.90)
{
output = scribo::upsampling::bs2x(output); // 2x upsampling
fact *= 2.0f;
// std::cout << "fact = " << fact
// << " - output.domain = " << output.domain()
// << std::endl;
// std::cout << "fact = " << fact
// << " - output.domain = " << output.domain()
// << std::endl;
}
}
else if (fact > 2.5f)
......
......@@ -159,7 +159,7 @@ namespace scribo
/// text_ima_cleaned domain is larger than text_ima's.
I text_ima_cleaned = text::clean(lines(i), text_ima);
mln::io::pbm::save(text_ima_cleaned, mln::debug::filename("line.pbm", debug_id++));
// mln::io::pbm::save(text_ima_cleaned, mln::debug::filename("line.pbm", debug_id++));
// Setting objects to 'True'
logical::not_inplace(text_ima_cleaned);
......@@ -182,7 +182,11 @@ namespace scribo
{
std::cerr << s << std::endl;
if (output_file != 0)
file << lines(i).bbox() << " " << s << std::endl;
{
std::string str(s);
str = str.substr(0, str.length() - 1);
file << lines(i).bbox() << " " << str;
}
}
// The string has been allocated by Tesseract. We must free it.
......@@ -241,7 +245,11 @@ namespace scribo
{
std::cout << s << std::endl;
if (output_file != 0)
file << line.domain() << " " << s << std::endl;
{
std::string str(s);
str = str.substr(0, str.length() - 1);
file << line.domain() << " " << str;
}
}
// The string has been allocated by Tesseract. We must free it.
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment