Commit ca67a57f authored by Guillaume Lazzara
Browse files

Store OCR results in the line_info structure.

	* core/line_info.hh: Add a new attribute 'text'.

	* text/recognition.hh: Store results in the line_info structure.
parent 21af64ad
2010-05-25 Guillaume Lazzara <z@lrde.epita.fr>
Store OCR results in the line_info structure.
* core/line_info.hh: Add a new attribute 'text'.
* text/recognition.hh: Store results in the line_info structure.
2010-05-25 Guillaume Lazzara <z@lrde.epita.fr>
Improve debug outputs in Sauvola and make binarization tools
......
......@@ -40,6 +40,7 @@
# include <mln/accu/stat/median_h.hh>
# include <mln/accu/shape/bbox.hh>
# include <mln/util/object_id.hh>
# include <mln/value/int_u.hh>
# include <scribo/core/tag/component.hh>
# include <scribo/core/tag/line.hh>
......@@ -124,6 +125,9 @@ namespace scribo
bool indented() const;
const std::string& text() const;
void update_text(const std::string& str);
bool is_valid() const;
......@@ -232,6 +236,8 @@ namespace scribo
bool indented_;
std::string text_;
// Line set holding this element.
line_set<L> holder_;
......@@ -295,6 +301,8 @@ namespace scribo
indented_ = other.indented();
text_ = other.text();
holder_ = other.holder();
}
......@@ -578,6 +586,21 @@ namespace scribo
return indented_;
}
/// Read-only access to the text attached to this line.
///
/// \return A const reference to the internal text_ attribute; no
/// copy is made.
template <typename L>
const std::string& line_info<L>::text() const
{
  return this->text_;
}
/// Replace the text attached to this line.
///
/// \param[in] str The new text; its content is copied into the
/// internal text_ attribute, overwriting the previous value.
template <typename L>
void line_info<L>::update_text(const std::string& str)
{
  this->text_.assign(str);
}
template <typename L>
bool
......@@ -833,7 +856,9 @@ namespace scribo
else
char_width_ = char_width.to_result();
// FIXME: There is a bug here when the input document is too
// large: the baseline indices exceed the range of the value type
// used in the median accumulator.
baseline_ = absolute_baseline.to_result();
meanline_ = absolute_meanline.to_result();
x_height_ = absolute_baseline - absolute_meanline + 1;
......@@ -890,6 +915,7 @@ namespace scribo
<< ", orientation=" << info.orientation()
<< ", reading_orientation=" << info.reading_orientation()
<< ", indented=" << info.indented()
<< ", text=" << info.text()
<< ")" << std::endl;
}
......
......@@ -81,9 +81,7 @@ namespace scribo
//
template <typename L>
void
recognition(const line_set<L>& lines,
const char *language,
const char *output_file = 0);
recognition(line_set<L>& lines, const char *language);
/// Recognize text from an image.
......@@ -91,7 +89,7 @@ namespace scribo
void
recognition(const Image<I>& line,
const char *language,
const char *output_file = 0);
const std::string& output_file = 0);
......@@ -102,9 +100,7 @@ namespace scribo
template <typename L>
void
recognition(const line_set<L>& lines,
const char *language,
const char *output_file = 0)
recognition(line_set<L>& lines, const char *language)
{
trace::entering("scribo::text::recognition");
......@@ -121,9 +117,6 @@ namespace scribo
0, 9, 0, 9, 0 };
w_window2d_int dmap_win = mln::make::w_window2d_int(vals);
std::ofstream file;
if (output_file != 0)
file.open(output_file);
/// Use text bboxes with Tesseract
for_all_lines(i, lines)
......@@ -141,7 +134,7 @@ namespace scribo
box.enlarge(2);
I text_ima(box);
data::fill(text_ima, true);
data::fill(text_ima, false);
// Careful : background is set to 'False'
const component_set<L>& comp_set = lines.components();
......@@ -152,7 +145,7 @@ namespace scribo
{
unsigned comp_id = comps(e);
data::fill(((text_ima | comp_set(comp_id).bbox()).rw() | (pw::value(lbl) == pw::cst(comp_id))).rw(),
false);
true);
}
/// Improve text quality.
......@@ -161,8 +154,6 @@ namespace scribo
I text_ima_cleaned = text::clean(lines(i), text_ima);
// mln::io::pbm::save(text_ima_cleaned, mln::debug::filename("line.pbm", debug_id++));
// Setting objects to 'True'
logical::not_inplace(text_ima_cleaned);
// Make sure there is no border.
border::resize(text_ima_cleaned, 0);
......@@ -181,29 +172,15 @@ namespace scribo
if (s != 0)
{
std::cerr << s << std::endl;
if (output_file != 0)
{
std::string str(s);
str = str.substr(0, str.length() - 1);
file << lines(i).bbox().pmin().row()
<< " "
<< lines(i).bbox().pmin().col()
<< " "
<< lines(i).bbox().pmax().row()
<< " "
<< lines(i).bbox().pmax().col()
<< " "
<< str;
}
str = str.substr(0, str.length() - 2);
lines(i).update_text(str);
}
// The string has been allocated by Tesseract. We must free it.
// The string has been allocated by Tesseract. It must be released.
free(s);
}
if (output_file != 0)
file.close();
trace::exiting("scribo::text::recognition");
}
......@@ -212,7 +189,7 @@ namespace scribo
void
recognition(const Image<I>& line_,
const char *language,
const char *output_file = 0)
const std::string& output_file = 0)
{
trace::entering("scribo::text::recognition");
......@@ -223,8 +200,8 @@ namespace scribo
TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL);
std::ofstream file;
if (output_file != 0)
file.open(output_file);
if (!output_file.empty())
file.open(output_file.c_str());
mln_domain(I) box = line.domain();
// Make sure characters are isolated from the borders.
......@@ -252,7 +229,7 @@ namespace scribo
if (s != 0)
{
std::cout << s << std::endl;
if (output_file != 0)
if (!output_file.empty())
{
std::string str(s);
str = str.substr(0, str.length() - 1);
......@@ -271,7 +248,7 @@ namespace scribo
// The string has been allocated by Tesseract. We must free it.
free(s);
if (output_file != 0)
if (!output_file.empty())
file.close();
trace::exiting("scribo::text::recognition");
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment