Commit ca67a57f authored by Guillaume Lazzara
Browse files

Store OCR results in the line_info structure.

	* core/line_info.hh: Add a new attribute 'text'.

	* text/recognition.hh: Store results in the line_info structure.
parent 21af64ad
2010-05-25 Guillaume Lazzara <z@lrde.epita.fr>
Store OCR results in the line_info structure.
* core/line_info.hh: Add a new attribute 'text'.
* text/recognition.hh: Store results in the line_info structure.
2010-05-25 Guillaume Lazzara <z@lrde.epita.fr> 2010-05-25 Guillaume Lazzara <z@lrde.epita.fr>
Improve debug outputs in Sauvola and make binarization tools Improve debug outputs in Sauvola and make binarization tools
......
...@@ -40,6 +40,7 @@ ...@@ -40,6 +40,7 @@
# include <mln/accu/stat/median_h.hh> # include <mln/accu/stat/median_h.hh>
# include <mln/accu/shape/bbox.hh> # include <mln/accu/shape/bbox.hh>
# include <mln/util/object_id.hh> # include <mln/util/object_id.hh>
# include <mln/value/int_u.hh>
# include <scribo/core/tag/component.hh> # include <scribo/core/tag/component.hh>
# include <scribo/core/tag/line.hh> # include <scribo/core/tag/line.hh>
...@@ -124,6 +125,9 @@ namespace scribo ...@@ -124,6 +125,9 @@ namespace scribo
bool indented() const; bool indented() const;
const std::string& text() const;
void update_text(const std::string& str);
bool is_valid() const; bool is_valid() const;
...@@ -232,6 +236,8 @@ namespace scribo ...@@ -232,6 +236,8 @@ namespace scribo
bool indented_; bool indented_;
std::string text_;
// Line set holding this element. // Line set holding this element.
line_set<L> holder_; line_set<L> holder_;
...@@ -295,6 +301,8 @@ namespace scribo ...@@ -295,6 +301,8 @@ namespace scribo
indented_ = other.indented(); indented_ = other.indented();
text_ = other.text();
holder_ = other.holder(); holder_ = other.holder();
} }
...@@ -578,6 +586,21 @@ namespace scribo ...@@ -578,6 +586,21 @@ namespace scribo
return indented_; return indented_;
} }
/// Read-only access to the text attribute of this line.
///
/// \return A const reference to the text_ member.
template <typename L>
const std::string&
line_info<L>::text() const
{
return text_;
}
/// Replace the text attribute of this line.
///
/// \param[in] str The new text; it is copied into the text_ member,
///                overwriting whatever was stored before.
template <typename L>
void
line_info<L>::update_text(const std::string& str)
{
text_ = str;
}
template <typename L> template <typename L>
bool bool
...@@ -833,7 +856,9 @@ namespace scribo ...@@ -833,7 +856,9 @@ namespace scribo
else else
char_width_ = char_width.to_result(); char_width_ = char_width.to_result();
// FIXME: There is a bug here when the input document is too
// large. The baseline indexes are too high for the type used
// in the median accumulator!
baseline_ = absolute_baseline.to_result(); baseline_ = absolute_baseline.to_result();
meanline_ = absolute_meanline.to_result(); meanline_ = absolute_meanline.to_result();
x_height_ = absolute_baseline - absolute_meanline + 1; x_height_ = absolute_baseline - absolute_meanline + 1;
...@@ -890,6 +915,7 @@ namespace scribo ...@@ -890,6 +915,7 @@ namespace scribo
<< ", orientation=" << info.orientation() << ", orientation=" << info.orientation()
<< ", reading_orientation=" << info.reading_orientation() << ", reading_orientation=" << info.reading_orientation()
<< ", indented=" << info.indented() << ", indented=" << info.indented()
<< ", text=" << info.text()
<< ")" << std::endl; << ")" << std::endl;
} }
......
...@@ -81,9 +81,7 @@ namespace scribo ...@@ -81,9 +81,7 @@ namespace scribo
// //
template <typename L> template <typename L>
void void
recognition(const line_set<L>& lines, recognition(line_set<L>& lines, const char *language);
const char *language,
const char *output_file = 0);
/// Recognize text from an image. /// Recognize text from an image.
...@@ -91,7 +89,7 @@ namespace scribo ...@@ -91,7 +89,7 @@ namespace scribo
void void
recognition(const Image<I>& line, recognition(const Image<I>& line,
const char *language, const char *language,
const char *output_file = 0); const std::string& output_file = 0);
...@@ -102,9 +100,7 @@ namespace scribo ...@@ -102,9 +100,7 @@ namespace scribo
template <typename L> template <typename L>
void void
recognition(const line_set<L>& lines, recognition(line_set<L>& lines, const char *language)
const char *language,
const char *output_file = 0)
{ {
trace::entering("scribo::text::recognition"); trace::entering("scribo::text::recognition");
...@@ -121,9 +117,6 @@ namespace scribo ...@@ -121,9 +117,6 @@ namespace scribo
0, 9, 0, 9, 0 }; 0, 9, 0, 9, 0 };
w_window2d_int dmap_win = mln::make::w_window2d_int(vals); w_window2d_int dmap_win = mln::make::w_window2d_int(vals);
std::ofstream file;
if (output_file != 0)
file.open(output_file);
/// Use text bboxes with Tesseract /// Use text bboxes with Tesseract
for_all_lines(i, lines) for_all_lines(i, lines)
...@@ -141,7 +134,7 @@ namespace scribo ...@@ -141,7 +134,7 @@ namespace scribo
box.enlarge(2); box.enlarge(2);
I text_ima(box); I text_ima(box);
data::fill(text_ima, true); data::fill(text_ima, false);
// Careful : background is set to 'False' // Careful : background is set to 'False'
const component_set<L>& comp_set = lines.components(); const component_set<L>& comp_set = lines.components();
...@@ -152,7 +145,7 @@ namespace scribo ...@@ -152,7 +145,7 @@ namespace scribo
{ {
unsigned comp_id = comps(e); unsigned comp_id = comps(e);
data::fill(((text_ima | comp_set(comp_id).bbox()).rw() | (pw::value(lbl) == pw::cst(comp_id))).rw(), data::fill(((text_ima | comp_set(comp_id).bbox()).rw() | (pw::value(lbl) == pw::cst(comp_id))).rw(),
false); true);
} }
/// Improve text quality. /// Improve text quality.
...@@ -161,8 +154,6 @@ namespace scribo ...@@ -161,8 +154,6 @@ namespace scribo
I text_ima_cleaned = text::clean(lines(i), text_ima); I text_ima_cleaned = text::clean(lines(i), text_ima);
// mln::io::pbm::save(text_ima_cleaned, mln::debug::filename("line.pbm", debug_id++)); // mln::io::pbm::save(text_ima_cleaned, mln::debug::filename("line.pbm", debug_id++));
// Setting objects to 'True'
logical::not_inplace(text_ima_cleaned);
// Make sure there is no border. // Make sure there is no border.
border::resize(text_ima_cleaned, 0); border::resize(text_ima_cleaned, 0);
...@@ -180,30 +171,16 @@ namespace scribo ...@@ -180,30 +171,16 @@ namespace scribo
if (s != 0) if (s != 0)
{ {
std::cerr << s << std::endl; std::cerr << s << std::endl;
if (output_file != 0) std::string str(s);
{ str = str.substr(0, str.length() - 2);
std::string str(s); lines(i).update_text(str);
str = str.substr(0, str.length() - 1);
file << lines(i).bbox().pmin().row()
<< " "
<< lines(i).bbox().pmin().col()
<< " "
<< lines(i).bbox().pmax().row()
<< " "
<< lines(i).bbox().pmax().col()
<< " "
<< str;
}
} }
// The string has been allocated by Tesseract. We must free it. // The string has been allocated by Tesseract. It must be released.
free(s); free(s);
} }
if (output_file != 0)
file.close();
trace::exiting("scribo::text::recognition"); trace::exiting("scribo::text::recognition");
} }
...@@ -212,7 +189,7 @@ namespace scribo ...@@ -212,7 +189,7 @@ namespace scribo
void void
recognition(const Image<I>& line_, recognition(const Image<I>& line_,
const char *language, const char *language,
const char *output_file = 0) const std::string& output_file = 0)
{ {
trace::entering("scribo::text::recognition"); trace::entering("scribo::text::recognition");
...@@ -223,8 +200,8 @@ namespace scribo ...@@ -223,8 +200,8 @@ namespace scribo
TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL); TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL);
std::ofstream file; std::ofstream file;
if (output_file != 0) if (!output_file.empty())
file.open(output_file); file.open(output_file.c_str());
mln_domain(I) box = line.domain(); mln_domain(I) box = line.domain();
// Make sure characters are isolated from the borders. // Make sure characters are isolated from the borders.
...@@ -252,7 +229,7 @@ namespace scribo ...@@ -252,7 +229,7 @@ namespace scribo
if (s != 0) if (s != 0)
{ {
std::cout << s << std::endl; std::cout << s << std::endl;
if (output_file != 0) if (!output_file.empty())
{ {
std::string str(s); std::string str(s);
str = str.substr(0, str.length() - 1); str = str.substr(0, str.length() - 1);
...@@ -271,7 +248,7 @@ namespace scribo ...@@ -271,7 +248,7 @@ namespace scribo
// The string has been allocated by Tesseract. We must free it. // The string has been allocated by Tesseract. We must free it.
free(s); free(s);
if (output_file != 0) if (!output_file.empty())
file.close(); file.close();
trace::exiting("scribo::text::recognition"); trace::exiting("scribo::text::recognition");
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment