recognition.hh 8.84 KB
Newer Older
1
2
// Copyright (C) 2009, 2010, 2011 EPITA Research and Development
// Laboratory (LRDE)
//
// This file is part of Olena.
//
// Olena is free software: you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation, version 2 of the License.
//
// Olena is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Olena.  If not, see <http://www.gnu.org/licenses/>.
//
// As a special exception, you may use this file as part of a free
// software project without restriction.  Specifically, if other files
// instantiate templates or use macros or inline functions from this
// file, or you compile this file and link it with other files to produce
// an executable, this file does not by itself cause the resulting
// executable to be covered by the GNU General Public License.  This
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
26
27
28
29

#ifndef SCRIBO_TEXT_RECOGNITION_HH
# define SCRIBO_TEXT_RECOGNITION_HH

30
/// \file
///
/// Passes the text bounding boxes to an OCR (Tesseract).
///
/// \todo For each text bbox, we create a new image. We may want to avoid that.
/// \todo Do not store the result in an image?
36

Guillaume Lazzara's avatar
Guillaume Lazzara committed
37
# include <clocale>
# include <cstdlib>
# include <fstream>
# include <iostream>
# include <ostream>
# include <string>

# include <mln/core/image/dmorph/image_if.hh>
# include <mln/core/concept/neighborhood.hh>
# include <mln/core/site_set/box.hh>

# include <mln/util/array.hh>
# include <mln/data/fill.hh>
# include <mln/data/paste.hh>
# include <mln/data/paste_without_localization.hh>
# include <mln/pw/all.hh>

# include <mln/core/alias/w_window2d_int.hh>
# include <mln/make/w_window2d_int.hh>

# include <mln/border/resize.hh>

# include <scribo/core/macros.hh>

# include <scribo/text/clean_inplace.hh>

# include <scribo/core/line_set.hh>

# include <tesseract/baseapi.h>
63
64


65
66
67
68
69
# if !defined HAVE_TESSERACT_2 && !defined HAVE_TESSERACT_3
#  define HAVE_TESSERACT_2
# endif


70

71
72
73
74
75
76
namespace scribo
{

  namespace text
  {

Guillaume Lazzara's avatar
Guillaume Lazzara committed
77
78
    using namespace mln;

Guillaume Lazzara's avatar
Guillaume Lazzara committed
79
    /// Passes the text bboxes to Tesseract (OCR).
80
    ///
81
    /// \param[in] lines       The lines of text.
Guillaume Lazzara's avatar
Guillaume Lazzara committed
82
83
84
85
86
    /// \param[in] language    The language which should be recognized by
    ///		               Tesseract. (fra, en, ...)
    /// \param[in] output_file If set, store the recognized text in
    ///                        this file.
    //
87
    template <typename L>
Guillaume Lazzara's avatar
Guillaume Lazzara committed
88
    void
89
    recognition(line_set<L>& lines, const char *language);
90
91
92
93
94
95
96


    /// Recognize text from an image.
    ///
    /// \param[in] line        The image of the text to recognize.
    ///                        NOTE(review): presumably a binary image
    ///                        (background set to 'false') - confirm.
    /// \param[in] language    The language which should be recognized by
    ///                        Tesseract. (fra, en, ...)
    /// \param[in] output_file If set, store the recognized text in
    ///                        this file.
    //
    template <typename I>
    void
    recognition(const Image<I>& line,
                const char *language,
                const std::string& output_file = std::string());
98

99
100
101
102


# ifndef MLN_INCLUDE_ONLY

103
104

    template <typename L>
Guillaume Lazzara's avatar
Guillaume Lazzara committed
105
    void
106
    recognition(line_set<L>& lines, const char *language)
107
108
109
    {
      trace::entering("scribo::text::recognition");

110
111
112
113
114
115
      // Tesseract is known to have issues while reading training data
      // depending on the current locales in use. Training data files use
      // float data and the decimal separator can be either '.' or ','
      // causing errors.
      // Setting locale to "C" fix that issue.
      setlocale(LC_ALL, "C");
116

Guillaume Lazzara's avatar
Guillaume Lazzara committed
117
      // Initialize Tesseract.
118
#  ifdef HAVE_TESSERACT_2
119
      TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL);
120
121
#  else // HAVE_TESSERACT_3
      tesseract::TessBaseAPI tess;
122
      if (tess.Init(NULL, language, tesseract::OEM_DEFAULT) == -1)
123
124
125
126
127
      {
	std::cout << "Error: cannot initialize tesseract!" << std::endl;
	abort();
      }
      tess.SetPageSegMode(tesseract::PSM_SINGLE_LINE);
128

129
#  endif // HAVE_TESSERACT_2
Guillaume Lazzara's avatar
Guillaume Lazzara committed
130

131
132
      typedef mln_ch_value(L,bool) I;

Guillaume Lazzara's avatar
Guillaume Lazzara committed
133

Guillaume Lazzara's avatar
Guillaume Lazzara committed
134
      /// Use text bboxes with Tesseract
135
      for_all_lines(i, lines)
136
      {
Guillaume Lazzara's avatar
Guillaume Lazzara committed
137
	if (! lines(i).is_textline())
138
139
140
	  continue;

	mln_domain(I) box = lines(i).bbox();
Guillaume Lazzara's avatar
Guillaume Lazzara committed
141

Guillaume Lazzara's avatar
Guillaume Lazzara committed
142
143
144
	// Make sure characters are isolated from the borders.
	// Help Tesseract.
	box.enlarge(2);
Guillaume Lazzara's avatar
Guillaume Lazzara committed
145

Guillaume Lazzara's avatar
Guillaume Lazzara committed
146
	I text_ima(box);
147
	data::fill(text_ima, false);
Guillaume Lazzara's avatar
Guillaume Lazzara committed
148
149

	// Careful : background is set to 'False'
150
	const component_set<L>& comp_set = lines.components();
151
152
	const L& lbl = comp_set.labeled_image();

Guillaume Lazzara's avatar
Guillaume Lazzara committed
153
	// Extract each character components to create the line image.
154
155
	const mln::util::array<component_id_t>& comps = lines(i).component_ids();
	for_all_elements(e, lines(i).component_ids())
156
157
158
	{
	  unsigned comp_id = comps(e);
	  data::fill(((text_ima | comp_set(comp_id).bbox()).rw() | (pw::value(lbl) == pw::cst(comp_id))).rw(),
159
		     true);
160
	}
161
162

	/// Improve text quality.
Guillaume Lazzara's avatar
Guillaume Lazzara committed
163
	text::clean_inplace(lines(i), text_ima);
Guillaume Lazzara's avatar
Guillaume Lazzara committed
164

Guillaume Lazzara's avatar
Guillaume Lazzara committed
165
166
	// Make sure characters are isolated from the borders.
	// Help Tesseract.
Guillaume Lazzara's avatar
Guillaume Lazzara committed
167
168
169
170
171
172
	//
	// FIXME: can be improved! We need a morpher for a constant
	// extension set to false (avoid data::fill), a morpher for
	// translating the domain to (0,0) (avoid the creation of a
	// new image), change the default border::thickness to 0 and a
	// morpher to enlarge the domain to a part of the extension.
Guillaume Lazzara's avatar
Guillaume Lazzara committed
173
174
175
176
177
	mln_domain(I) lbox = text_ima.domain();
	lbox.enlarge(lines(i).char_space() + 2);
	I line_image(lbox, 0); // Make sure there is no border!
	data::fill(line_image, false);
	data::paste_without_localization(text_ima, line_image);
Guillaume Lazzara's avatar
Guillaume Lazzara committed
178

Guillaume Lazzara's avatar
Guillaume Lazzara committed
179
	// Recognize characters.
180
#  ifdef HAVE_TESSERACT_2
Guillaume Lazzara's avatar
Guillaume Lazzara committed
181
	char* s = TessBaseAPI::TesseractRect(
Guillaume Lazzara's avatar
Guillaume Lazzara committed
182
183
184
185
186
187
188
	    (unsigned char*) line_image.buffer(),
	    sizeof (bool),			 // Pixel size.
	    line_image.ncols() * sizeof (bool),  // Row_offset
	    0,					 // Left
	    0,					 // Top
	    line_image.ncols(),		         // n cols
	    line_image.nrows());		 // n rows
189
#  else // HAVE_TESSERACT_3
190
	tess.SetImage(
191
192
	  (unsigned char*) line_image.buffer(),
	  line_image.ncols(),		         // n cols
193
194
195
196
	  line_image.nrows(),		         // n rows
	  sizeof (bool),			 // Pixel size.
	  line_image.ncols() * sizeof (bool));    // Row_offset
	char* s = tess.GetUTF8Text();
197
#  endif // ! HAVE_TESSERACT_2
Guillaume Lazzara's avatar
Guillaume Lazzara committed
198
199

	if (s != 0)
200
	{
201
202
203
	  std::string str(s);
	  str = str.substr(0, str.length() - 2);
	  lines(i).update_text(str);
204
	}
Guillaume Lazzara's avatar
Guillaume Lazzara committed
205

206
	// The string has been allocated by Tesseract. It must be released.
207
	delete [] s;
208
209
210
211
212
213
      }

      trace::exiting("scribo::text::recognition");
    }


214
215
216
217
    template <typename I>
    void
    recognition(const Image<I>& line_,
		const char *language,
Guillaume Lazzara's avatar
Guillaume Lazzara committed
218
		const std::string& output_file = std::string())
219
220
221
222
223
224
    {
      trace::entering("scribo::text::recognition");

      const I& line = exact(line_);
      mln_precondition(line.is_valid());

225
226
227
228
229
230
231
      // Tesseract is known to have issues while reading training data
      // depending on the current locales in use. Training data files use
      // float data and the decimal separator can be either '.' or ','
      // causing errors.
      // Setting locale to "C" fix that issue.
      setlocale(LC_ALL, "C");

232
      // Initialize Tesseract.
233
#  ifdef HAVE_TESSERACT_2
234
      TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL);
235
236
#  else // HAVE_TESSERACT_3
      tesseract::TessBaseAPI tess;
237
      if (tess.Init(NULL, language, tesseract::OEM_DEFAULT) == -1)
238
239
240
241
242
      {
	std::cout << "Error: cannot initialize tesseract!" << std::endl;
	abort();
      }
#  endif // ! HAVE_TESSERACT_2
243
244

      std::ofstream file;
245
246
      if (!output_file.empty())
	file.open(output_file.c_str());
247
248
249
250
251
252
253
254
255
256
257
258
259
260

      mln_domain(I) box = line.domain();
      // Make sure characters are isolated from the borders.
      // Help Tesseract.
      box.enlarge(2);

      I text_ima(box);
      data::fill(text_ima, false);
      data::paste(line, text_ima);

      // Make sure there is no border.
      border::resize(text_ima, 0);

      // Recognize characters.
261
#  ifdef HAVE_TESSERACT_2
262
263
264
265
266
267
268
269
      char* s = TessBaseAPI::TesseractRect(
	(unsigned char*) text_ima.buffer(),
	sizeof (bool),			  // Pixel size.
	text_ima.ncols() * sizeof (bool), // Row_offset
	0,				  // Left
	0,				  // Top
	text_ima.ncols(),		  // n cols
	text_ima.nrows());		  // n rows
270
271
272
273
274
275
276
277
278
279
#  else // HAVE_TESSERACT_3
      char* s = tess.TesseractRect(
	(unsigned char*) text_ima.buffer(),
	sizeof (bool),			  // Pixel size.
	text_ima.ncols() * sizeof (bool), // Row_offset
	0,				  // Left
	0,				  // Top
	text_ima.ncols(),		  // n cols
	text_ima.nrows());		  // n rows
#  endif // ! HAVE_TESSERACT_2
280
281
282

	if (s != 0)
	{
283
	  if (!output_file.empty())
284
285
286
	  {
	    std::string str(s);
	    str = str.substr(0, str.length() - 1);
287
288
289
290
291
292
293
294
295
	    file << line.domain().bbox().pmin().row()
		 << " "
		 << line.domain().bbox().pmin().col()
		 << " "
		 << line.domain().bbox().pmax().row()
		 << " "
		 << line.domain().bbox().pmax().col()
		 << " "
		 << str;
296
	  }
297
298
299
	}

	// The string has been allocated by Tesseract. We must free it.
300
	delete [] s;
301

302
	if (!output_file.empty())
303
304
305
306
307
308
309
	  file.close();

	trace::exiting("scribo::text::recognition");
    }



310
311
312
313
314
315
316
# endif // ! MLN_INCLUDE_ONLY

  } // end of namespace scribo::text

} // end of namespace scribo

#endif // ! SCRIBO_TEXT_RECOGNITION_HH