merging.hh 22.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
//
// This file is part of Olena.
//
// Olena is free software: you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation, version 2 of the License.
//
// Olena is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Olena.  If not, see <http://www.gnu.org/licenses/>.
//
// As a special exception, you may use this file as part of a free
// software project without restriction.  Specifically, if other files
// instantiate templates or use macros or inline functions from this
// file, or you compile this file and link it with other files to produce
// an executable, this file does not by itself cause the resulting
// executable to be covered by the GNU General Public License.  This
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.

#ifndef SCRIBO_TEXT_MERGING_HH
# define SCRIBO_TEXT_MERGING_HH

/// \file
///
/// \brief Merge text component in order to reconstruct text lines.


34
35
36
37
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
38
#include <set>
39
40
41
42
43
44
45
46
47
48
49
#include <algorithm>

#include <mln/core/image/image2d.hh>
#include <mln/core/image/dmorph/image_if.hh>
#include <mln/util/array.hh>
#include <mln/io/pbm/load.hh>
#include <mln/io/pgm/save.hh>

#include <mln/data/fill.hh>
#include <mln/data/wrap.hh>

50
51
#include <mln/make/box2d.hh>

52
53
54
55
56
57
#include <mln/value/rgb8.hh>
#include <mln/io/ppm/save.hh>

#include <mln/draw/box.hh>
#include <mln/data/stretch.hh>
#include <mln/data/wrap.hh>
58
59
#include <mln/util/timer.hh>

Guillaume Lazzara's avatar
Guillaume Lazzara committed
60
#include <scribo/text/look_like_text_lines.hh>
61

62

63
64
65
66
67
68
namespace scribo
{

  namespace text
  {

69
70
    using namespace mln;

71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

    /// \brief Merge text component in order to reconstruct text lines.
    ///
    /// \param[in] lines A line set.
    ///
    /// \return A new line set.  Line ids are preserved and merged
    /// lines (not valid anymore) are tagged with line::Merged.  The
    /// lines produced with this algorithm (valid lines) are tagged
    /// with line::None. Line type is also set either with line::Text
    /// or line::Punctuation.
    //
    template <typename L>
    line_set<L>
    merging(const scribo::line_set<L>& lines);


# ifndef MLN_INCLUDE_ONLY


90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
    namespace internal
    {

      using namespace mln;
      using value::int_u8;


      template <typename T, typename T2>
      void draw_box(image2d<T>& input, const box2d& b, T2 l)
      {
	const unsigned
	  delta = input.delta_index(dpoint2d(1,0)),
	  nrows = b.nrows(),
	  ncols = b.ncols();
	T* p_start = & input(b.pmin());
	T* ptr = p_start;
	for (unsigned r = 0; r < nrows; ++r)
	{
	  ptr = p_start;
	  for (unsigned c = 0; c < ncols; ++c)
	    *ptr++ = l;
	  p_start += delta;
	}
      }



117

118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
      template <typename T, typename T2>
      void draw_box(image2d<T>& input,
		    int pmin_row, int pmin_col,
		    int pmax_row, int pmax_col,
		    T2 l)
      {
	if (pmax_row < pmin_row || pmax_col < pmin_col)
	  return;

	const unsigned
	  input_nrows_1 = input.nrows() - 1,
	  input_ncols_1 = input.ncols() - 1;
	if (pmin_row < 0) pmin_row = 0;
	if (pmin_col < 0) pmin_col = 0;
	if (pmax_row > input_nrows_1) pmax_row = input_nrows_1;
	if (pmax_col > input_ncols_1) pmax_col = input_ncols_1;

	const unsigned
	  delta = input.delta_index(dpoint2d(1,0)),
	  nrows = pmax_row - pmin_row + 1,
	  ncols = pmax_col - pmin_col + 1;
	T* p_start = & input.at_(pmin_row, pmin_col);
	T* ptr = p_start;
	for (unsigned r = 0; r < nrows; ++r)
	{
	  ptr = p_start;
	  for (unsigned c = 0; c < ncols; ++c)
	    *ptr++ = l;
	  p_start += delta;
	}
      }



152

153
      unsigned my_find_root(mln::util::array<unsigned>& parent, unsigned x)
154
155
156
157
158
159
      {
	if (parent[x] == x)
	  return x;
	return parent[x] = my_find_root(parent, parent[x]);
      }

160

161
162
163
164
165
166
167
168
169
170
171
172
      void swap_ordering(unsigned l1, unsigned l2)
      {
	if (l2 > l1)
	  return;
	unsigned l1_ = l1;
	l1 = l2;
	l2 = l1_;
      }



      template <typename L>
173
      unsigned do_union(scribo::line_set<L>& lines,
174
175
			unsigned l1,
			unsigned l2,
176
			mln::util::array<unsigned>& parent)
177
      {
178
179
180
	l1 = my_find_root(parent, l1);
	l2 = my_find_root(parent, l2);
	if (l1 == l2)
181
182
183
184
	  {
	    std::cerr << "what! in'do_union': already merged!!!" << std::endl;
	    return l1;
	  }
185

186
187
	swap_ordering(l1, l2);
	parent[l2] = l1; // The smallest label value is root.
188

189
190
191
192
193
194
195
196
197
198
199
	if (lines(l2).card() > lines(l1).card())
	{
          // we transfer data from the largest item to the root one.
	  scribo::line_info<L> tmp = lines(l1);
	  lines(l1) = lines(l2);
	  lines(l1).fast_merge(tmp);

	  // We must set manually the tag for lines(l2) since it is
	  // not used directly in merge process so its tag cannot be
	  // updated automatically.
	  lines(l2).update_tag(line::Merged);
200
	  lines(l2).set_hidden(true);
201
202
203
	}
	else
	  lines(l1).fast_merge(lines(l2));
204
205
206
207
208
209
210
211

	// l1's tag is automatically set to line::Needs_Precise_Stats_Update
	// l2's tag is automatically set to line::Merged

	return l1;
      }


212
213


214
215
216
217
218
219
220
221
      box2d enlarge(const box2d& b, int delta)
      {
	box2d b_(point2d(b.pmin().row(), b.pmin().col() - delta),
		 point2d(b.pmax().row(), b.pmax().col() + delta));
	return b_;
      }


222
      template <typename L>
223
224
      bool between_separators(const scribo::line_info<L>& l1,
			      const scribo::line_info<L>& l2)
225
      {
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
	unsigned
	  col1 = l1.bbox().pcenter().col(),
	  col2 = l2.bbox().pcenter().col();
	const mln_ch_value(L, bool)&
	  separators = l1.holder().components().separators();

	typedef const bool* sep_ptr_t;
	sep_ptr_t sep_ptr, end;

	if (col1 < col2)
	{
	  sep_ptr = &separators(l1.bbox().pcenter());
	  end = sep_ptr + col2 - col1;
	}
	else
	{
	  sep_ptr = &separators(l2.bbox().pcenter());
	  end = sep_ptr + col1 - col2;
	}

	// If sep_ptr is true, then a separator is reached.
	while (!*sep_ptr && sep_ptr != end)
	  ++sep_ptr;

	return *sep_ptr;
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
      }


      /*! \brief Check whether two lines can merge.

	Criterions:
	- Height ratio must be <= 1.7
	- Baselines delta must be <= 3
	- Boxes must not overlap too much.

      */
      template <typename L>
      bool lines_can_merge(const scribo::line_info<L>& l1,
			   const scribo::line_info<L>& l2)
      {
	// Parameters.
	const float x_ratio_max = 1.7, baseline_delta_max = 3;

	// Similarity of x_height.
270
	{
271
272
273
274
	  float x1 = l1.x_height(), x2 = l2.x_height();
	  float x_ratio = std::max(x1, x2) / std::min(x1, x2);
	  if (x_ratio > x_ratio_max)
	    return false;
275
	}
276
277

	// Same baseline.
278
	{
279
280
	  if (std::abs(l1.baseline() - l2.baseline()) > baseline_delta_max)
	    return false;
281
282
	}

283
284
285
286
287
	// left / right
	unsigned
	  col1 = l1.bbox().pcenter().col(),
	  col2 = l2.bbox().pcenter().col();
	if (col1 < col2)
288
289
290
291
	{
	  if ((col1 + l1.bbox().width() / 4) >= (col2 - l2.bbox().width() / 4))
 	    return false;
	}
292
	else
293
294
295
296
297
298
299
300
301
	  if ((col2 + l2.bbox().width() / 4) >= (col1 - l1.bbox().width() / 4))
	    return false;


 	// Check that there is no separator in between.
	if (l1.holder().components().has_separators())
	  return ! between_separators(l1, l2);

	return true;
302
303
304
      }


305
306


307
      template <typename L>
308
309
      int horizontal_distance(const scribo::line_info<L>& l1,
			      const scribo::line_info<L>& l2)
310
      {
311
312
313
314
	if (l1.bbox().pcenter().col() < l2.bbox().pcenter().col())
	  return l2.bbox().pmin().col() - l1.bbox().pmax().col();
	else
	  return l1.bbox().pmin().col() - l2.bbox().pmax().col();
315
316
317
      }


318
319
320


      /*! \brief Check whether a non text line and a text line can merge.
321

322
323
324
325
	Criterions:
	- Small height (c.height < l.x_height)
	- Character width mean in 'c' must be lower than the character
        width median of 'l'. (c.width / c.ncomps < l.char_width)
326

327
	OR
328

329
330
331
332
	- Small height (c.height < l.x_height)
	- Not so long width (c.width < 5 * l.char_width)
	- Aligned with the 'x' center ((l.baseline + l.meanline / 2) - c.center.row < 7)
	- tiny spacing (horizontal distance < 5)
333

334
      */
335
      template <typename L>
336
337
      bool non_text_and_text_can_merge(const scribo::line_info<L>& l_cur, // current
				       const scribo::line_info<L>& l_ted) // touched
338
      {
339
	if (l_cur.type() == line::Text || l_ted.type() != line::Text)
340
	  return false;
341
342
343
344
	// the current object is a NON-textline
	// the background (touched) object is a textline


345
346
347
348
349
350
 	// Check that there is no separator in between.
	if (l_cur.holder().components().has_separators()
	    && between_separators(l_cur, l_ted))
	  return false;


351
352
	// General case (for tiny components like --> ',:."; <--):
	if (l_cur.bbox().height() < l_ted.x_height()
353
	    && float(l_cur.bbox().width()) / float(l_cur.card()) < l_ted.char_width())
354
355
356
357
358
359
360
361
362
	  return true;


	// Special case for '---':
	if (// small height:
	  l_cur.bbox().height() < l_ted.x_height()
	  // // not so long width:
	  && l_cur.bbox().width() < 5 * l_ted.char_width()
	  // align with the 'x' center:
363
	  && std::abs((l_ted.baseline() + l_ted.meanline()) / 2 - l_cur.bbox().pcenter().row()) < 7
364
	  // tiny spacing:
365
	  && unsigned(horizontal_distance(l_cur, l_ted)) < l_ted.char_width()
366
367
368
369
	  )
	{
	  return true;
	}
370
371


372
	// Special case
373

374
	// Looking for alignement.
Guillaume Lazzara's avatar
Guillaume Lazzara committed
375
	mln::def::coord
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
	  top_row = l_cur.bbox().pmin().row(),
	  bot_row = l_cur.bbox().pmax().row();


// 	std::cout << "top_row = " << top_row << " - bot_row = " << bot_row << std::endl;
// 	std::cout << std::abs(bot_row - l_ted.baseline())
// 		  << " - d "
// 		  << std::abs(bot_row - l_ted.descent())
// 		  << " - dd "
// 		  << std::abs(bot_row - l_ted.ebbox().pmax().row())
// 		  << " - "
// 		  << std::abs(top_row - l_ted.meanline())
// 		  << " - a "
// 		  << std::abs(top_row - l_ted.ascent())
// 		  << " - aa "
// 		  << std::abs(top_row - l_ted.ebbox().pmin().row())
// 		  << " - "
// 		  << l_ted.ascent()
// 		  << " - "
// 		  << l_ted.descent()
// 		  << std::endl;

	if ((std::abs(bot_row - l_ted.baseline()) < 5
	     || std::abs(bot_row - l_ted.ebbox().pmax().row()) < 5)
	    &&
	    (std::abs(top_row - l_ted.meanline()) < 5
	     || std::abs(top_row - l_ted.ebbox().pmin().row()) < 5))
	{
	  return true;
	}
406

407
408
	return false;
      }
409
410


411
412
413



414
415
416
417
418
      /*! \brief Merge text lines.

	This algorithm iterates over all the components ordered by size.
	It uses a 2d labeled image, tmp_box, to draw component bounding
	boxes and uses that image to check bounding box collisions.
419
	Depending on that collisions and whether the lines looks like
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
	a text line or not, bounding boxes are merged.

	\verbatim
	ALGORITHM:
	for each component 'cur' in decreasing order
	  if already merged
	    continue

	    ///
	    /// x-----------x
	    /// |           |
	    /// x     x     x
	    /// |           |
	    /// x-----------x
	    ///

	    Set labels <- Every labels corresponding to the colliding bounding
	                  boxes (uses only the 7 sites detailled above).

	    If label.card == 1
	      l = label.get(0);
	      If l != background
442
443
	        If looks_like_a_text_line(cur)
		  If looks_like_a_text_line(l)
444
445
446
447
448
449
		    // Error case: a line is included in a line.
		  else
		    // Line cur is included in a frame or a drawing.
		    draw_enlarged_box(l)
		  end
		else
450
		  If looks_like_a_text_line(l)
451
452
453
454
455
456
		  // Component cur is a punctuation overlapping with line l.
		  l_ <- do_union(cur, l)
		  draw_enlarged_box(l_)
		end
	      end
	    else
457
	      If looks_like_a_text_line(cur)
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
	        // Component cur is a new line.
		draw_enlarged_box(l)
	      end
	    end
	  else
	    for each label l in labels
	      If l == background
	        continue
	      end

	      If lines_can_merge(cur, l)
	        l_ <- do_union(cur, l)
		draw_enlarged_box(l_)
		continue
	      end

474
475
	      If !looks_like_a_text_line(cur) and looks_like_a_text_line(l)
	     	If non_text_and_text_can_merge(cur, l)
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
		  // A punctuation merge with a line
		  l_ <- do_union(cur, l)
		  draw_enlarged_box(l_)
		  continue
		else
		  // None
		end
	      else
	      // Error case
	    end
	  end

	  \endverbatim
      */
      // FIXME:
      //
      // Important note: after merging two lines, we draw the
      // merged line over the existing one; we have to ensure that we
      // cover the previous rectangle (otherwise we have a label in
495
      // 'billboard' that is not used anymore! and it can mix up the
496
497
498
499
      // detection of upcoming merges...)  so this delta has to remain
      // the same during one pass.  Another solution (yet more costly)
      // could be of erasing the previous rectangle before re-drawing...
      //
500
      template <typename L>
501
      void
502
503
504
505
      one_merge_pass(unsigned ith_pass,
		     const box2d& domain,
		     std::vector<scribo::line_id_t>& v,
		     scribo::line_set<L>& lines,
506
		     mln::util::array<unsigned>& parent)
507
      {
508
509
	image2d<unsigned> billboard(domain);
	data::fill(billboard, 0);
510
511
512
513

	image2d<value::int_u8> log(domain);
	data::fill(log, 0);

514
	const unsigned n = v.size();
515
	unsigned l_;
516
517

	unsigned
518
519
520
521
522
523
524
	  count_txtline_IN_txtline = 0,
	  count_txtline_IN_junk = 0,
	  count_two_lines_merge = 0,
	  count_new_txtline = 0,
	  count_comp_IN_txtline = 0,
	  count_comp_HITS_txtline = 0,
	  count_WTF = 0;
525
526
527
528
529
530
531
532
533
534

	for (int i = n - 1; i >= 0; --i)
	{
	  unsigned l = v[i];

	  if (parent[l] != l) // not a root, so has already merged, thus ignore it
	    continue;

	  box2d b = lines(l).bbox();

535
	  unsigned tl, tr, ml, mc, mr, bl, br;
536
537

	  {
538
	    box2d b_ = lines(l).ebbox();
539

540
541
542
543
544
545
546
547
548
549
	    /*
	      tl             tr
	      x---------------x
	      |               |
	      |       mc      |
	   ml x       x       x mr
	      |               |
	      |               |
	      x---------------x
	      bl             br
550

551
	    */
552
553


554
555
556
557
558
559
560
	    tl = billboard(b_.pmin());
	    tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
	    ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
	    mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
	    mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
	    bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
	    br = billboard(b_.pmax());
561
	  }
562

563
564
565
566
567
568
569
570
571
572
	  typedef std::set<unsigned> set_t;
	  std::set<unsigned> labels;
	  labels.insert(tl);
	  labels.insert(tl);
	  labels.insert(tr);
	  labels.insert(ml);
	  labels.insert(mc);
	  labels.insert(mr);
	  labels.insert(bl);
	  labels.insert(br);
573
574


575
576
577
578
579
580
581
582
583
584
585
586
587
	  for (set_t::const_iterator it = labels.begin();
	       it != labels.end();
	       ++it)
	  {
	    if (*it == 0)
	      continue;

	    if (lines(*it).type() != line::Text)
	      std::cerr << "outch: we have hit, so drawn, a non-text..." << std::endl;
	  }


	  if (labels.size() == 1) // Same behavior for all anchors.
588
589
	  {
	    if (mc != 0)
590
	    {
591
	      // Main case: it is an "included" box (falling in an already drawn box)
592

593
	      if (lines(l).type() == line::Text) // the current object IS a text line
594
	      {
595
		if (lines(mc).type() == line::Text) // included in a text line => weird
596
		{
597
		  ++count_txtline_IN_txtline;
598
//		  std::cout << "weird: inclusion of a txt_line in a txt_line!" << std::endl;
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615

		  /// Merge is perform if the current line is a
		  /// petouille considered as a line.
// 		  if ((std::abs(lines(l).ascent() - lines(mc).ascent()) >= 5)
// 		      || (std::abs(lines(l).descent() - lines(mc).descent()) >= 5))
// 		      continue;

// 		  // FIXME: Is it valid?
// 		  // A text line is included in another text line.
// 		  // They are merged.
// 		  //
// 		  l_ = do_union(lines, mc, l, parent);
// 		  draw_box(billboard, lines(l_).ebbox(), l_);

 		  // Log:
 		  draw_box(log, b, 126);

616
		}
617
618

		else  // FIXME: Remove!  since included in a non-text-line, so not drawn, so inclusion impossible!!!!!!!!!!
619
		{
620
		  std::cout << "error: should NOT happen (a text line included in a NON-text-line (so not drawn!!!)" << std::endl;
621
		  ++count_txtline_IN_junk;
622

623
624
		  // a non-text-line (probably a drawing or a frame) includes a text line
		  draw_box(billboard, lines(l).ebbox(), l);
625
		  // Log:
626
		  draw_box(log, b, 100);
627
		}
628

629
	      }
630
	      else // the current object is NOT a text line
631
	      {
632
		if (lines(mc).type() == line::Text) // included in a text line
633
		{
634
635
		  ++count_comp_IN_txtline;

636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
		  // The current object is supposed to be punctuation
		  // since it is fully included in the bbox of a line
		  // of text, so we always want to merge this object
		  // with the line.
		  //
		  // However, in case of a bad quality document, we
		  // may not always want to merge since this object
		  // could be noise or garbage... So adding new
		  // criterions could fix this issue.
		  //
 		  //if (!non_text_and_text_can_merge(lines(l), lines(mc)))
 		  //  continue;

		  // Mark current line as punctuation.
		  lines(l).update_type(line::Punctuation);

		  // Merge non-text-line #l into text line #mc.
		  l_ = do_union(lines, mc, l, parent);
		  // We have to re-draw the original largest text line since
655
		  // it may change of label (take the one of the included line).
656
		  draw_box(billboard, lines(l_).ebbox(), l_);
657
658

		  // Log:
659
		  draw_box(log, b, 128);
660
661
		}
	      }
662
	    }
663
	    else // mc == 0
664
665
666
667
	    {
	      // Main case: it is a "new" box, that might be drawn in the background.

	      // we only draw this box if it is a text-line!!!
668
	      if (lines(l).type() == line::Text)
669
	      {
670
		++count_new_txtline;
671
		draw_box(billboard, lines(l).ebbox(), l);
672
		// Log:
673
		draw_box(log, b, 127);
674
	      }
675
676
	      else
		draw_box(log, b, 1);
677
	    }
678
679
680
	  }
	  else
	  {
681
682
	    l_ = l; // current label.

683
684
685
686
	    // Particular cases.
	    for (set_t::const_iterator it = labels.begin();
		 it != labels.end();
		 ++it)
687
	    {
688
	      unsigned lcand = *it;
689

690
691
	      if (lcand == 0) // Skip background.
		continue;
692

693
694
695
696
697
	      if (lines(lcand).type() != line::Text)
		std::cerr << "again!" << std::endl;


	      if (lines(l_).type() == line::Text)
698
	      {
699
700
701
702
703
		// l_ and lcand look like text line chunks.
		if (lines_can_merge(lines(l_), lines(lcand)))
		{
		  ++count_two_lines_merge;
		  l_ = do_union(lines, l_, lcand,  parent);
704

705
706
707
708
709
710
711
712
713
714
		  draw_box(billboard, lines(l_).ebbox(), l_);
		  // Log:
		  draw_box(log, b, 151);
		  continue;
		}
		else
		{
		  ++count_WTF;
		  // Log:
		  draw_box(log, b, 255);
715

716
717
718
719
720
		  // (*) SEE BELOW
		  draw_box(billboard, lines(l_).ebbox(), l_);
		}
	      }
	      else
721
	      {
722
		// l_ does NOT looks like a text line chunk.
723
		++count_comp_HITS_txtline;
724
		if (non_text_and_text_can_merge(lines(l_), lines(lcand)))
725
		  // a petouille merges with a text line?
726
		{
727
		  ++count_comp_HITS_txtline;
728
729
730
		  l_ = do_union(lines, l_, lcand,  parent);
		  draw_box(billboard, lines(l_).ebbox(), l_);

731
		  // Log:
732
733
		  draw_box(log, b, 169);
		  continue;
734
735
736
737
		}
		else
		{
		  // Log:
738
		  draw_box(log, b, 254);
739
740
		}
	      }
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773


		/*  (*)  Text lines verticaly overlap.

		   --------------------------
		   |           l            |
		   |                        |
		   --------------------------
		   |           lcand        |
		   |                        |
		   --------------------------

		   or

		   --------------------------
		   |          l             |
		   |                        |
		   |---------------------------
		   |------------------------- |
		   |         lcand            |
		   ----------------------------

		   or

		   --------------------------
		   |          lcand         |
		   |                        |
 		   |---------------------------
		   |------------------------- |
		   |           l              |
		   ----------------------------

		 */
774
775
776

	    }
	  }
777

778
779
780
	}


Guillaume Lazzara's avatar
Guillaume Lazzara committed
781
782
783
784
785
786
787
788
// 	std::cout
// 	  << "   new txtline        = " << count_new_txtline        << std::endl
// 	  << "   comp IN txtline    = " << count_comp_IN_txtline    << std::endl
// 	  << "   2 lines merge      = " << count_two_lines_merge    << std::endl
// 	  << "   comp HITS txtline  = " << count_comp_HITS_txtline  << std::endl
// 	  << "   txtline IN junk    = " << count_txtline_IN_junk    << std::endl
// 	  << "   txtline IN txtline = " << count_txtline_IN_txtline << std::endl
// 	  << "   WTF!               = " << count_WTF << std::endl;
789
790
791


	(void) ith_pass;
Guillaume Lazzara's avatar
Guillaume Lazzara committed
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
// 	if (ith_pass == 1)
// 	{
// 	  mln::io::pgm::save(log, "log_1.pgm");
// 	  mln::io::pgm::save(data::wrap(int_u8(), billboard), "log_1e.pgm");
// 	}
// 	else if (ith_pass == 2)
// 	{
// 	  mln::io::pgm::save(log, "log_2.pgm");
// 	  mln::io::pgm::save(data::wrap(int_u8(), billboard), "log_2e.pgm");
// 	}
// 	else if (ith_pass == 3)
// 	{
// 	  mln::io::pgm::save(log, "log_3.pgm");
// 	  mln::io::pgm::save(data::wrap(int_u8(), billboard), "log_3e.pgm");
// 	}
807
808
809
810
      }



811

812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
      template <typename L>
      struct order_lines_id
      {
	order_lines_id(const scribo::line_set<L>& lines)
	  : lines_(lines)
	{
	}

	bool operator()(const scribo::line_id_t& l1, const scribo::line_id_t& l2) const
	{
	  if (lines_(l1).bbox().nsites() == lines_(l2).bbox().nsites())
	    return l1 < l2;
	  return lines_(l1).bbox().nsites() < lines_(l2).bbox().nsites();
	}

	scribo::line_set<L> lines_;
      };

830

831
832
833
834
835
836
837
838
839
840
841
842
843
      template <typename L>
      scribo::line_set<L>
      draw_boxes(const box2d& input_domain,
		 const scribo::line_set<L>& lines_)
      {
	scribo::line_set<L> lines = lines_.duplicate();
	const unsigned n = lines.nelements();

	order_lines_id<L> func(lines);
	std::vector<scribo::line_id_t> v;
	v.reserve(n);

	// Union-find parent data, used to merge lines.
844
	mln::util::array<unsigned> parent(n + 1);
845
846
847
848
849
850
851
852
853
854
855
856

	// Initialize data
	parent(0) = 0;
	for (unsigned l = 1; l < parent.nelements(); ++l)
	{
	  v.push_back(l);
	  parent[l] = l;
	}

	// Sort lines by bbox.nelements() and ids.
	std::sort(v.begin(), v.end(), func);

Guillaume Lazzara's avatar
Guillaume Lazzara committed
857
// 	mln::util::timer t;
858
859


860
	// Setting lines as text lines according to specific criterions.
861
	for_all_lines(l, lines)
862
863
864
	  if (looks_like_a_text_line(lines(l)))
	    lines(l).update_type(line::Text);

865
866

	// First pass
Guillaume Lazzara's avatar
Guillaume Lazzara committed
867
// 	t.start();
868
	one_merge_pass(1, input_domain, v, lines, parent);
Guillaume Lazzara's avatar
Guillaume Lazzara committed
869
870
// 	float ts = t.stop();
// 	std::cout << "time " << ts << std::endl;
871

872
873
874
875
876
877
878
879

//	lines.force_stats_update();

	// Sort lines by bbox.nelements() and ids again!
	// line may have grown differently since the first pass.
	std::sort(v.begin(), v.end(), func);


880
	// Second pass
Guillaume Lazzara's avatar
Guillaume Lazzara committed
881
// 	t.start();
882
	one_merge_pass(2, input_domain, v, lines, parent); // <- last pass
Guillaume Lazzara's avatar
Guillaume Lazzara committed
883
884
// 	ts = t.stop();
// 	std::cout << "time " << ts << std::endl;
885
886


887
	lines.force_stats_update();
888
889
890
891
892
893
894
895
896
897
898
899

	return lines;
      }

    } // end of namespace scribo::text::internal



    // Facade

    template <typename L>
    line_set<L>
900
    merging(const scribo::line_set<L>& lines)
901
902
903
    {
      using namespace mln;

Guillaume Lazzara's avatar
Guillaume Lazzara committed
904
905
//       mln::util::timer t;
//       t.start();
906

907
908
909
      scribo::line_set<L> output
	= internal::draw_boxes(lines.components().labeled_image().domain(),
			       lines);
Guillaume Lazzara's avatar
Guillaume Lazzara committed
910
911
//       float ts = t.stop();
//       std::cout << ts << std::endl;
912
913
914
915

      return output;
    }

916
# endif // ! MLN_INCLUDE_ONLY
917
918
919
920

  } // end of namespace scribo::text

} // end of namespace scribo
921
922

#endif // ! SCRIBO_TEXT_MERGING_HH