Commit f5024327 authored by Guillaume Lazzara
Browse files

scribo/io/xml/save.hh: Make use of document structure.

parent 222955ac
2010-11-15 Guillaume Lazzara <z@lrde.epita.fr>
* scribo/io/xml/save.hh: Make use of document structure.
2010-11-15 Guillaume Lazzara <z@lrde.epita.fr> 2010-11-15 Guillaume Lazzara <z@lrde.epita.fr>
* scribo/primitive/extract/elements.hh: New routine. * scribo/primitive/extract/elements.hh: New routine.
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
/// \file /// \file
/// ///
/// \brief Save text line information as XML. /// \brief Save document information as XML.
# include <fstream> # include <fstream>
# include <sstream> # include <sstream>
...@@ -46,7 +46,7 @@ namespace scribo ...@@ -46,7 +46,7 @@ namespace scribo
namespace xml namespace xml
{ {
/*! \brief Save text line information as XML. /*! \brief Save document information as XML.
We use a XML Schema part of the PAGE (Page Analysis and Ground We use a XML Schema part of the PAGE (Page Analysis and Ground
truth Elements) image representation framework. truth Elements) image representation framework.
...@@ -60,8 +60,7 @@ namespace scribo ...@@ -60,8 +60,7 @@ namespace scribo
*/ */
template <typename L> template <typename L>
void void
save(const std::string& input_name, save(const document<L>& doc,
const line_set<L>& lines,
const std::string& output_name, const std::string& output_name,
bool extended_format); bool extended_format);
...@@ -86,12 +85,34 @@ namespace scribo ...@@ -86,12 +85,34 @@ namespace scribo
return input; return input;
} }
/// \brief Write the four corners of a box as a <coords> XML element.
///
/// The points are emitted in this order: (pmin col, pmin row),
/// (pmax col, pmin row), (pmax col, pmax row), (pmin col, pmax row),
/// where x is a column index and y is a row index.
///
/// \param[in,out] ostr  Output file stream receiving the XML text.
/// \param[in]     b     Box whose pmin()/pmax() corners are written.
/// \param[in]     space Indentation prefix put before the <coords>
///                      tags; the <point> children get this prefix
///                      followed by " ".  Must not be null.
///
/// Declared inline because this non-template function is defined in
/// a header: without it, including the header from several
/// translation units yields multiple-definition link errors.
inline
void
print_box_coords(std::ofstream& ostr, const box2d& b,
                 const char *space)
{
  const std::string sc = space;
  const std::string sp = sc + " ";

  // '\n' rather than std::endl: the bytes written are the same, but
  // the stream is not flushed after every single line.
  ostr << sc << "<coords>" << '\n'
       << sp << "<point x=\"" << b.pmin().col()
       << "\" y=\"" << b.pmin().row() << "\"/>" << '\n'
       << sp << "<point x=\"" << b.pmax().col()
       << "\" y=\"" << b.pmin().row() << "\"/>" << '\n'
       << sp << "<point x=\"" << b.pmax().col()
       << "\" y=\"" << b.pmax().row() << "\"/>" << '\n'
       << sp << "<point x=\"" << b.pmin().col()
       << "\" y=\"" << b.pmax().row() << "\"/>" << '\n'
       << sc << "</coords>" << '\n';
}
} // end of namespace scribo::io::xml::internal } // end of namespace scribo::io::xml::internal
template <typename L> template <typename L>
void void
save(const std::string& input_name, save(const document<L>& doc,
const line_set<L>& lines,
const std::string& output_name, const std::string& output_name,
bool extended_format) bool extended_format)
{ {
...@@ -100,9 +121,12 @@ namespace scribo ...@@ -100,9 +121,12 @@ namespace scribo
std::ofstream file(output_name.c_str()); std::ofstream file(output_name.c_str());
if (! file) if (! file)
{ {
std::cerr << "error: cannot open file '" << input_name << "'!"; std::cerr << "error: cannot open file '" << doc.filename() << "'!";
abort(); abort();
} }
const line_set<L>& lines = doc.text();
std::map<char, std::string> html_map; std::map<char, std::string> html_map;
html_map['\"'] = "&quot;"; html_map['\"'] = "&quot;";
html_map['<'] = "&lt;"; html_map['<'] = "&lt;";
...@@ -111,13 +135,13 @@ namespace scribo ...@@ -111,13 +135,13 @@ namespace scribo
file << "<?xml version=\"1.0\"?>" << std::endl; file << "<?xml version=\"1.0\"?>" << std::endl;
if (extended_format) if (extended_format)
{ {
file << "<pcGts>" << std::endl; file << "<pcGts>" << std::endl;
} }
else else
{ {
file << "<pcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16 http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.xsd\" pcGtsId=\"" << input_name << "\">" << std::endl; file << "<pcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16 http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.xsd\" pcGtsId=\"" << doc.filename() << "\">" << std::endl;
} }
file << " <PcMetadata>" << std::endl; file << " <PcMetadata>" << std::endl;
file << " <PcCreator>LRDE</PcCreator>" << std::endl; file << " <PcCreator>LRDE</PcCreator>" << std::endl;
...@@ -126,110 +150,86 @@ namespace scribo ...@@ -126,110 +150,86 @@ namespace scribo
file << " <PcComments>Generated by Scribo from Olena.</PcComments>" << std::endl; file << " <PcComments>Generated by Scribo from Olena.</PcComments>" << std::endl;
file << " </PcMetadata>" << std::endl; file << " </PcMetadata>" << std::endl;
file << " <page image_filename=\"" << input_name file << " <page image_filename=\"" << doc.filename()
<< "\" image_width=\"" << lines.components().labeled_image().ncols() << "\" image_width=\"" << lines.components().labeled_image().ncols()
<< "\" image_height=\"" << lines.components().labeled_image().nrows() << "\" image_height=\"" << lines.components().labeled_image().nrows()
<< "\">" << std::endl; << "\">" << std::endl;
for_all_lines(l, lines) for_all_lines(l, lines)
{
if (! lines(l).is_valid()
|| lines(l).tag() != line::None
|| lines(l).type() != line::Text) // Is NOT a text line.
continue;
{ {
if (! lines(l).is_valid() file << " <text_region id=\"" << lines(l).id()
|| lines(l).tag() != line::None << "\" txt_orientation=\"" << lines(l).orientation()
|| lines(l).type() != line::Text) // Is NOT a text line. << "\" txt_reading_orientation=\"" << lines(l).reading_orientation()
continue; << "\" txt_reading_direction=\"" << lines(l).reading_direction()
<< "\" txt_text_type=\"" << lines(l).type()
<< "\" txt_reverse_video=\"" << (lines(l).reverse_video() ? "true" : "false")
<< "\" txt_indented=\"" << (lines(l).indented() ? "true" : "false")
<< "\" kerning=\"" << lines(l).char_space();
// EXTENSIONS - Not officially supported
if (extended_format)
{ {
file << " <text_region id=\"" << lines(l).id() file << "\" baseline=\"" << lines(l).baseline()
<< "\" txt_orientation=\"" << lines(l).orientation() << "\" meanline=\"" << lines(l).meanline()
<< "\" txt_reading_orientation=\"" << lines(l).reading_orientation() << "\" x_height=\"" << lines(l).x_height()
<< "\" txt_reading_direction=\"" << lines(l).reading_direction() << "\" d_height=\"" << lines(l).d_height()
<< "\" txt_text_type=\"" << lines(l).type() << "\" a_height=\"" << lines(l).a_height()
<< "\" txt_reverse_video=\"" << (lines(l).reverse_video() ? "true" : "false") << "\" char_width=\"" << lines(l).char_width();
<< "\" txt_indented=\"" << (lines(l).indented() ? "true" : "false") }
<< "\" kerning=\"" << lines(l).char_space(); // End of EXTENSIONS
file << "\">"
// EXTENSIONS - Not officially supported << std::endl;
if (extended_format)
{
file << "\" baseline=\"" << lines(l).baseline()
<< "\" meanline=\"" << lines(l).meanline()
<< "\" x_height=\"" << lines(l).x_height()
<< "\" d_height=\"" << lines(l).d_height()
<< "\" a_height=\"" << lines(l).a_height()
<< "\" char_width=\"" << lines(l).char_width();
}
// End of EXTENSIONS
file << "\">"
<< std::endl;
if (extended_format) internal::print_box_coords(file, lines(l).bbox(), " ");
{
file << " <coords>" << std::endl if (extended_format)
<< " <point x=\"" << lines(l).bbox().pmin().col() {
<< "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>" file << " <paragraph>" << std::endl;
<< std::endl
<< " <point x=\"" << lines(l).bbox().pmax().col() internal::print_box_coords(file, lines(l).bbox(), " ");
<< "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>"
<< std::endl if (lines(l).has_text())
<< " <point x=\"" << lines(l).bbox().pmax().col() {
<< "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>" std::string tmp = lines(l).text();
<< std::endl tmp = internal::html_markups_replace(tmp, html_map);
<< " <point x=\"" << lines(l).bbox().pmin().col()
<< "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>" file << " <line text=\""
<< std::endl << tmp
<< " </coords>" << std::endl; << "\">" << std::endl;
}
else
file << " <paragraph>" << std::endl; file << " <line>" << std::endl;
file << " <coords>" << std::endl internal::print_box_coords(file, lines(l).bbox(), " ");
<< " <point x=\"" << lines(l).bbox().pmin().col()
<< "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>"
<< std::endl
<< " <point x=\"" << lines(l).bbox().pmax().col()
<< "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>"
<< std::endl
<< " <point x=\"" << lines(l).bbox().pmax().col()
<< "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>"
<< std::endl
<< " <point x=\"" << lines(l).bbox().pmin().col()
<< "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>"
<< std::endl
<< " </coords>" << std::endl;
if (lines(l).has_text())
{
std::string tmp = lines(l).text();
tmp = internal::html_markups_replace(tmp, html_map);
file << " <line text=\""
<< tmp
<< "\">" << std::endl;
}
else
file << " <line>" << std::endl;
file << " <coords>" << std::endl
<< " <point x=\"" << lines(l).bbox().pmin().col()
<< "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>"
<< std::endl
<< " <point x=\"" << lines(l).bbox().pmax().col()
<< "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>"
<< std::endl
<< " <point x=\"" << lines(l).bbox().pmax().col()
<< "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>"
<< std::endl
<< " <point x=\"" << lines(l).bbox().pmin().col()
<< "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>"
<< std::endl
<< " </coords>" << std::endl;
file << " </line>" << std::endl;
file << " </paragraph>" << std::endl;
}
file << " </text_region>" << std::endl; file << " </line>" << std::endl;
file << " </paragraph>" << std::endl;
} }
file << " </text_region>" << std::endl;
}
}
const component_set<L>& elts = doc.elements();
for_all_comps(e, elts)
if (elts(e).is_valid())
{
file << " <image_region id=\"ir" << elts(e).id()
<< "\" img_colour_type=\"24_Bit_Colour\""
<< " img_orientation=\"0.000000\" "
<< " img_emb_text=\"No\" "
<< " img_bgcolour=\"White\">" << std::endl;
internal::print_box_coords(file, elts(e).bbox(), " ");
file << " </image_region>" << std::endl;
} }
file << " </page>" << std::endl; file << " </page>" << std::endl;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment