Commit 7c0d10c3 authored by Arthur Crepin-Leblond's avatar Arthur Crepin-Leblond Committed by Guillaume Lazzara
Browse files

XML transform program.

        * sandbox/arthur/xml_to_html/xml_to_html.hh,
        * sandbox/arthur/xml_to_html/xml_to_html.cc,
        * sandbox/arthur/xml_to_html/xml_to_html.pro: Remove.

        * sandbox/arthur/xml_to_html/README,
	* sandbox/arthur/xml_to_html/main.cc,
        * sandbox/arthur/xml_to_html/domitem.cc,
        * sandbox/arthur/xml_to_html/domitem.hh,
        * sandbox/arthur/xml_to_html/dommodel.cc,
        * sandbox/arthur/xml_to_html/dommodel.hh: Move to...

        * sandbox/arthur/xml_transform/README,
        * sandbox/arthur/xml_transform/main.cc,
        * sandbox/arthur/xml_transform/domitem.cc,
        * sandbox/arthur/xml_transform/domitem.hh,
        * sandbox/arthur/xml_transform/dommodel.cc,
        * sandbox/arthur/xml_transform/dommodel.hh: ...this.

        * sandbox/arthur/xml_to_html/patterns/css.css,
        * sandbox/arthur/xml_to_html/patterns/xsl.xsl: Move to...

        * sandbox/arthur/xml_transform/templates/html/css.css,
        * sandbox/arthur/xml_transform/templates/html/xsl.xsl: ...this.

        * sandbox/arthur/xml_transform/image_crop.cc,
        * sandbox/arthur/xml_transform/loader.cc: New.

        * sandbox/arthur/xml_transform/templates/html/html_generator.sh,
        * sandbox/arthur/xml_transform/templates/pdf/line.xsl,
        * sandbox/arthur/xml_transform/templates/pdf/main.xsl,
	* sandbox/arthur/xml_transform/templates/pdf/main_crop.xsl,
        * sandbox/arthur/xml_transform/templates/pdf/pdf_generator.sh,
	* sandbox/arthur/xml_transform/templates/pdf/regions.xsl,
        * sandbox/arthur/xml_transform/xml_transform.pro: New.
parent 4a399a3c
2010-07-08 Arthur Crepin-Leblond <crepin@ptaouchnok.lrde.epita.fr>
XML transform program.
* sandbox/arthur/xml_to_html/xml_to_html.hh,
* sandbox/arthur/xml_to_html/xml_to_html.cc,
* sandbox/arthur/xml_to_html/xml_to_html.pro: Remove.
* sandbox/arthur/xml_to_html/README,
* sandbox/arthur/xml_to_html/main.cc,
* sandbox/arthur/xml_to_html/domitem.cc,
* sandbox/arthur/xml_to_html/domitem.hh,
* sandbox/arthur/xml_to_html/dommodel.cc,
* sandbox/arthur/xml_to_html/dommodel.hh: Move to...
* sandbox/arthur/xml_transform/README,
* sandbox/arthur/xml_transform/main.cc,
* sandbox/arthur/xml_transform/domitem.cc,
* sandbox/arthur/xml_transform/domitem.hh,
* sandbox/arthur/xml_transform/dommodel.cc,
* sandbox/arthur/xml_transform/dommodel.hh: ...this.
* sandbox/arthur/xml_to_html/patterns/css.css,
* sandbox/arthur/xml_to_html/patterns/xsl.xsl: Move to...
* sandbox/arthur/xml_transform/templates/html/css.css,
* sandbox/arthur/xml_transform/templates/html/xsl.xsl: ...this.
* sandbox/arthur/xml_transform/image_crop.cc,
* sandbox/arthur/xml_transform/loader.cc: New.
* sandbox/arthur/xml_transform/templates/html/html_generator.sh,
* sandbox/arthur/xml_transform/templates/pdf/line.xsl,
* sandbox/arthur/xml_transform/templates/pdf/main.xsl,
* sandbox/arthur/xml_transform/templates/pdf/main_crop.xsl,
* sandbox/arthur/xml_transform/templates/pdf/pdf_generator.sh,
* sandbox/arthur/xml_transform/templates/pdf/regions.xsl,
* sandbox/arthur/xml_transform/xml_transform.pro: New.
2010-06-30 Arthur Crepin-Leblond <crepin@stockholm.lrde.epita.fr>
Extended XML mode support.
......
xml_to_html
Produce HTML view from ICDAR (or LRDE extended) XML.
*Compilation:
Just change the paths to olena and milena in xml_to_html.pro
*Usage:
-m, --merge: Merge an XML file with another XML file
args: <xml_file_1> <xml_file_2> [xml_output]
Useful to merge LRDE XML extended mode files with ICDAR XML files to combine both images and recognized text.
-c, --create-html: Crop images (if any) from the PPM file using the XML file and produce an HTML output.
args: <xml_file> [ppm_file] [output dir]
If no PPM file is given, the program will only produce an HTML output without images.
If a PPM file is given, the program will crop image_region, separator_region, graphic_regions, chart_region and table_region to output_dir/img/id.png
*Result:
To view the result run your internet browser on output_dir/output.xml (Does not work with Chrome on local, use a LAMPP server to fix this ;-) ).
\ No newline at end of file
#include <iostream>
#include "xml_to_html.hh"
int main(int argc, char **argv)
{
std::string man;
man = "xml_to_html: ...usage\n -m, --merge: Merge an XML with an other XML\n \targs: <xml_file_1> <xml_file_2> [xml_output]\n\n -c, --create-html: Crop images (if exist) from the ppm file using the XML file and produce an html output.\n \targs: <xml_file> [ppm_file] [output dir]\n";
if (argc > 2)
{
new QApplication::QApplication(argc, argv, false);
ImageCrop crop;
std::string merge = "--merge";
std::string m = "-m";
std::string scrop = "--create-html";
std::string c = "-c";
if (argc == 3 && (scrop.compare(argv[1]) == 0 || c.compare(argv[1]) == 0))
{
bool b = false;
b = crop.load_xml(argv[2], QString::Null());
if (b)
crop.find_image_regions();
}
else if (argc > 3)
{
if (merge.compare(argv[1]) == 0 || m.compare(argv[1]) == 0)
{
if (argc > 4)
crop.merge(argv[2], argv[3], argv[4]);
else
crop.merge(argv[2], argv[3]);
}
else if (scrop.compare(argv[1]) == 0 || c.compare(argv[1]) == 0)
{
bool b = false;
if (argc > 4)
b = crop.load_xml(argv[2], argv[3], argv[4]);
else
b = crop.load_xml(argv[2], argv[3]);
if (b)
crop.find_image_regions();
}
else
std::cout << man;
}
else
std::cout << man;
}
else
std::cout << man;
return 0;
}
xml_transform <option> <xml_file> <ppm_file> <output_dir>
Transforms an ICDAR XML file into different kinds of output.
OPTIONS:
--html: Produce a HTML output
Once the process is finished, open output.xml with your web browser. If the browser supports XSLT, the result is displayed directly; if it does not (like Konqueror, or Chrome on local files), run `sh html_generator.sh` (xsltproc package required) to generate an HTML file readable by all browsers.
--pdf: Produce a PDF output, regions will be cropped as follows in the XML file.
--pdf-no-crop: Produce a PDF output without cropping regions, the entire picture will be displayed.
PDF is not directly created once the process is finished, to produce it, go to the output_dir and run `sh pdf_generator.sh` (fop >= 0.95 required).
BUILD:
Change the environment variable QMAKE_CXXFLAGS to indicate the correct paths to milena and olena, then just type qmake and make.
\ No newline at end of file
......@@ -23,8 +23,10 @@
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
# include "xml_to_html.hh"
# include "image_crop.hh"
# include "loader.hh"
# include "dommodel.hh"
# include <limits.h>
#include <scribo/preprocessing/crop.hh>
......@@ -34,8 +36,7 @@
#include <mln/io/magick/save.hh>
#include <mln/io/ppm/all.hh>
ImageCrop::ImageCrop():
regions_(false)
ImageCrop::ImageCrop()
{
}
......@@ -43,162 +44,48 @@ ImageCrop::~ImageCrop()
{
}
void ImageCrop::merge(QString in, QString other, QString output)
void ImageCrop::save_image(QString image, QString output)
{
if (in.endsWith(".xml") && other.endsWith(".xml") && output.endsWith(".xml"))
{
QFile f_in(in);
QFile f_other(other);
QFile f_output(output);
qDebug() << "Merging to " << output << "...";
f_in.open(QIODevice::ReadOnly);
f_other.open(QIODevice::ReadOnly);
f_output.open(QIODevice::ReadWrite);
QTextStream stream_in(&f_in);
QTextStream stream_other(&f_other);
QTextStream output_stream(&f_output);
QString line = stream_in.readLine();
output_stream << line;
line = stream_in.readLine();
while (!line.contains("</page>"))
{
output_stream << "\n" << line;
line = stream_in.readLine();
}
line = stream_other.readLine();
while (!line.contains("<page"))
line = stream_other.readLine();
line = stream_other.readLine();
while (!line.contains("</pcGts>"))
{
output_stream << "\n" << line;
line = stream_other.readLine();
}
using namespace mln;
output_stream << "\n" << line;
image2d<value::rgb8> ima;
io::ppm::load(ima, image.toStdString());
f_in.close();
f_other.close();
f_output.close();
}
else
std::cout << "merge : error, only xml files are needed." << std::endl;
io::magick::save(ima, output.toStdString() + "img/image.png");
}
bool ImageCrop::load_xml(QString xml_file, QString image_file, QString path)
void ImageCrop::crop_regions(QString xml_file, QString image_file, QString output)
{
if (xml_file.endsWith(".xml") && (image_file.endsWith(".ppm") || image_file == QString::Null()) )
{
image_file_ = image_file;
regions_ = (image_file != QString::Null());
path_ = path;
if (!path_.endsWith("/"))
path_.append("/");
QDir dir(path_);
if (!dir.mkpath("img"))
{
path_ = "output/";
dir.mkpath(path_);
dir.setPath(path_);
dir.mkpath("img");
}
if (QFile::exists(xml_file))
{
QFile file(xml_file);
QFile output(path_ + "output.xml");
QFile xsl("patterns/xsl.xsl");
QFile css("patterns/css.css");
xsl.copy(path_ + "xsl.xsl");
css.copy(path_ + "css.css");
if (QFile::exists(path_ + "output.xml"))
output.remove();
output.open(QIODevice::ReadWrite);
file.open(QIODevice::ReadOnly);
QTextStream stream_in(&file);
QTextStream stream_out(&output);
stream_in.setCodec("UTF-8");
stream_out.setCodec("UTF-8");
Loader loader;
QString line = stream_in.readLine();
stream_out << line;
stream_out << "\n<?xml-stylesheet type=\"text/xsl\" href=\"xsl.xsl\" ?>";
QFile f(image_file);
// /!\ attributes of ICDAR PcGts removed.
line = stream_in.readLine();
stream_out << "\n<pcGts>";
line = stream_in.readLine();
while (!line.contains("</pcGts>"))
{
stream_out << "\n" << line;
line = stream_in.readLine();
}
stream_out << "\n" << line;
output.close();
file.close();
qDebug() << "Output saved to " + path_;
if (file.open(QIODevice::ReadOnly))
{
QDomDocument document;
if (document.setContent(&file))
{
layout_ = new DomModel(document, this);
}
else
qDebug() << "Error while loading the XML file, please choose another.";
file.close();
}
}
return true;
if (!f.exists())
{
qDebug() << "Image doesn't exist !";
abort();
}
else
{
std::cout << "Wrong files format !" << std::endl;
return false;
}
}
void ImageCrop::find_image_regions()
{
if (layout_ && regions_)
DomModel* layout = loader.xml_to_dom(xml_file);
if (layout)
{
QModelIndex pgGts = layout_->index(1, 0);
QModelIndex page = layout_->index(1, 0, pgGts);
QModelIndex pcGts = layout->index(1, 0);
QModelIndex page = layout->index(1, 0, pcGts);
QModelIndex region;
QModelIndex attributes;
QModelIndex coords;
QModelIndex point;
bool regions_found = false;
for (int i = 0; true; ++i)
{
region = layout_->index(i, 0, page);
attributes = layout_->index(i, 1, page);
QString name = layout_->data(region, Qt::DisplayRole).toString();
coords = layout_->index(0, 0, region);
region = layout->index(i, 0, page);
attributes = layout->index(i, 1, page);
QString name = layout->data(region, Qt::DisplayRole).toString();
coords = layout->index(0, 0, region);
bool is_image_region = name == QString("image_region")
bool is_region =
name == QString("image_region")
|| name == QString("graphic_region")
|| name == QString("separator_region")
|| name == QString("chart_region")
......@@ -207,18 +94,17 @@ void ImageCrop::find_image_regions()
if (!region.isValid() || !coords.isValid())
break;
if (is_image_region)
if (is_region)
{
regions_found = true;
QMap<QString, QVariant> data =
layout_->data(attributes, Qt::UserRole).toMap();
layout->data(attributes, Qt::UserRole).toMap();
QString id;
QMap<QString, QVariant>::iterator it = data.find("id");
if (it == data.end() || it.key() != "id")
qDebug() << "No image region.";
qDebug() << "WTF_Error : No image region.";
while (it != data.end() && it.key() == "id")
{
......@@ -237,12 +123,12 @@ void ImageCrop::find_image_regions()
for (int j = 0; true; ++j)
{
// Navigate to the coordinate list
point = layout_->index(j, 1, coords);
point = layout->index(j, 1, coords);
if (!point.isValid())
break;
QMap<QString, QVariant> data =
layout_->data(point, Qt::UserRole).toMap();
layout->data(point, Qt::UserRole).toMap();
int x = data["x"].toInt();
int y = data["y"].toInt();
......@@ -265,7 +151,7 @@ void ImageCrop::find_image_regions()
box2d box = make::box2d(y_min, x_min, y_max, x_max);
image2d<value::rgb8> ima;
io::ppm::load(ima, image_file_.toStdString());
io::ppm::load(ima, image_file.toStdString());
ima = scribo::preprocessing::crop(ima, box);
// image2d<bool> mask = make::box2d(y_min, x_min, y_max, x_max);
......@@ -273,20 +159,26 @@ void ImageCrop::find_image_regions()
/*for (int a = 1; a < vect.size(); ++a)
{
int x = vect[a]["x"].toInt();
int y = vect[a]["y"].toInt();
data::fill((mask | make::box2d(y, x, y, x)).rw(), false);
}*/
int x = vect[a]["x"].toInt();
int y = vect[a]["y"].toInt();
data::fill((mask | make::box2d(y, x, y, x)).rw(), false);
}*/
// io::pbm::save(mask, "output/img/mask_" + id.toStdString());
io::magick::save(ima, path_.toStdString() + "img/" + id.toStdString() + ".png");
io::magick::save(ima, output.toStdString() + "img/" + id.toStdString() + ".png");
}
}
if (!regions_found)
qDebug() << "No regions found.";
}
else
{
qDebug() << "Error with XML file.";
}
}
......@@ -34,16 +34,13 @@ class ImageCrop : public QObject
{
Q_OBJECT
public:
ImageCrop();
~ImageCrop();
void merge(QString in, QString other, QString output = "output.xml");
bool load_xml(QString xml_file, QString image_file = "", QString path = "output");
void find_image_regions();
private:
DomModel* layout_;
QString image_file_;
QString path_;
bool regions_;
void save_image(QString image, QString output);
void crop_regions(QString xml_file, QString image_file, QString output);
};
#endif /* !IMAGE_CROP_HH */
// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
//
// This file is part of Olena.
//
// Olena is free software: you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation, version 2 of the License.
//
// Olena is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Olena. If not, see <http://www.gnu.org/licenses/>.
//
// As a special exception, you may use this file as part of a free
// software project without restriction. Specifically, if other files
// instantiate templates or use macros or inline functions from this
// file, or you compile this file and link it with other files to produce
// an executable, this file does not by itself cause the resulting
// executable to be covered by the GNU General Public License. This
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
# include "loader.hh"
# include "dommodel.hh"
# include "xml_transform.hh"
// Default constructor: performs no work.
Loader::Loader() {}
// Destructor: performs no work.
Loader::~Loader() {}
// Parse the XML file located at `output` and wrap it in a DomModel.
//
// Returns a DomModel allocated with `new` on success, or 0 when the file
// cannot be opened for reading or its content is rejected by
// QDomDocument::setContent. Nothing in this function releases the returned
// model (presumably the caller owns it; confirm at the call sites).
DomModel* Loader::xml_to_dom(QString output)
{
  QFile xml(output);
  if (!xml.open(QIODevice::ReadOnly))
    return 0;

  QDomDocument dom;
  if (!dom.setContent(&xml))
    return 0;

  return new DomModel(dom);
}
// Normalise the output directory path and make sure the directory exists.
//
// `output` is modified in place: a trailing "/" is appended when missing.
// Returns the result of QDir::mkpath when the directory had to be created.
// Otherwise returns true when listing the directory with the
// Writable | AllDirs filters yields at least one entry.
// NOTE(review): this looks like a writability check; confirm that this
// filter combination is the intended test.
bool Loader::set_output(QString& output)
{
  // The directory handle is built from the path as it was passed in.
  QDir target(output);

  if (!output.endsWith("/"))
    output.append("/");

  if (target.exists())
    return !target.entryList(QDir::Writable | QDir::AllDirs).isEmpty();

  return target.mkpath(".");
}
// Copy the HTML template files (generator script, stylesheet and XSL
// transform) from templates/html/ into `output`.
//
// `output` is used as a plain string prefix, so it has to end with a path
// separator already. The result of each copy is not checked.
void Loader::add_html_templates(QString output)
{
  QFile::copy("templates/html/html_generator.sh", output + "html_generator.sh");
  QFile::copy("templates/html/css.css", output + "css.css");
  QFile::copy("templates/html/xsl.xsl", output + "xsl.xsl");
}
void Loader::add_pdf_templates(bool crop, QString output)
{
QFile regions("templates/pdf/regions.xsl");
regions.copy(output + "regions.xsl");
QFile gen("templates/pdf/pdf_generator.sh");
gen.copy(output + "pdf_generator.sh");
QFile line("templates/pdf/line.xsl");
line.copy(output + "line.xsl");
if (crop)
{
QFile xsl("templates/pdf/main_crop.xsl");
xsl.copy(output + "main.xsl");
}
else
{
QFile xsl("templates/pdf/main.xsl");
xsl.copy(output + "main.xsl");
}
}
bool Loader::load_xml(QString xml_file, bool html, QString output)
{
QFile file(xml_file);
if (file.exists())
{
file.open(QIODevice::ReadOnly);
set_output(output);
QFile out_file(output + "output.xml");
out_file.open(QIODevice::ReadWrite);
QTextStream stream_in(&file);
QTextStream stream_out(&out_file);
QString line = stream_in.readLine();
while(!line.contains("<?xml"))
line = stream_in.readLine();
stream_out << line;
if (html)
stream_out << "\n<?xml-stylesheet type=\"text/xsl\" href=\"xsl.xsl\" ?>";
// /!\ attributes of ICDAR PcGts removed.
while(!line.contains("<pcGts"))
line = stream_in.readLine();