XML Worker examples

Tags: languages, text extraction, parsing PDF, FontProvider, iText 5, XML Worker, iText 5 examples

These examples were written in the context of an old XML Worker tutorial.

They are used in answer to questions such as:

Files: 
package sandbox.xmlworker;
 
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
 
import org.jsoup.Jsoup;
 
/**
 * Converts an HTML file into an XTHML file.
 */
public class D00_XHTML {
 
    /** The name of a HTML file */
    public static final String WALDEN = "resources/html/walden.html";
    /** The name of a HTML file */
    public static final String THOREAU = "resources/html/thoreau.html";
 
    /** The main method. */
    public static void main(String[] args) throws IOException {
        tidyUp(WALDEN);
        tidyUp(THOREAU);
    }
 
    public static void tidyUp(String path) throws IOException {
        File html = new File(path);
        byte[] xhtml = Jsoup.parse(html, "US-ASCII").html().getBytes();
        File dir = new File("results/xml");
        dir.mkdirs();
        FileOutputStream fos = new FileOutputStream(new File(dir, html.getName()));
        fos.write(xhtml);
        fos.close();
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Element;
import com.itextpdf.tool.xml.ElementHandler;
import com.itextpdf.tool.xml.Writable;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.pipeline.WritableElement;
import sandbox.WrapToTest;
 
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
 
//without @WrapToTest annotation, because this test only illustrated custom element handler
public class D01_CustomElementHandler {
 
    public static final String SRC = "resources/xml/walden.html";
 
    public static void main(String[] args) throws IOException {
        XMLWorkerHelper.getInstance().parseXHtml(new ElementHandler() {
            public void add(final Writable w) {
                if (w instanceof WritableElement) {
                    List<Element> elements = ((WritableElement) w).elements();
                    for (Element element : elements) {
                        System.out.println(element.getClass().getName());
                    }
                }
 
            }
        }, new FileInputStream(SRC), null);
    }
 
 
 
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D02_ParseHtml {
 
    public static final String HTML = "resources/xml/walden.html";
    public static final String DEST = "results/xmlworker/walden1.pdf";
 
    /**
     * Html to pdf conversion example.
     * @param file
     * @throws IOException
     * @throws DocumentException
     */
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        // step 3
        document.open();
        // step 4
        XMLWorkerHelper.getInstance().parseXHtml(writer, document,
                new FileInputStream(HTML));
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D02_ParseHtml().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D03_ParseHtmlPipelines {
 
    public static final String HTML = "resources/xml/walden.html";
    public static final String DEST = "results/xmlworker/walden2.pdf";
 
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
 
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        writer.setInitialLeading(12.5f);
 
        // step 3
        document.open();
 
        // step 4
 
        // CSS
        CSSResolver cssResolver =
                XMLWorkerHelper.getInstance().getDefaultCssResolver(false);
 
        // HTML
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(null);
        htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
        htmlContext.autoBookmark(false);
 
        // Pipelines
        PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
        HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
        CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
 
        // XML Worker
        XMLWorker worker = new XMLWorker(css, true);
        XMLParser p = new XMLParser(worker);
        p.parse(new FileInputStream(HTML));
 
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D03_ParseHtmlPipelines().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.css.CssFile;
import com.itextpdf.tool.xml.css.StyleAttrCSSResolver;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D04_ParseHtmlCss {
 
    public static final String HTML = "resources/xml/walden.html";
    public static final String CSS = "resources/xml/walden.css";
    public static final String DEST = "results/xmlworker/walden3.pdf";
 
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
 
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        writer.setInitialLeading(12.5f);
 
        // step 3
        document.open();
 
        // step 4
 
        // CSS
        CSSResolver cssResolver = new StyleAttrCSSResolver();
        CssFile cssFile = XMLWorkerHelper.getCSS(new FileInputStream(CSS));
        cssResolver.addCss(cssFile);
 
        // HTML
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(null);
        htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
 
        // Pipelines
        PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
        HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
        CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
 
        // XML Worker
        XMLWorker worker = new XMLWorker(css, true);
        XMLParser p = new XMLParser(worker);
        p.parse(new FileInputStream(HTML));
 
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D04_ParseHtmlCss().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.net.FileRetrieve;
import com.itextpdf.tool.xml.net.FileRetrieveImpl;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D05_ParseHtmlCssLink {
 
    public static final String HTML = "resources/xml/test.html";
    public static final String CSS_DIR = "resources/xml/";
    public static final String DEST = "results/xmlworker/test.pdf";
 
    /**
     * @param file
     * @throws IOException
     * @throws DocumentException
     */
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        writer.setInitialLeading(12.5f);
        // step 3
        document.open();
        // step 4
 
        // CSS
        CSSResolver cssResolver =
                XMLWorkerHelper.getInstance().getDefaultCssResolver(false);
        FileRetrieve retrieve = new FileRetrieveImpl(CSS_DIR);
        cssResolver.setFileRetrieve(retrieve);
 
        // HTML
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(null);
        htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
        htmlContext.autoBookmark(false);
 
        // Pipelines
        PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
        HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
        CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
 
        // XML Worker
        XMLWorker worker = new XMLWorker(css, true);
        XMLParser p = new XMLParser(worker);
        p.parse(new FileInputStream(HTML));
 
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D05_ParseHtmlCssLink().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerFontProvider;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.css.CssFile;
import com.itextpdf.tool.xml.css.StyleAttrCSSResolver;
import com.itextpdf.tool.xml.html.CssAppliers;
import com.itextpdf.tool.xml.html.CssAppliersImpl;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D06_ParseHtmlFonts {
 
    public static final String HTML = "resources/xml/walden.html";
    public static final String CSS = "resources/xml/walden.css";
    public static final String DEST = "results/xmlworker/walden4.pdf";
 
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
 
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        writer.setInitialLeading(12.5f);
 
        // step 3
        document.open();
 
        // step 4
 
        // CSS
        CSSResolver cssResolver = new StyleAttrCSSResolver();
        CssFile cssFile = XMLWorkerHelper.getCSS(new FileInputStream(CSS));
        cssResolver.addCss(cssFile);
 
        // HTML
        XMLWorkerFontProvider fontProvider = new XMLWorkerFontProvider(XMLWorkerFontProvider.DONTLOOKFORFONTS);
        fontProvider.register("resources/fonts/Cardo-Regular.ttf");
        fontProvider.register("resources/fonts/Cardo-Bold.ttf");
        fontProvider.register("resources/fonts/Cardo-Italic.ttf");
        fontProvider.addFontSubstitute("lowagie", "cardo");
        CssAppliers cssAppliers = new CssAppliersImpl(fontProvider);
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(cssAppliers);
        htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
 
        // Pipelines
        PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
        HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
        CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
 
        // XML Worker
        XMLWorker worker = new XMLWorker(css, true);
        XMLParser p = new XMLParser(worker);
        p.parse(new FileInputStream(HTML));
 
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D06_ParseHtmlFonts().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
 
public class D07_ParseHtmlAsian {
 
    public static final String HTML = "resources/xml/hero.html";
    public static final String DEST = "results/xmlworker/hero.pdf";
 
    /**
     * Creates a PDF with the words "Hello World"
     * @param file
     * @throws IOException
     * @throws DocumentException
     */
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        // step 3
        document.open();
        // step 4
        XMLWorkerHelper.getInstance().parseXHtml(writer, document,
                new FileInputStream(HTML), Charset.forName("UTF-8"));
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D07_ParseHtmlAsian().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerFontProvider;
import com.itextpdf.tool.xml.css.StyleAttrCSSResolver;
import com.itextpdf.tool.xml.html.CssAppliers;
import com.itextpdf.tool.xml.html.CssAppliersImpl;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D07bis_ParseHtmlAsian {
 
    public static final String HTML = "resources/xml/hero.html";
    public static final String DEST = "results/xmlworker/asian.pdf";
 
    /**
     * Creates a PDF with the words "Hello World"
     * @param file
     * @throws IOException
     * @throws DocumentException
     */
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        // step 3
        document.open();
        // step 4
        // CSS
        CSSResolver cssResolver = new StyleAttrCSSResolver();
 
        // HTML
        XMLWorkerFontProvider fontProvider = new XMLWorkerFontProvider(XMLWorkerFontProvider.DONTLOOKFORFONTS);
        fontProvider.register("resources/fonts/cfmingeb.ttf", "MS Mincho");
        fontProvider.register("resources/fonts/PT_Serif-Web-Regular.ttf", "Serif");
        CssAppliers cssAppliers = new CssAppliersImpl(fontProvider);
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(cssAppliers);
        htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
 
        // Pipelines
        PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
        HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
        CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
 
        // XML Worker
        XMLWorker worker = new XMLWorker(css, true);
        XMLParser p = new XMLParser(worker);
        p.parse(new FileInputStream(HTML), Charset.forName("UTF-8"));
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D07bis_ParseHtmlAsian().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerFontProvider;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.css.CssFile;
import com.itextpdf.tool.xml.css.StyleAttrCSSResolver;
import com.itextpdf.tool.xml.html.CssAppliers;
import com.itextpdf.tool.xml.html.CssAppliersImpl;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
 
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D07tris_ParseHtmlAsian {
 
    public static final String HTML = "resources/xml/hero2.html";
    public static final String DEST = "results/xmlworker/asian2.pdf";
 
    /**
     * Creates a PDF with the words "Hello World"
     * @param file
     * @throws IOException
     * @throws DocumentException
     */
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        // step 3
        document.open();
        // step 4
        // CSS
        CSSResolver cssResolver = new StyleAttrCSSResolver();
        CssFile cssFile = XMLWorkerHelper.getCSS(new ByteArrayInputStream("body {font-family:tsc fming s tt}".getBytes()));
        cssResolver.addCss(cssFile);
 
        // HTML
        XMLWorkerFontProvider fontProvider = new XMLWorkerFontProvider(XMLWorkerFontProvider.DONTLOOKFORFONTS);
        fontProvider.register("resources/fonts/cfmingeb.ttf");
        CssAppliers cssAppliers = new CssAppliersImpl(fontProvider);
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(cssAppliers);
        htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
 
        // Pipelines
        PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
        HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
        CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
 
        // XML Worker
        XMLWorker worker = new XMLWorker(css, true);
        XMLParser p = new XMLParser(worker);
        p.parse(new FileInputStream(HTML), Charset.forName("UTF-8"));
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D07tris_ParseHtmlAsian().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D08_ParseHtmlImagesLinksOops {
 
    public static final String HTML = "resources/xml/thoreau.html";
    public static final String DEST = "results/xmlworker/thoreau_oops.pdf";
 
    /**
     * Creates a PDF with the words "Hello World"
     * @param file
     * @throws IOException
     * @throws DocumentException
     */
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        // step 3
        document.open();
        // step 4
        XMLWorkerHelper.getInstance().parseXHtml(writer, document,
                new FileInputStream(HTML), Charset.forName("UTF-8"));
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D08_ParseHtmlImagesLinksOops().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.AbstractImageProvider;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
import com.itextpdf.tool.xml.pipeline.html.LinkProvider;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D09_ParseHtmlImagesLinks {
 
    public static final String HTML = "resources/xml/thoreau.html";
    public static final String DEST = "results/xmlworker/thoreau.pdf";
    public static final String IMG_PATH = "resources/xml/";
    public static final String RELATIVE_PATH = "../../resources/xml/";
 
    /**
     * Creates a PDF with the words "Hello World"
     * @param file
     * @throws IOException
     * @throws DocumentException
     */
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        // step 3
        document.open();
        // step 4
 
        // CSS
        CSSResolver cssResolver =
                XMLWorkerHelper.getInstance().getDefaultCssResolver(true);
 
        // HTML
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(null);
        htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
        htmlContext.setImageProvider(new AbstractImageProvider() {
            public String getImageRootPath() {
                return IMG_PATH;
            }
        });
        htmlContext.setLinkProvider(new LinkProvider() {
            public String getLinkRoot() {
                return RELATIVE_PATH;
            }
        });
 
        // Pipelines
        PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
        HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
        CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
 
        // XML Worker
        XMLWorker worker = new XMLWorker(css, true);
        XMLParser p = new XMLParser(worker);
        p.parse(new FileInputStream(HTML));
 
        // step 5
        document.close();
    }
 
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D09_ParseHtmlImagesLinks().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Element;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.Tag;
import com.itextpdf.tool.xml.WorkerContext;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.html.Div;
import com.itextpdf.tool.xml.html.TagProcessorFactory;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.net.FileRetrieve;
import com.itextpdf.tool.xml.net.FileRetrieveImpl;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
 
public class D10_ParseCustomTag {
 
    public static final String HTML = "resources/xml/test.html";
    public static final String DEST = "results/xmlworker/date.pdf";
    public static final String CSS_DIR = "resources/xml/";
 
    /**
     * Creates a PDF with the words "Hello World"
     * @param file
     * @throws IOException
     * @throws DocumentException
     */
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        writer.setInitialLeading(12.5f);
        // step 3
        document.open();
        // step 4
 
        // CSS
        CSSResolver cssResolver =
                XMLWorkerHelper.getInstance().getDefaultCssResolver(false);
        FileRetrieve retrieve = new FileRetrieveImpl(CSS_DIR);
        cssResolver.setFileRetrieve(retrieve);
 
        // HTML
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(null);
        TagProcessorFactory factory = Tags.getHtmlTagProcessorFactory();
        factory.addProcessor(
            new Div(){
                public List<Element> end(WorkerContext ctx, Tag tag, List<Element> l) {
                    List<Element> list = new ArrayList<Element>(1);
                    String date = DateFormat.getDateInstance(DateFormat.LONG, Locale.US).format(new Date());
                    list.add(new Paragraph(date));
                    return list;
                }
            },
            "date");
        htmlContext.setTagFactory(factory);
        htmlContext.autoBookmark(false);
 
        // Pipelines
        PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
        HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
        CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
 
        // XML Worker
        XMLWorker worker = new XMLWorker(css, true);
        XMLParser p = new XMLParser(worker);
        p.parse(new FileInputStream(HTML));
 
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D10_ParseCustomTag().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Element;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.ColumnText;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.ElementList;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.ElementHandlerPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D11_ParseHtmlObjects {
 
    public static final String HTML = "resources/xml/walden.html";
    public static final String DEST = "results/xmlworker/walden5.pdf";
 
    public void createPdf(String file) throws IOException, DocumentException {
 
        // Parse HTML into Element list
 
        // CSS
        CSSResolver cssResolver =
                XMLWorkerHelper.getInstance().getDefaultCssResolver(true);
 
        // HTML
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(null);
        htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
        htmlContext.autoBookmark(false);
 
        // Pipelines
        ElementList elements = new ElementList();
        ElementHandlerPipeline end = new ElementHandlerPipeline(elements, null);
        HtmlPipeline html = new HtmlPipeline(htmlContext, end);
        CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
 
        // XML Worker
        XMLWorker worker = new XMLWorker(css, true);
        XMLParser p = new XMLParser(worker);
        p.parse(new FileInputStream(HTML));
 
        // step 1
        Document document = new Document(PageSize.LEGAL.rotate());
 
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        writer.setInitialLeading(12.5f);
 
        // step 3
        document.open();
 
        // step 4
        Rectangle left = new Rectangle(36, 36, 486, 586);
        Rectangle right = new Rectangle(522, 36, 972, 586);
        ColumnText column = new ColumnText(writer.getDirectContent());
        column.setSimpleColumn(left);
        boolean leftside = true;
        int status = ColumnText.START_COLUMN;
        for (Element e : elements) {
            if (ColumnText.isAllowedElement(e)) {
                column.addElement(e);
                status = column.go();
                while (ColumnText.hasMoreText(status)) {
                    if (leftside) {
                        leftside = false;
                        column.setSimpleColumn(right);
                    }
                    else {
                        document.newPage();
                        leftside = true;
                        column.setSimpleColumn(left);
                    }
                    status = column.go();
                }
            }
        }
 
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D11_ParseHtmlObjects().createPdf(DEST);
    }
}
package sandbox.xmlworker;
 
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.CustomContext;
import com.itextpdf.tool.xml.Pipeline;
import com.itextpdf.tool.xml.PipelineException;
import com.itextpdf.tool.xml.ProcessObject;
import com.itextpdf.tool.xml.Tag;
import com.itextpdf.tool.xml.WorkerContext;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.AbstractPipeline;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
 
import sandbox.WrapToTest;
 
@WrapToTest
public class D12_ParseHtmlCustomPipeline {
 
    public static final String HTML = "resources/xml/walden.html";
    public static final String DEST = "results/xmlworker/walden6.pdf";
 
    class CustomPipeline extends AbstractPipeline<CustomContext> {
 
        private int indent = -1;
 
        /* (non-Javadoc)
         * @see com.itextpdf.tool.xml.pipeline.AbstractPipeline#open(com.itextpdf.tool.xml.WorkerContext, com.itextpdf.tool.xml.Tag, com.itextpdf.tool.xml.ProcessObject)
         */
        @Override
        public Pipeline<?> open(WorkerContext context, Tag t, ProcessObject po)
                throws PipelineException {
            indent++;
            for (int i = 0; i < indent; i++)
                System.out.print("\t");
            System.out.println("<" + t.getName() + ">");
            return super.open(context, t, po);
        }
 
        /* (non-Javadoc)
         * @see com.itextpdf.tool.xml.pipeline.AbstractPipeline#close(com.itextpdf.tool.xml.WorkerContext, com.itextpdf.tool.xml.Tag, com.itextpdf.tool.xml.ProcessObject)
         */
        @Override
        public Pipeline<?> close(WorkerContext context, Tag t, ProcessObject po)
                throws PipelineException {
            for (int i = 0; i < indent; i++)
                System.out.print("\t");
            System.out.println("</" + t.getName() + ">");
            indent--;
            return super.close(context, t, po);
        }
 
        public CustomPipeline(Pipeline<?> next) {
            super(next);
        }
 
    }
 
    public void createPdf(String file) throws IOException, DocumentException {
        // step 1
        Document document = new Document();
 
        // step 2
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(file));
        writer.setInitialLeading(12.5f);
 
        // step 3
        document.open();
 
        // step 4
 
        // CSS
        CSSResolver cssResolver =
                XMLWorkerHelper.getInstance().getDefaultCssResolver(true);
 
        // HTML
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(null);
        htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
        htmlContext.autoBookmark(false);
 
        // Pipelines
        PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
        CustomPipeline custom = new CustomPipeline(pdf);
        HtmlPipeline html = new HtmlPipeline(htmlContext, custom);
        CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
 
        // XML Worker
        XMLWorker worker = new XMLWorker(css, true);
        XMLParser p = new XMLParser(worker);
        p.parse(new FileInputStream(HTML));
 
        // step 5
        document.close();
    }
 
    /**
     * Main method
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new D12_ParseHtmlCustomPipeline().createPdf(DEST);
    }
}
Resources: 
<!-- Sample page: links the external stylesheet walden.css, declares an inline
     style rule for anchors, overrides that rule with style attributes on two
     of the links, and ends with a non-standard "date" element. -->
<html>
<head>
	<link rel="StyleSheet" href="walden.css" type="text/css" />
	<style type="text/css">
		a {
			text-decoration: none;
			font-style: italic;
		}
	</style>
</head>
<body>
	<p>This is a test with an external stylesheet.</p>
	<p>This is a <a href="http://itextpdf.com/">link</a>.</p>
	<p>This is another link <a style="color:orange" href="http://lowagie.com/">link</a>.</p>
	<p>This is another link <a style="font-style:normal" href="http://lowagie.com/">link</a>.</p>
	<date />
</body>
</html>
<!-- One paragraph made of spans that alternate between the MS Mincho font
     family (CJK text) and the Times New Roman font family (Latin text). -->
<p><span style="font-size:12.0pt; font-family:MS Mincho">長空</span>
<span style="font-size:12.0pt; font-family:Times New Roman,serif">(Broken Sword),</span>
<span style="font-size:12.0pt; font-family:MS Mincho">秦王殘劍</span>
<span style="font-size:12.0pt; font-family:Times New Roman,serif">(Flying Snow),</span>
<span style="font-size:12.0pt; font-family:MS Mincho">飛雪</span>
<span style="font-size:12.0pt; font-family:Times New Roman,serif">(Moon), </span>
<span style="font-size:12.0pt; font-family:MS Mincho">如月</span>
<span style="font-size:12.0pt; font-family:Times New Roman,serif">(the King), and</span>
<span style="font-size:12.0pt; font-family:MS Mincho">秦王</span>
<span style="font-size:12.0pt; font-family:Times New Roman,serif">(Sky).</span></p>
<!-- One paragraph mixing CJK and Latin text, with no font-family styling. -->
<body><p>長空 (Broken Sword), 秦王殘劍 (Flying Snow), 飛雪 (Moon), 如月 (the King), and 秦王 (Sky).</p></body>
File name | Raw URL | Updated
walden.html | walden.html | 2015-10-24 5:04 pm
walden.html | walden.html | 2015-10-24 5:06 pm
walden.css | walden.css | 2015-10-24 5:04 pm
thoreau.html | thoreau.html | 2015-10-24 5:04 pm
thoreau.html | thoreau.html | 2015-10-24 5:06 pm
435px-VII.jpg | 435px-VII.jpg | 2015-10-24 5:05 pm
486px-Henry_David_Thoreau.jpg | 486px-Henry_David_Thoreau.jpg | 2015-10-24 5:05 pm
Henry_David_Thoreau_1861.jpg | Henry_David_Thoreau_1861.jpg | 2015-10-24 5:05 pm
test.html | test.html | 2015-10-24 5:04 pm
hero.html | hero.html | 2015-10-24 5:04 pm
hero2.html | hero2.html | 2015-10-24 5:04 pm
Results: 
File name | Raw URL | Updated
cmp_walden1.pdf | cmp_walden1.pdf | 2015-11-05 3:03 pm
cmp_walden2.pdf | cmp_walden2.pdf | 2015-11-05 3:03 pm
cmp_walden3.pdf | cmp_walden3.pdf | 2015-11-05 3:03 pm
cmp_test.pdf | cmp_test.pdf | 2015-11-05 3:03 pm
cmp_walden4.pdf | cmp_walden4.pdf | 2015-11-05 3:03 pm
cmp_hero.pdf | cmp_hero.pdf | 2015-11-05 3:03 pm
cmp_asian.pdf | cmp_asian.pdf | 2015-11-05 3:03 pm
cmp_asian2.pdf | cmp_asian2.pdf | 2015-11-05 3:03 pm
cmp_thoreau_oops.pdf | cmp_thoreau_oops.pdf | 2015-11-05 3:03 pm
cmp_thoreau.pdf | cmp_thoreau.pdf | 2015-11-05 3:03 pm
cmp_walden5.pdf | cmp_walden5.pdf | 2015-11-05 3:03 pm
cmp_walden6.pdf | cmp_walden6.pdf | 2015-11-05 3:03 pm