// Html2Latex.java
/*
* Copyright © 2018-2025 The CTAN Team and individual authors
*
* This file is distributed under the 3-clause BSD license.
* See file LICENSE for details.
*/
package org.ctan.markup.html2latex;
import java.util.HashMap;
import java.util.Map;
import org.ctan.markup.Logos;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import lombok.NonNull;
/**
* This class transforms HTML to LaTeX.
*
* @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
*/
public class Html2Latex {
/**
 * This interface describes the handler for processing a HTML node.
 * <p>
 * It declares a single abstract method, so an implementation can be given
 * as a lambda expression.
 * </p>
 */
@FunctionalInterface
public interface Handler {
    /**
     * Process a HTML node.
     *
     * @param node the HTML node
     *
     * @return the result
     */
    String handle(Node node);
}
/**
 * The node visitor which collects the output while the HTML tree is
 * traversed. For each visited node the pair of handlers registered under
 * the node name is looked up in <code>tags</code>; the first handler is
 * applied when the node is entered and the second one when it is left.
 */
private class LatexNodeVisitor implements NodeVisitor {

    /**
     * The field <code>buffer</code> accumulates the results of all handlers
     * in the order of their invocation.
     */
    private final StringBuilder buffer = new StringBuilder();

    /**
     * Append the result of the start handler for the node.
     *
     * @throws RuntimeException if no handlers are registered for the node
     *     name
     *
     * @see org.jsoup.select.NodeVisitor#head(org.jsoup.nodes.Node, int)
     */
    @Override
    public void head(Node node, int depth) {
        final String tagName = node.nodeName();
        final Handler[] handlers = tags.get(tagName);
        if (handlers == null) {
            throw new RuntimeException("unknown tag `" + tagName + "´");
        }
        final Handler start = handlers[0];
        buffer.append(start.handle(node));
    }

    /**
     * Append the result of the end handler for the node.
     *
     * @see org.jsoup.select.NodeVisitor#tail(org.jsoup.nodes.Node, int)
     */
    @Override
    public void tail(Node node, int depth) {
        final Handler[] handlers = tags.get(node.nodeName());
        final Handler end = handlers[1];
        buffer.append(end.handle(node));
    }

    /**
     * Deliver everything collected so far.
     *
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return buffer.toString();
    }
}
/**
 * This class is a Handler which always delivers the same String,
 * independent of the node it is applied to.
 */
public static class StringHandler implements Handler {

    /**
     * The field <code>value</code> contains the String to be delivered.
     */
    private final String value;

    /**
     * Creates a new object.
     *
     * @param value the string to return from {@link #handle(Node)}
     */
    public StringHandler(String value) {
        this.value = value;
    }

    /**
     * Deliver the encapsulated String; the node is ignored.
     *
     * @see org.ctan.markup.html2latex.Html2Latex.Handler#handle(org.jsoup.nodes.Node)
     */
    @Override
    public String handle(Node node) {
        return this.value;
    }
}
/**
 * The field <code>NOTHING</code> contains the handler which produces the
 * empty string.
 */
public static final Handler NOTHING = new StringHandler("");

/**
 * The field <code>CLOSE_BRACE</code> contains the handler which produces a
 * single closing curly brace.
 */
public static final Handler CLOSE_BRACE = new StringHandler("}");

/**
 * The field <code>NL</code> contains the handler which produces a single
 * newline character.
 */
public static final Handler NL = new StringHandler("\n");

/**
 * The field <code>tags</code> maps a node name to a two-element array of
 * handlers: index 0 holds the start handler and index 1 the end handler.
 */
private final Map<String, Handler[]> tags = new HashMap<>();
// /**
// * The field <code>linkManager</code> contains the link manager.
// */
// private LinkManager linkManager;
/**
* This is the constructor for <code>Html2Latex</code>.
*
* @param linkManager the link manager
*/
@SuppressFBWarnings(value = "CT_CONSTRUCTOR_THROW")
public Html2Latex(@NonNull LinkManager linkManager) {
// this.linkManager = linkManager;
define("#text", (node) -> {
var t = (TextNode) node;
if (t.isBlank()) {
return t.text();
}
return Logos.text2latex(t.getWholeText()
.replaceAll(""", "\"")
.replaceAll(">", "\\$>\\$")
.replaceAll("<", "\\$<\\$"));
}, NOTHING);
define("#comment", (node) -> {
var s = ((Comment) node).getData();
return "% " + s.substring(2, s.length() - 1) + "\n";
}, NOTHING);
define("a", (node) -> {
var href = node.attr("href");
if (href.startsWith("http://")
|| href.startsWith("https://")
|| href.startsWith("ftp://")) {
return "\\href{" + href + "}{";
}
if (href.startsWith("/pkg/")) {
linkManager.see(href.replaceFirst("^/pkg/", "ctan:pkg:"));
}
return "\\href{https://ctan.org" + href + "}{";
}, (node) -> {
var href = node.attr("href");
return (href.startsWith("/pkg/")
? "}\\cite{ctan:pkg:" + href.substring(5) + "}"
: "}");
});
define("b", "\\textbf{", CLOSE_BRACE);
define("body", NOTHING, NOTHING);
define("br", "\\\\\n", NOTHING);
define("center", "\\begin{center}", "\\end{center}\n");
define("code", "\\texttt{", CLOSE_BRACE);
define("div", "\n\n", "\n\n");
define("dd", NOTHING, NOTHING);
define("dl", "\\begin{description}", "\n\\end{description}\n");
define("dt", "\n\\item[", "]");
define("em", "\\emph{", CLOSE_BRACE);
define("i", "\\textit{", CLOSE_BRACE);
define("kbd", "\\texttt{", CLOSE_BRACE);
define("li", "\\item ", NL);
define("p", NOTHING, "\n\n");
define("pre", "\\begin{verbatim}", "\\end{verbatim}\n");
define("ol", "\\begin{enumerate}", "\\end{enumerate}\n");
define("s", "\\sout{", CLOSE_BRACE);
define("small", "{\\ small", CLOSE_BRACE);
define("span", "\\begingroup ", "\\endgroup ");
define("strong", "\\emph{", CLOSE_BRACE);
define("table",
node -> "\\begin{tabular}{" + new TableAnalyzer().analyze(node)
+ "}\\headrule\n",
" \\bottomrule\n \\end[tabular}");
define("tbody", NOTHING, NOTHING);
define("td", (node) -> (isFirst(node) ? "" : "&"), NOTHING);
define("th", (node) -> (isFirst(node) ? "" : "&"), NOTHING);
define("thead", NOTHING, NOTHING);
define("tr", " ",
node -> (isTableHeader(node)
? "\\\\\\midrule\n"
: "\\\\\n"));
define("tt", "\\texttt{", CLOSE_BRACE);
define("u", "\\uline{", CLOSE_BRACE);
define("ul", "\\begin{itemize}", "\\end{itemize}\n");
define("var", "\\textit{", CLOSE_BRACE);
define("xref",
node -> {
linkManager.see(node.attr("refid"));
return "\\href{https://ctan.org/pkg/" + node.attr("refid")
+ "}{";
},
node -> {
return "}";
});
}
/**
 * Convert HTML to LaTeX.
 * <p>
 * The input is processed in three steps: the characters
 * <code>{ } # % &amp; _ $</code> are prefixed with a backslash and each
 * backslash is replaced by <code>\textbackslash{}</code>; the result is
 * parsed and the body is traversed with the registered handlers; finally
 * the output is trimmed, whitespace-only lines are emptied, and runs of
 * three or more newlines are reduced to two.
 * </p>
 *
 * @param in the HTML input as string
 * @return the resulting LaTeX code
 */
public String convert(@NonNull String in) {
    final String toEscape = "{}#%&_$";
    var escaped = new StringBuilder();
    for (int i = 0; i < in.length(); i++) {
        char ch = in.charAt(i);
        if (ch == '\\') {
            escaped.append("\\textbackslash{}");
        } else if (toEscape.indexOf(ch) >= 0) {
            escaped.append('\\').append(ch);
        } else {
            escaped.append(ch);
        }
    }

    var doc = Jsoup.parse(escaped.toString());
    var settings = new OutputSettings() //
        .prettyPrint(false) //
        .escapeMode(EscapeMode.xhtml);
    doc.outputSettings(settings);

    var visitor = new LatexNodeVisitor();
    doc.select("body").traverse(visitor);

    var latex = visitor.toString().trim();
    latex = latex.replaceAll("\n[ \t]+\n", "\n\n");
    return latex.replaceAll("[\n][\n][\n]+", "\n\n");
}
/**
 * Register the pair of handlers for a HTML tag. An earlier registration
 * for the same name is replaced.
 *
 * @param name the name of the tag
 * @param startHandler the handler for producing the start
 * @param endHandler the handler for producing the end
 */
private void define(String name, Handler startHandler,
        Handler endHandler) {
    final Handler[] pair = {startHandler, endHandler};
    tags.put(name, pair);
}
/**
 * Register the expansion rule for a HTML tag with a computed start and a
 * constant end.
 *
 * @param name the name of the tag
 * @param startHandler the handler for producing the start
 * @param end the text at the end
 */
private void define(String name, Handler startHandler, String end) {
    final Handler endHandler = new StringHandler(end);
    define(name, startHandler, endHandler);
}
/**
 * Register the expansion rule for a HTML tag with a constant start and a
 * computed end.
 *
 * @param name the name of the tag
 * @param start the start string
 * @param endHandler the handler for producing the end
 */
private void define(String name, String start, Handler endHandler) {
    final Handler startHandler = new StringHandler(start);
    define(name, startHandler, endHandler);
}
/**
 * Register the expansion rule for a HTML tag with a constant start and a
 * constant end.
 *
 * @param name the name of the tag
 * @param start the text at the start
 * @param end the text at the end
 */
private void define(String name, String start, String end) {
    final Handler startHandler = new StringHandler(start);
    final Handler endHandler = new StringHandler(end);
    define(name, startHandler, endHandler);
}
/**
 * The method <code>isFirst</code> provides means to check whether all
 * preceding siblings of a node are text nodes.
 *
 * @param node the current node
 * @return {@code true} iff the node has no preceding sibling other than
 *     text nodes
 */
private boolean isFirst(Node node) {
    Node sibling = node.previousSibling();
    while (sibling != null) {
        if (!(sibling instanceof TextNode)) {
            return false;
        }
        sibling = sibling.previousSibling();
    }
    return true;
}
/**
 * The method <code>isTableHeader</code> provides means to identify a table
 * header. Only the first child which is not a text node is inspected.
 *
 * @param node the current node
 * @return {@code true} iff the first child which is not a text node is a
 *     <code>th</code>
 */
private boolean isTableHeader(Node node) {
    for (var child : node.childNodes()) {
        if (!(child instanceof TextNode)) {
            // The first non-text child decides.
            return "th".equals(child.nodeName());
        }
    }
    return false;
}
}