// Html2Latex.java

/*
 * Copyright © 2018-2025 The CTAN Team and individual authors
 *
 * This file is distributed under the 3-clause BSD license.
 * See file LICENSE for details.
 */
package org.ctan.markup.html2latex;

import java.util.HashMap;
import java.util.Map;

import org.ctan.markup.Logos;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import lombok.NonNull;

/**
 * This class transforms HTML to LaTeX.
 *
 * <p>
 * The HTML input is parsed with jsoup and the nodes below the body are
 * traversed. For each node the start handler and the end handler registered
 * for the node name are invoked and their results are concatenated. A node
 * name without a registered rule leads to a {@link RuntimeException}.
 * </p>
 *
 * @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
 */
public class Html2Latex {

    /**
     * This interface describes the handler for processing a HTML node.
     */
    @FunctionalInterface
    public interface Handler {

        /**
         * Process a HTML node.
         *
         * @param node the HTML Node
         *
         * @return the text to be appended to the output for this node
         */
        String handle(Node node);
    }

    /**
     * The node visitor invoked during traversing the HTML tree. It collects
     * the results of the handlers in a buffer which can be retrieved with
     * {@link #toString()}.
     */
    private class LatexNodeVisitor implements NodeVisitor {

        /**
         * The field <code>buffer</code> contains the target buffer.
         */
        private final StringBuilder buffer = new StringBuilder();

        /**
         * Append the result of the start handler for the node.
         *
         * @see org.jsoup.select.NodeVisitor#head(org.jsoup.nodes.Node, int)
         */
        @Override
        public void head(Node node, int depth) {

            buffer.append(rule(node)[0].handle(node));
        }

        /**
         * Append the result of the end handler for the node.
         *
         * @see org.jsoup.select.NodeVisitor#tail(org.jsoup.nodes.Node, int)
         */
        @Override
        public void tail(Node node, int depth) {

            buffer.append(rule(node)[1].handle(node));
        }

        /**
         * Find the pair of handlers registered for the name of a node.
         *
         * @param node the node to find the handlers for
         * @return the array containing the start handler and the end handler
         * @throws RuntimeException in case that no rule is defined for the
         *     node name
         */
        private Handler[] rule(Node node) {

            var name = node.nodeName();
            var handlers = tags.get(name);
            if (handlers == null) {
                throw new RuntimeException("unknown tag `" + name + "´");
            }
            return handlers;
        }

        /**
         * Deliver everything collected so far.
         *
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {

            return buffer.toString();
        }
    }

    /**
     * This class is a Handler which encapsulates a String to be delivered.
     */
    public static class StringHandler implements Handler {

        /**
         * The field <code>value</code> contains the encapsulated String.
         */
        private final String value;

        /**
         * Creates a new object.
         *
         * @param value the string to return
         */
        public StringHandler(String value) {

            this.value = value;
        }

        /**
         * Deliver the encapsulated String; the node is ignored.
         *
         * @see org.ctan.markup.html2latex.Html2Latex.Handler#handle(org.jsoup.nodes.Node)
         */
        @Override
        public String handle(Node node) {

            return value;
        }
    }

    /**
     * The field <code>NOTHING</code> contains the handler which produces
     * nothing.
     */
    public static final Handler NOTHING = new StringHandler("");

    /**
     * The field <code>CLOSE_BRACE</code> contains the handler which produces a
     * closing curly brace.
     */
    public static final Handler CLOSE_BRACE = new StringHandler("}");

    /**
     * The field <code>NL</code> contains the handler which produces a newline.
     */
    public static final Handler NL = new StringHandler("\n");

    /**
     * The field <code>tags</code> contains the defined rules for HTML tags.
     * The key is the node name. The value is an array of two handlers: the
     * start handler at index 0 and the end handler at index 1.
     */
    private final Map<String, Handler[]> tags = new HashMap<>();

    /**
     * This is the constructor for <code>Html2Latex</code>. It registers the
     * rules for all supported node names.
     *
     * @param linkManager the link manager; it is informed about the package
     *     links and <code>xref</code> references seen during conversion
     */
    @SuppressFBWarnings(value = "CT_CONSTRUCTOR_THROW")
    public Html2Latex(@NonNull LinkManager linkManager) {

        define("#text", Html2Latex::convertText, NOTHING);
        define("#comment", Html2Latex::convertComment, NOTHING);
        define("a", (node) -> {
            var href = node.attr("href");
            if (href.startsWith("http://")
                || href.startsWith("https://")
                || href.startsWith("ftp://")) {
                return "\\href{" + href + "}{";
            }
            if (href.startsWith("/pkg/")) {
                linkManager.see(href.replaceFirst("^/pkg/", "ctan:pkg:"));
            }
            // anything else is taken as a link relative to the CTAN site
            return "\\href{https://ctan.org" + href + "}{";
        }, (node) -> {
            var href = node.attr("href");
            // package links additionally get a citation after the link text
            return (href.startsWith("/pkg/")
                ? "}\\cite{ctan:pkg:" + href.substring(5) + "}"
                : "}");
        });
        define("b", "\\textbf{", CLOSE_BRACE);
        define("body", NOTHING, NOTHING);
        define("br", "\\\\\n", NOTHING);
        define("center", "\\begin{center}", "\\end{center}\n");
        define("code", "\\texttt{", CLOSE_BRACE);
        define("div", "\n\n", "\n\n");
        define("dd", NOTHING, NOTHING);
        define("dl", "\\begin{description}", "\n\\end{description}\n");
        define("dt", "\n\\item[", "]");
        define("em", "\\emph{", CLOSE_BRACE);
        define("i", "\\textit{", CLOSE_BRACE);
        define("kbd", "\\texttt{", CLOSE_BRACE);
        define("li", "\\item ", NL);
        define("p", NOTHING, "\n\n");
        define("pre", "\\begin{verbatim}", "\\end{verbatim}\n");
        define("ol", "\\begin{enumerate}", "\\end{enumerate}\n");
        define("s", "\\sout{", CLOSE_BRACE);
        // \small is a declaration: it is confined by the group and needs the
        // trailing space to be separated from the following text
        define("small", "{\\small ", CLOSE_BRACE);
        define("span", "\\begingroup ", "\\endgroup ");
        // NOTE(review): strong is rendered like em and not like b -- confirm
        // that this is intended
        define("strong", "\\emph{", CLOSE_BRACE);
        // NOTE(review): the other rules used for tables are booktabs macros
        // where the top rule is named \toprule -- confirm that \headrule is
        // provided by the target document
        define("table",
            node -> "\\begin{tabular}{" + new TableAnalyzer().analyze(node)
                + "}\\headrule\n",
            "    \\bottomrule\n  \\end{tabular}");
        define("tbody", NOTHING, NOTHING);
        // cells are separated by & which is omitted before the first cell
        define("td", (node) -> (isFirst(node) ? "" : "&"), NOTHING);
        define("th", (node) -> (isFirst(node) ? "" : "&"), NOTHING);
        define("thead", NOTHING, NOTHING);
        define("tr", "    ",
            node -> (isTableHeader(node)
                ? "\\\\\\midrule\n"
                : "\\\\\n"));
        define("tt", "\\texttt{", CLOSE_BRACE);
        define("u", "\\uline{", CLOSE_BRACE);
        define("ul", "\\begin{itemize}", "\\end{itemize}\n");
        define("var", "\\textit{", CLOSE_BRACE);
        define("xref",
            node -> {
                // NOTE(review): the key is passed on without the prefix
                // "ctan:pkg:" used for package links in the rule for a --
                // confirm against the LinkManager
                linkManager.see(node.attr("refid"));
                return "\\href{https://ctan.org/pkg/" + node.attr("refid")
                    + "}{";
            },
            node -> "}");
    }

    /**
     * Convert HTML to LaTeX.
     *
     * <p>
     * The characters which are special in LaTeX are escaped in the complete
     * input before it is parsed; this includes the markup and the attribute
     * values. Finally the result is trimmed, lines consisting of spaces and
     * tabs only are emptied, and three or more newlines in a row are reduced
     * to two.
     * </p>
     *
     * @param in the HTML input as string
     * @return the resulting LaTeX code
     * @throws RuntimeException in case that the input contains a node for
     *     which no rule is defined
     */
    public String convert(@NonNull String in) {

        var escaped = escapeSpecials(in);
        var visitor = new LatexNodeVisitor();
        var doc = Jsoup.parse(escaped);
        doc.outputSettings(new OutputSettings().prettyPrint(false)
            .escapeMode(EscapeMode.xhtml));
        doc.select("body").traverse(visitor);
        return visitor.toString().trim()//
            .replaceAll("\n[ \t]+\n", "\n\n") //
            .replaceAll("[\n][\n][\n]+", "\n\n");
    }

    /**
     * Escape the characters which are special in LaTeX. The characters
     * <code>{ } # % &amp; _ $</code> are preceded by a backslash and the
     * backslash itself is replaced by a macro invocation.
     *
     * @param in the raw input
     * @return the input with the special characters escaped
     */
    private static String escapeSpecials(String in) {

        var buffer = new StringBuilder(in.length() + 16);
        for (int i = 0; i < in.length(); i++) {
            char c = in.charAt(i);
            switch (c) {
                case '{':
                case '}':
                case '#':
                case '%':
                case '&':
                case '_':
                case '$':
                    buffer.append('\\').append(c);
                    break;
                case '\\':
                    buffer.append("\\textbackslash{}");
                    break;
                default:
                    buffer.append(c);
            }
        }
        return buffer.toString();
    }

    /**
     * Produce the LaTeX code for a text node. A blank text node is delivered
     * as reported by {@link TextNode#text()}. Otherwise the names
     * {@code &quot;}, {@code &gt;}, and {@code &lt;} are replaced in the
     * whole text and the result is passed through
     * <code>Logos.text2latex</code>.
     *
     * @param node the text node
     * @return the LaTeX code
     */
    private static String convertText(Node node) {

        var text = (TextNode) node;
        if (text.isBlank()) {
            return text.text();
        }
        return Logos.text2latex(text.getWholeText()
            .replaceAll("&quot;", "\"")
            .replaceAll("&gt;", "\\$>\\$")
            .replaceAll("&lt;", "\\$<\\$"));
    }

    /**
     * Produce a LaTeX comment line for a HTML comment. The first two
     * characters and the last character of the comment data are dropped.
     * Comment data with less than three characters leads to an empty comment
     * line instead of an index error.
     *
     * @param node the comment node
     * @return the LaTeX comment line including the terminating newline
     */
    private static String convertComment(Node node) {

        var data = ((Comment) node).getData();
        // NOTE(review): presumably the dropped characters are delimiters of
        // the expected comment format -- confirm against the real input
        var content = data.length() >= 3
            ? data.substring(2, data.length() - 1)
            : "";
        return "% " + content + "\n";
    }

    /**
     * Define the expansion rule for a HTML tag. A rule defined earlier for
     * the same name is replaced.
     *
     * @param name the name of the tag
     * @param startHandler the handler for producing the start
     * @param endHandler the handler for producing the end
     */
    private void define(String name, Handler startHandler,
        Handler endHandler) {

        tags.put(name, new Handler[]{startHandler, endHandler});
    }

    /**
     * Define the expansion rule for a HTML tag.
     *
     * @param name the name of the tag
     * @param startHandler the handler for producing the start
     * @param end the text at the end
     */
    private void define(String name, Handler startHandler, String end) {

        define(name, startHandler, new StringHandler(end));
    }

    /**
     * Define the expansion rule for a HTML tag.
     *
     * @param name the name of the tag
     * @param start the start string
     * @param endHandler the handler for producing the end
     */
    private void define(String name, String start, Handler endHandler) {

        define(name, new StringHandler(start), endHandler);
    }

    /**
     * Define the expansion rule for a HTML tag.
     *
     * @param name the name of the tag
     * @param start the text at the start
     * @param end the text at the end
     */
    private void define(String name, String start, String end) {

        define(name, new StringHandler(start), new StringHandler(end));
    }

    /**
     * The method <code>isFirst</code> provides means to check whether the
     * previous nodes are only text.
     *
     * @param node the current node
     * @return {@code true} iff the node has no previous sibling or all
     *     previous siblings are text nodes
     */
    private static boolean isFirst(Node node) {

        var sibling = node.previousSibling();
        while (sibling != null) {
            if (!(sibling instanceof TextNode)) {
                return false;
            }
            sibling = sibling.previousSibling();
        }
        return true;
    }

    /**
     * The method <code>isTableHeader</code> provides means to identify a table
     * header. The decision is based on the first child which is not a text
     * node.
     *
     * @param node the current node
     * @return {@code true} iff the first child which is not a text node is a
     *     <code>th</code>
     */
    private static boolean isTableHeader(Node node) {

        for (var child : node.childNodes()) {
            if (!(child instanceof TextNode)) {
                return "th".equals(child.nodeName());
            }
        }
        return false;
    }
}