HtmlSanitizer.java

/*
 * Copyright © 2014-2025 The CTAN Team and individual authors
 *
 * This file is distributed under the 3-clause BSD license.
 * See file LICENSE for details.
 */
package org.ctan.markup.markdown;

import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.ctan.markup.Tag;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import lombok.NonNull;

/**
 * This class filters HTML and allows only a restricted set of tags or
 * attributes to pass. The allowed constructs are defined within the enumeration
 * {@link Tag} and the properties of the constants defined there.
 *
 * <p>
 * The input is read from a {@link MarkdownScanner} and the filtered result is
 * written to a {@link Writer}. Tags whose name is not a constant of {@link Tag}
 * are treated as syntax errors and their text is pushed back into the scanner.
 * </p>
 *
 * @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
 */
@SuppressFBWarnings(value = "EI_EXPOSE_REP2")
public class HtmlSanitizer {

    /**
     * The constant <code>ATTRIBUTE_IN_SINGLE</code> contains the pattern for
     * HTML attributes enclosed in single quotes.
     */
    private static final Pattern ATTRIBUTE_IN_SINGLE = Pattern
        .compile("^\\s*([a-zA-Z]+)='([^']*)'");

    /**
     * The constant <code>ATTRIBUTE_IN_DOUBLE</code> contains the pattern for
     * HTML attributes enclosed in double quotes.
     */
    private static final Pattern ATTRIBUTE_IN_DOUBLE = Pattern
        .compile("^\\s*([a-zA-Z]+)=\"([^\"]*)\"");

    /**
     * The constant <code>TAG_NAME_LENGTH</code> contains the maximal length of
     * the tag name.
     */
    private static final int TAG_NAME_LENGTH = 16;

    /**
     * The constant <code>SKIPPED</code> contains the pseudo character code
     * returned by {@link #readTag(int, StringBuilder)} when a comment, a
     * doctype declaration, or a processing instruction has been consumed.
     */
    private static final int SKIPPED = -2;

    /**
     * The field <code>in</code> contains the input reader.
     */
    private final MarkdownScanner in;

    /**
     * The field <code>out</code> contains the target writer.
     */
    private final Writer out;

    /**
     * The field <code>base</code> contains the base URL. It is passed on to
     * {@link Tag} when an opening tag is written.
     */
    private String base = null;

    /**
     * This is the constructor for <code>HtmlSanitizer</code>.
     *
     * @param in the input reader; not {@code null}
     * @param out the target writer; not {@code null}
     */
    @SuppressFBWarnings(value = "CT_CONSTRUCTOR_THROW")
    public HtmlSanitizer(@NonNull MarkdownScanner in, @NonNull Writer out) {

        this.in = in;
        this.out = out;
    }

    /**
     * This method matches the beginning of the argument string against the
     * pattern for the definition of an attribute. The attribute can be enclosed
     * either in single or in double quotes.
     *
     * <pre>
     *  attribute="value"
     *  attribute='value'
     * </pre>
     *
     * The match is a prefix match: text following the attribute is ignored, so
     * that the caller can strip the matched prefix and continue with the next
     * attribute.
     *
     * @param s the string to match
     *
     * @return a matcher which can be queried for the success status with
     *     {@link Matcher#lookingAt()}. In case of success the group 1 contains
     *     the attribute name and group 2 the attribute value (without
     *     delimiters)
     */
    private Matcher matchAttribute(String s) {

        var matcher = ATTRIBUTE_IN_DOUBLE.matcher(s);
        // lookingAt() anchors at the start only; matches() would require the
        // attribute to be the complete string and fail on any trailing text
        if (matcher.lookingAt()) {
            return matcher;
        }
        return ATTRIBUTE_IN_SINGLE.matcher(s);
    }

    /**
     * This method splits the raw text between the tag name and the closing
     * delimiter into attribute name/value pairs. Parsing stops at the first
     * position which does not start with a well-formed quoted attribute; the
     * remainder is dropped.
     *
     * @param s the raw attribute text
     *
     * @return the map of the attributes found; later definitions of the same
     *     name replace earlier ones
     */
    private Map<String, String> parseAttributes(String s) {

        Map<String, String> attributes = new HashMap<>();
        var rest = s;
        // each successful match consumes at least the name, '=' and two
        // quotes, thus the loop always makes progress
        for (var m = matchAttribute(rest); m.lookingAt(); m =
            matchAttribute(rest)) {
            attributes.put(m.group(1), m.group(2));
            rest = rest.substring(m.end());
        }
        return attributes;
    }

    /**
     * This method reads in the name of a tag. The letters and digits are
     * stored in the target buffer. At most {@link #TAG_NAME_LENGTH} characters
     * are slurped in. Comments, doctype declarations, and processing
     * instructions are consumed and signaled with a special return value. Any
     * other construct starting with <code>!</code> gets the character
     * <code>_</code> prepended to the name.
     *
     * @param c the character already read
     * @param buffer the target buffer
     *
     * @return the first character not belonging to the name, or -1 on EOF, or
     *     -2 when a comment, a doctype declaration, or a processing
     *     instruction has been encountered
     *
     * @throws IOException in case of an I/O error
     */
    private int readTag(int c, StringBuilder buffer) throws IOException {

        var n = TAG_NAME_LENGTH;
        if (c == '!') {
            if (in.expect('-', '-')) {
                // comment: drop everything up to the closing delimiter
                for (; c >= 0 && !in.expect('-', '-', '>'); c = in.read()) {
                    // skip
                }
                return SKIPPED;
            } else if (in.expect('D', 'O', 'C', 'T', 'Y', 'P', 'E')
                || in.expect('d', 'o', 'c', 't', 'y', 'p', 'e')) {
                // doctype: drop everything up to the next '>'
                for (; c >= 0 && c != '>'; c = in.read()) {
                    // skip
                }
                in.skipWhiteSpace(0);
                return SKIPPED;
            }
            buffer.append('_');
            c = in.read();
        } else if (c == '?') {
            // processing instruction: drop everything up to "?>"
            for (; c >= 0 && !in.expect('?', '>'); c = in.read()) {
                // skip
            }
            return SKIPPED;
        }
        for (; c >= 0 && (Character.isLetter(c) || Character.isDigit(c))
            && n-- > 0; c = in.read()) {
            buffer.append((char) c);
        }
        return c;
    }

    /**
     * This method scans for embedded HTML and passes on only allowed tags and
     * attributes. Text outside of tags is copied to the output where
     * <code>&gt;</code> is replaced by its entity. The method stops after the
     * first completed tag, i.e. as soon as a tag has been processed and no
     * opened tag is left on the stack.
     *
     * <p>
     * When a syntax error is detected or the end of the input is reached then
     * all tags still open are closed on the output.
     * </p>
     *
     * @return {@code true} iff everything went right, i.e. no syntax error
     *     has been detected and no tag has been left open at the end of the
     *     input
     *
     * @throws IOException in case of an I/O error
     */
    public boolean sanitize() throws IOException {

        var stack = new Stack<Tag>();
        for (var c = in.read(); c >= 0; c = in.read()) {
            switch (c) {
                case '<':
                    if (sanitizeHtmlTag(stack)) {
                        // syntax error: close whatever is open and give up
                        while (!stack.isEmpty()) {
                            stack.pop().end(out);
                        }
                        return false;
                    }
                    if (stack.isEmpty()) {
                        return true;
                    }
                    break;
                case '>':
                    out.write("&gt;");
                    break;
                default:
                    out.write(c);
            }
        }
        // EOF: any tag still open makes the result a failure
        boolean result = true;
        while (!stack.isEmpty()) {
            stack.pop().end(out);
            result = false;
        }
        return result;
    }

    /**
     * This method reads the complete input stream and processes all tags until
     * the EOF is reached. On syntax errors part of the input is skipped.
     *
     * <p>
     * NOTE(review): the result is the disjunction of the results of
     * {@link #sanitize()}, which reports {@code true} on success. Earlier
     * documentation of this method promised {@code true} iff a syntax error
     * has been detected; the callers should be checked before either the code
     * or this description is changed.
     * </p>
     *
     * @return {@code true} iff at least one invocation of {@link #sanitize()}
     *     returned {@code true}
     *
     * @throws IOException in case of an I/O error
     */
    public boolean sanitizeAll() throws IOException {

        boolean result = false;
        while (in.lookahead() >= 0) {
            result = sanitize() || result;
        }
        return result;
    }

    /**
     * This method parses a single HTML tag and generates output if it is valid.
     * The opening <code>&lt;</code> has already been consumed by the caller.
     *
     * <p>
     * On a syntax error (unknown tag name, end of line or end of input within
     * the tag, misplaced <code>/</code>) the characters read so far are pushed
     * back into the scanner and nothing is written.
     * </p>
     *
     * @param stack the HTML stack of the tags currently open
     *
     * @return {@code true} iff a syntax error has been detected
     *
     * @throws IOException in case of an I/O error
     */
    private boolean sanitizeHtmlTag(Stack<Tag> stack) throws IOException {

        var buffer = new StringBuilder();
        int c = in.read();
        boolean atEnd = (c == '/');
        int endChar = (atEnd ? c : -1);
        c = readTag((atEnd ? in.read() : c), buffer);
        if (c == SKIPPED) {
            return false;
        }
        var t = buffer.toString();
        Tag tag;
        try {
            // Locale.ROOT keeps the lookup independent of the default locale
            // (e.g. the dotted capital I of the Turkish locale)
            tag = Tag.valueOf(t.toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException e) {
            // not a known tag
            in.unget(c, buffer, endChar);
            return true;
        }
        // collect the raw attribute text up to an unquoted '/' or '>'
        buffer = new StringBuilder();
        var quote = 0;
        for (; c >= 0 && c != '\n'
            && (quote != 0 || (c != '/' && c != '>')); c = in.read()) {
            buffer.append((char) c);
            if (c == '"' || c == '\'') {
                if (quote == c) {
                    quote = 0;
                } else if (quote == 0) {
                    quote = c;
                }
            }
        }
        var autoClose = false;
        if (c == '/') {
            if (atEnd || !in.expectAfterSpace('>')) {
                in.unget(c, buffer, t, endChar);
                return true;
            }
            autoClose = true;
        } else if (c < 0 || c == '\n') {
            // the tag has not been terminated on this line
            in.unget(c, buffer, t, endChar);
            return true;
        }
        switch (tag.getType()) {
            case SKIP_TAG:
                if (!atEnd) {
                    in.unget(in.skipWhiteSpace(0));
                }
                return false;
            case SELF_CLOSING:
                autoClose = true;
                break;
            case PASS:
            default:
                // done
        }
        if (atEnd) {
            tag.end(out);
            if (!stack.isEmpty()) {
                stack.pop();
            }
            return false;
        }
        var attributes = parseAttributes(buffer.toString());
        if (autoClose) {
            tag.writeSave(out, attributes);
        } else if (tag.startSave(out, base, attributes)) {
            stack.push(tag);
        }
        return false;
    }

    /**
     * This is the setter for <code>base</code>.
     *
     * @param base the new value for base
     */
    public void setBase(String base) {

        this.base = base;
    }
}