// HtmlSanitizer.java
/*
* Copyright © 2014-2025 The CTAN Team and individual authors
*
* This file is distributed under the 3-clause BSD license.
* See file LICENSE for details.
*/
package org.ctan.markup.markdown;
import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.ctan.markup.Tag;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import lombok.NonNull;
/**
 * This class filters HTML and allows only a restricted set of tags or
 * attributes to pass. The allowed constructs are defined within the
 * enumeration {@link Tag} and the properties of the constants defined there.
 * <p>
 * Input is pulled character by character from a {@link MarkdownScanner} and
 * the sanitized result is written to a {@link Writer}. Tags whose name is not
 * a constant of {@link Tag} are treated as syntax errors and pushed back to
 * the scanner.
 * </p>
 *
 * @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
 */
@SuppressFBWarnings(value = "EI_EXPOSE_REP2")
public class HtmlSanitizer {

    /**
     * The constant <code>ATTRIBUTE_IN_SINGLE</code> contains the pattern for
     * HTML attributes enclosed in single quotes. Group 1 is the attribute
     * name, group 2 the value without the quotes.
     */
    private static final Pattern ATTRIBUTE_IN_SINGLE = Pattern
        .compile("^\\s*([a-zA-Z]+)='([^']*)'");

    /**
     * The constant <code>ATTRIBUTE_IN_DOUBLE</code> contains the pattern for
     * HTML attributes enclosed in double quotes. Group 1 is the attribute
     * name, group 2 the value without the quotes.
     */
    private static final Pattern ATTRIBUTE_IN_DOUBLE = Pattern
        .compile("^\\s*([a-zA-Z]+)=\"([^\"]*)\"");

    /**
     * The constant <code>TAG_NAME_LENGTH</code> contains the maximal length of
     * the tag name.
     */
    private static final int TAG_NAME_LENGTH = 16;

    /**
     * The field <code>in</code> contains the input reader.
     */
    private final MarkdownScanner in;

    /**
     * The field <code>out</code> contains the target writer.
     */
    private final Writer out;

    /**
     * The field <code>base</code> contains the base URL. It is handed to
     * {@link Tag} when a start tag is written and may be {@code null}.
     */
    private String base = null;

    /**
     * This is the constructor for <code>HtmlSanitizer</code>.
     *
     * @param in the input reader; not {@code null}
     * @param out the target writer; not {@code null}
     */
    @SuppressFBWarnings(value = "CT_CONSTRUCTOR_THROW")
    public HtmlSanitizer(@NonNull MarkdownScanner in, @NonNull Writer out) {
        this.in = in;
        this.out = out;
    }

    /**
     * This method matches the beginning of the argument string against the
     * pattern for the definition of an attribute. The attribute can be
     * enclosed either in single or in double quotes.
     *
     * <pre>
     * attribute="value"
     * attribute='value'
     * </pre>
     *
     * Only a prefix of the string has to match; anything following the
     * attribute is left for the next round.
     *
     * @param s the string to match
     *
     * @return a matcher which can be queried for the success status with
     *         {@link Matcher#lookingAt()}. In case of success the group 1
     *         contains the attribute name and group 2 the attribute value
     *         (without delimiters) and {@link Matcher#end()} is the offset
     *         of the first character after the attribute
     */
    private static Matcher matchAttribute(String s) {
        var doubleQuoted = ATTRIBUTE_IN_DOUBLE.matcher(s);
        // lookingAt() anchors at the start but, unlike matches(), does not
        // require the whole remainder to be consumed.
        if (doubleQuoted.lookingAt()) {
            return doubleQuoted;
        }
        return ATTRIBUTE_IN_SINGLE.matcher(s);
    }

    /**
     * This method collects all leading attribute definitions of the argument
     * string. Parsing stops at the first position which does not start a
     * well-formed quoted attribute; the rest of the string is ignored. If an
     * attribute name occurs more than once the last value wins.
     *
     * @param s the text between the tag name and the end of the tag
     *
     * @return the mapping from attribute names to attribute values
     */
    private static Map<String, String> parseAttributes(String s) {
        Map<String, String> attributes = new HashMap<>();
        var rest = s;
        for (var m = matchAttribute(rest); m.lookingAt(); m =
                matchAttribute(rest)) {
            attributes.put(m.group(1), m.group(2));
            rest = rest.substring(m.end());
        }
        return attributes;
    }

    /**
     * This method reads in the name of a tag. Letters and digits are stored
     * in the target buffer. At most {@link #TAG_NAME_LENGTH} characters are
     * slurped in.
     * <p>
     * Comments ({@code <!-- ... -->}), DOCTYPE declarations and processing
     * instructions ({@code <? ... ?>}) are consumed completely and reported
     * with the return value -2. For any other construct starting with
     * {@code !} an underscore is stored in the buffer in front of the name.
     * </p>
     *
     * @param c the character already read
     * @param buffer the target buffer
     *
     * @return the first character not added to the name, or -1 on EOF, or -2
     *         when a comment, DOCTYPE, or processing instruction has been
     *         skipped
     *
     * @throws IOException in case of an I/O error
     */
    private int readTag(int c, StringBuilder buffer) throws IOException {
        if (c == '!') {
            if (in.expect('-', '-')) {
                // comment: drop everything up to the closing marker or EOF
                while (c >= 0 && !in.expect('-', '-', '>')) {
                    c = in.read();
                }
                return -2;
            } else if (in.expect('D', 'O', 'C', 'T', 'Y', 'P', 'E')
                    || in.expect('d', 'o', 'c', 't', 'y', 'p', 'e')) {
                // DOCTYPE: drop everything up to and including '>' or EOF
                while (c >= 0 && c != '>') {
                    c = in.read();
                }
                // NOTE(review): the result of skipWhiteSpace(0) is discarded
                // here while the SKIP_TAG handling in sanitizeHtmlTag ungets
                // it -- confirm that no character is lost at this place.
                in.skipWhiteSpace(0);
                return -2;
            }
            // Presumably the prefix selects specially named Tag constants --
            // TODO confirm against the Tag enumeration.
            buffer.append('_');
            c = in.read();
        } else if (c == '?') {
            // processing instruction: drop everything up to "?>" or EOF.
            // Unlike the DOCTYPE branch no white-space is skipped afterwards.
            while (c >= 0 && !in.expect('?', '>')) {
                c = in.read();
            }
            return -2;
        }
        var remaining = TAG_NAME_LENGTH;
        // The length check comes last such that the budget is only reduced
        // for characters which belong to the name.
        while (c >= 0 && (Character.isLetter(c) || Character.isDigit(c))
                && remaining-- > 0) {
            buffer.append((char) c);
            c = in.read();
        }
        return c;
    }

    /**
     * This method writes the end tags for all tags which are still open and
     * empties the stack.
     *
     * @param stack the stack of open tags
     *
     * @return {@code true} iff at least one tag had to be closed
     *
     * @throws IOException in case of an I/O error
     */
    private boolean closeOpenTags(Stack<Tag> stack) throws IOException {
        var closedAny = false;
        while (!stack.isEmpty()) {
            stack.pop().end(out);
            closedAny = true;
        }
        return closedAny;
    }

    /**
     * This method scans for embedded HTML and passes on only allowed tags and
     * attributes. It stops after the first completed tag, i.e. as soon as a
     * tag has been processed and no tag is open any more. Text outside of
     * tags is copied to the output; a {@code >} in the text is written as
     * the entity {@code &gt;}.
     * <p>
     * When a syntax error is detected or the input ends while tags are still
     * open then the end tags for all open tags are written.
     * </p>
     *
     * @return {@code true} iff everything went right
     *
     * @throws IOException in case of an I/O error
     */
    public boolean sanitize() throws IOException {
        var stack = new Stack<Tag>();
        for (int c = in.read(); c >= 0; c = in.read()) {
            switch (c) {
                case '<':
                    if (sanitizeHtmlTag(stack)) {
                        // syntax error: leave balanced output behind
                        closeOpenTags(stack);
                        return false;
                    }
                    if (stack.isEmpty()) {
                        return true;
                    }
                    break;
                case '>':
                    // a '>' outside of a tag is text and thus is written as
                    // the escaped entity (ampersand, g, t, semicolon)
                    out.write("&gt;");
                    break;
                default:
                    out.write(c);
            }
        }
        // EOF: any tag which is still open makes the result a failure
        return !closeOpenTags(stack);
    }

    /**
     * This method reads the complete input stream and processes all tags until
     * the EOF is reached. On syntax errors part of the input is skipped.
     * <p>
     * NOTE(review): the result is the disjunction of the results of
     * {@link #sanitize()}, which reports success. This method formerly was
     * documented to return {@code true} iff a syntax error has been detected.
     * The code has been left untouched to not break callers -- confirm which
     * contract is intended.
     * </p>
     *
     * @return {@code true} iff at least one invocation of {@link #sanitize()}
     *         returned {@code true}
     *
     * @throws IOException in case of an I/O error
     */
    public boolean sanitizeAll() throws IOException {
        boolean result = false;
        while (in.lookahead() >= 0) {
            result = sanitize() || result;
        }
        return result;
    }

    /**
     * This method parses a single HTML tag and generates output if it is
     * valid. The opening {@code <} has already been consumed by the caller.
     * <p>
     * On a syntax error the characters read are pushed back to the scanner
     * and nothing is written. Comments, DOCTYPE declarations, processing
     * instructions, and tags of type {@code SKIP_TAG} are dropped silently.
     * </p>
     *
     * @param stack the HTML stack; a start tag which has been opened is
     *        pushed and an end tag pops the topmost element
     *
     * @return {@code true} iff a syntax error has been detected
     *
     * @throws IOException in case of an I/O error
     */
    private boolean sanitizeHtmlTag(Stack<Tag> stack) throws IOException {
        var buffer = new StringBuilder();
        int c = in.read();
        boolean atEnd = (c == '/');
        // remembered for pushing the '/' back in case of an error
        int endChar = (atEnd ? c : -1);
        c = readTag((atEnd ? in.read() : c), buffer);
        if (c == -2) {
            // comment or the like has been consumed; nothing to write
            return false;
        }
        var t = buffer.toString();
        Tag tag;
        try {
            // Locale.ROOT keeps the lookup independent of the default locale
            // (e.g. the Turkish dotted capital I would break "img").
            tag = Tag.valueOf(t.toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException e) {
            // not an allowed tag: restore the input and report the error
            in.unget(c, buffer, endChar);
            return true;
        }

        // Collect the raw attribute text up to an unquoted '/' or '>'.
        // A newline or EOF terminates the scan even within quotes.
        buffer = new StringBuilder();
        var quote = 0; // the currently open quote character or 0
        for (; c >= 0 && c != '\n'
                && (quote != 0 || (c != '/' && c != '>')); c = in.read()) {
            buffer.append((char) c);
            if (c == '"' || c == '\'') {
                if (quote == c) {
                    quote = 0;
                } else if (quote == 0) {
                    quote = c;
                }
            }
        }

        var autoClose = false;
        if (c == '/') {
            // "/>" is acceptable for start tags only
            if (atEnd || !in.expectAfterSpace('>')) {
                in.unget(c, buffer, t, endChar);
                return true;
            }
            autoClose = true;
        } else if (c < 0 || c == '\n') {
            // the tag has not been terminated on this line
            in.unget(c, buffer, t, endChar);
            return true;
        }

        switch (tag.getType()) {
            case SKIP_TAG:
                if (!atEnd) {
                    in.unget(in.skipWhiteSpace(0));
                }
                return false;
            case SELF_CLOSING:
                autoClose = true;
                break;
            case PASS:
            default:
                // done
        }

        if (atEnd) {
            // The end tag is written and the topmost open tag is popped
            // without checking that it is the same tag.
            tag.end(out);
            if (!stack.isEmpty()) {
                stack.pop();
            }
            return false;
        }

        var attributes = parseAttributes(buffer.toString());
        if (autoClose) {
            tag.writeSave(out, attributes);
        } else if (tag.startSave(out, base, attributes)) {
            // only tags reported as opened need an end tag later on
            stack.push(tag);
        }
        return false;
    }

    /**
     * This is the setter for <code>base</code>.
     *
     * @param base the new value for base
     */
    public void setBase(String base) {
        this.base = base;
    }
}