// MarkdownScanner.java

/*
 * Copyright © 2014-2025 The CTAN Team and individual authors
 *
 * This file is distributed under the 3-clause BSD license.
 * See file LICENSE for details.
 */

package org.ctan.markup.markdown;

import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;

/**
 * This class is the reader for the markdown parser. It contains additional
 * methods to support this task.
 *
 * <p>
 * This reader applies a filter operation to map the line endings \r, \n, and
 * \r\n to \n. Thus all major types of line endings are treated the same. The
 * mapping is applied in the single-character {@link #read()} only; all
 * scanning methods of this class read through it.
 * </p>
 *
 * <p>
 * Many methods restore the input on failure via the push-back buffer. The
 * amount of text which can be restored is limited by the buffer size passed
 * to the constructor.
 * </p>
 *
 * @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
 */
public class MarkdownScanner extends PushbackReader {

    /**
     * This is the constructor for <code>MarkdownScanner</code>.
     *
     * @param in the input reader
     * @param size the push-back buffer size
     */
    public MarkdownScanner(Reader in, int size) {

        super(in, size);
    }

    /**
     * This method checks that the lookahead of several characters are identical
     * to the expected characters. Then these are digested. Otherwise the
     * characters are pushed back into the input.
     *
     * @param s the expected characters
     *
     * @return {@code true} iff all expected characters have been found
     *
     * @throws IOException in case of an I/O error
     */
    public boolean expect(int... s) throws IOException {

        for (var i = 0; i < s.length; i++) {
            var cc = read();
            if (cc != s[i]) {
                // restore the offending character first and then the
                // already matched prefix in reverse order
                unget(cc);
                for (var j = i - 1; j >= 0; j--) {
                    unread(s[j]);
                }
                return false;
            }
        }

        return true;
    }

    /**
     * This method skips spaces and checks for an expected character. If the
     * character is found then it is skipped as well. Otherwise the character
     * found is pushed back to the input stream. The skipped spaces are not
     * restored.
     *
     * @param cc the expected character
     *
     * @return {@code true} iff the expected character has been found
     *
     * @throws IOException in case of an I/O error
     */
    public boolean expectAfterSpace(char cc) throws IOException {

        var c = readNonSpace();
        if (c == cc) {
            return true;
        }
        unget(c);
        return false;
    }

    /**
     * This method checks that the line contains only whitespace and at least 2
     * instances of the given character. On success the line including its
     * terminating newline is digested. Otherwise everything read is pushed
     * back to the input.
     *
     * @param cc the character which has to occur at least twice
     *
     * @return {@code true} iff the line has the expected form
     *
     * @throws IOException in case of an I/O error
     */
    public boolean expectLineWith(int cc) throws IOException {

        var n = 0;
        var buffer = new StringBuilder();

        for (var c = read(); c >= 0; c = read()) {
            buffer.append((char) c);
            if (c == cc) {
                n++;
            } else if (c == '\n') {
                // The newline is already part of the buffer. Thus it must not
                // be pushed back separately or it would show up twice.
                break;
            } else if (!Character.isWhitespace(c)) {
                unget(buffer);
                return false;
            }
        }
        if (n < 2) {
            unget(buffer);
            return false;
        }
        return true;
    }

    /**
     * The method <code>expectNumberPeriod</code> skips spaces and checks for at
     * most 9 digits followed by a period and an optional space. If the
     * characters are found then they are skipped as well. Otherwise the
     * digits found &ndash; including the digit passed in &ndash; are pushed
     * back to the input stream. Skipped spaces are not restored.
     *
     * @param cc the digit already read; a value of 0 or less indicates that no
     *        digit has been read and leading white-space should be skipped
     * @return the number found or {@code null}
     * @throws IOException in case of an I/O error
     */
    public String expectNumberPeriod(int cc) throws IOException {

        var buffer = new StringBuilder();
        if (cc > 0) {
            buffer.append((char) cc);
        } else {
            skipWhiteSpace(0); // TODO count spaces
        }
        // Read via read() and not via the wrapped reader: the wrapped reader
        // would bypass the push-back buffer (which holds the character
        // restored by skipWhiteSpace()) and the line-end normalisation.
        int c;
        for (c = read(); Character.isDigit(c); c = read()) {
            buffer.append((char) c);
        }

        unget(c);
        if (buffer.length() > 0 && buffer.length() <= 9 && expect('.')) {
            expect(' ');
            return buffer.toString();
        }
        unget(buffer);
        return null;
    }

    /**
     * This method parses to the closing parenthesis. It is assumed that the
     * opening parenthesis has already been digested. On failure an opening
     * parenthesis followed by the items found is pushed back to the input.
     *
     * @return a pair of the items contained &ndash; the URL and the optional
     *         title string which may be {@code null} &ndash; or {@code null}
     *         on failure
     *
     * @throws IOException in case of an I/O error
     */
    public String[] expectParens() throws IOException {

        var result = new String[2];
        result[0] = expectUrl(')', false);
        if (result[0] == null) {
            unget('(');
            return null;
        }
        result[1] = expectString();
        if (!expect(')')) {
            // Push-back works like a stack. Thus the pieces have to be
            // restored last-first to end up as: ( url "title"
            if (result[1] != null) {
                unget(quote(result[1]));
                // a title can only be found after at least one blank which
                // has been dropped by expectString()
                unget(' ');
            }
            unget(result[0]);
            unget('(');
            return null;
        }
        return result;
    }

    /**
     * This method encloses a string in double quotes and escapes contained
     * double quotes and backslashes such that {@link #expectString()} reads
     * back the original string.
     *
     * @param s the raw string
     *
     * @return the quoted string
     */
    private static String quote(String s) {

        var quoted = new StringBuilder(s.length() + 2).append('"');
        for (var i = 0; i < s.length(); i++) {
            var ch = s.charAt(i);
            if (ch == '"' || ch == '\\') {
                quoted.append('\\');
            }
            quoted.append(ch);
        }
        return quoted.append('"').toString();
    }

    /**
     * This method expects a string enclosed in single or double quotes.
     * Leading spaces are skipped. A backslash takes the following character
     * literally. If the input ends before the closing quote then the
     * characters collected so far are returned.
     *
     * @return the string found without the quotes or {@code null} if no
     *         opening quote is found; in this case the input is restored
     *
     * @throws IOException in case of an I/O error
     */
    public String expectString() throws IOException {

        var spaces = new StringBuilder();
        var quote = skipSpace(spaces);
        if (quote != '"' && quote != '\'') {
            unget(quote, spaces);
            return null;
        }
        var s = new StringBuilder();
        for (var c = read(); c >= 0 && c != quote; c = read()) {
            if (c == '\\') {
                c = read();
                if (c < 0) {
                    break;
                }
            }
            s.append((char) c);
        }

        return s.toString();
    }

    /**
     * This method parses a URL and returns it. Leading spaces are skipped. The
     * URL extends to the next whitespace, the end character, or the end of
     * input. The terminating character is left in the input. A backslash takes
     * the following character literally. If no proper URL is found then the
     * characters collected are pushed back &ndash; without the backslashes of
     * escapes &ndash; and {@code null} is returned.
     *
     * @param endChar the additional end character &ndash; except whitespace
     * @param absolute indicate that only absolute URLs are allowed
     *
     * @return the URL found or {@code null} on failure
     *
     * @throws IOException in case of an I/O error
     */
    public String expectUrl(int endChar, boolean absolute) throws IOException {

        var spaces = new StringBuilder();
        var s = new StringBuilder();
        int c;
        for (c = skipSpace(spaces); c >= 0 && !Character.isWhitespace(c)
                        && c != endChar; c = read()) {
            if (c == '\\') {
                c = read();
                if (c < 0) {
                    break;
                }
            }
            s.append((char) c);
        }
        unget(c);
        var url = s.toString();
        if ((c < 0 && url.length() == 0)
                        || (absolute && !(url.startsWith("http://")
                                        || url.startsWith("https://")
                                        || url.startsWith("ftp://")
                                        || url.startsWith("irc://")
                                        || url.startsWith("mailto:")))) {
            unget(s);
            unget(spaces);
            url = null;
        }

        return url;
    }

    /**
     * This method looks at the next character without actually removing it from
     * the input.
     *
     * @return the next character or a negative value at end of input
     *
     * @throws IOException in case of an I/O error
     */
    public int lookahead() throws IOException {

        var c = read();
        unget(c);
        return c;
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     * The line endings \r and \r\n are mapped to \n.
     * </p>
     *
     * @see java.io.PushbackReader#read()
     */
    @Override
    public int read() throws IOException {

        var c = super.read();
        if (c != '\r') {
            return c;
        }
        // swallow the \n of a \r\n pair; anything else belongs to the next
        // line and is put back
        var next = super.read();
        if (next >= 0 && next != '\n') {
            unread(next);
        }
        return '\n';
    }

    /**
     * This method reads text in brackets. It is assumed that the opening
     * bracket has already been digested. Nested brackets are taken into
     * account. If the end of input is reached before the matching closing
     * bracket then the characters read are pushed back.
     *
     * @return the contents found in the brackets or {@code null}
     *
     * @throws IOException in case of an I/O error
     */
    public String readBrackets() throws IOException {

        var depth = 1;
        int c;
        var buffer = new StringBuilder();

        for (c = read(); c >= 0; c = read()) {
            if (c == '[') {
                depth++;
            } else if (c == ']' && --depth <= 0) {
                return buffer.toString();
            }
            buffer.append((char) c);
        }

        unget(c, buffer);
        return null;
    }

    /**
     * This method eats up all characters till the next newline. The newline
     * is digested but not contained in the result.
     *
     * @return the line read
     *
     * @throws IOException in case of an I/O error
     */
    public String readLine() throws IOException {

        return readLineTo(new StringBuilder()).toString();
    }

    /**
     * This method collects characters in a target buffer until a newline or one
     * of two terminating characters is found. The terminating character is
     * digested but not stored in the buffer.
     *
     * @param c the character already read
     * @param buffer the target buffer
     * @param c1 the first terminating character
     * @param c2 the second terminating character
     *
     * @return the character which caused the slurping to be terminated; this
     *         is a negative value at end of input
     *
     * @throws IOException in case of an I/O error
     */
    public int readLineTo(int c, StringBuilder buffer, char c1, char c2)
                    throws IOException {

        while (c >= 0 && c != '\n' && c != c1 && c != c2) {
            buffer.append((char) c);
            c = read();
        }
        return c;
    }

    /**
     * This method eats up all characters till the next newline. The newline
     * is digested but not stored in the buffer.
     *
     * @param buffer the target buffer
     *
     * @return the target buffer
     *
     * @throws IOException in case of an I/O error
     */
    public StringBuilder readLineTo(StringBuilder buffer) throws IOException {

        for (var c = read(); c >= 0 && c != '\n'; c = read()) {
            buffer.append((char) c);
        }
        return buffer;
    }

    /**
     * This method eats up all characters till the next newline or the
     * terminating character is encountered. The terminating character is
     * digested but not stored in the buffer.
     *
     * @param buffer the target buffer
     * @param cc the additional terminating character
     *
     * @return the character which terminated the reading; this is a negative
     *         value at end of input
     *
     * @throws IOException in case of an I/O error
     */
    public int readLineTo(StringBuilder buffer, char cc) throws IOException {

        int c;
        for (c = read(); c >= 0 && c != '\n' && c != cc; c = read()) {
            buffer.append((char) c);
        }
        return c;
    }

    /**
     * This method eats up all whitespace characters except newline and reads
     * the character following them.
     *
     * @return the first character which is a newline or not whitespace; this
     *         character has been digested; a negative value signals the end
     *         of input
     *
     * @throws IOException in case of an I/O error
     */
    public int readNonSpace() throws IOException {

        int c;
        do {
            c = read();
        } while (c >= 0 && c != '\n' && Character.isWhitespace(c));
        return c;
    }

    /**
     * This method reads characters to the next whitespace, &lt;, or &gt; into
     * a target buffer. The terminating character is left in the input.
     *
     * @param buffer the target buffer
     *
     * @return the buffer
     *
     * @throws IOException in case of an I/O error
     */
    public StringBuilder readToWhitespace(StringBuilder buffer)
                    throws IOException {

        int c;
        for (c = read(); c >= 0 && !Character.isWhitespace(c) && c != '<'
                        && c != '>'; c = read()) {
            buffer.append((char) c);
        }
        unget(c);
        return buffer;
    }

    /**
     * This method digests all hash marks found and returns their number. The
     * hash marks may be interspersed with spaces and tabs. The scan succeeds
     * if the last character digested is a space or a tab.
     *
     * <p>
     * On failure one more hash mark is pushed back than has been counted.
     * Presumably this restores a leading hash mark which the caller has
     * already digested &ndash; TODO confirm against the callers.
     * </p>
     *
     * @param max the limit for the return value; scanning stops as soon as
     *        the count exceeds this value
     *
     * @return the number of hashes counted or -1 on failure
     *
     * @throws IOException in case of an I/O error
     */
    public int scanSectionDepth(int max) throws IOException {

        var level = 0;
        int c;
        int last = '#';
        for (c = read(); level <= max; c = read()) {
            if (c == '#') {
                level++;
                last = '#';
            } else if (c != ' ' && c != '\t') {
                break;
            } else {
                last = c;
            }
        }
        unget(c);
        if (last == ' ' || last == '\t') {
            return level;
        }

        // level + 1 hash marks on purpose; see the method description
        while (level-- >= 0) {
            unget('#');
        }
        return -1;
    }

    /**
     * This method skips to EOL. The newline is digested as well.
     *
     * @throws IOException in case of an I/O error
     */
    public void skipLine() throws IOException {

        int c;
        do {
            c = read();
        } while (c >= 0 && c != '\n');
    }

    /**
     * This method reads ahead and stores all spaces, tabs, and form feeds in
     * the provided buffer.
     *
     * @param buffer the target buffer
     *
     * @return the first character which is none of those; this character has
     *         been digested; a negative value signals the end of input
     *
     * @throws IOException in case of an I/O error
     */
    public int skipSpace(StringBuilder buffer) throws IOException {

        int c;
        for (c = read(); c == ' ' || c == '\t' || c == '\f'; c = read()) {
            buffer.append((char) c);
        }
        return c;
    }

    /**
     * This method reads ahead and skips all spaces, tabs, and form feeds. The
     * first other character is left in the input. A space or form feed
     * advances the indentation by one; a tab advances it to the next multiple
     * of 4.
     *
     * @param indent the previous indentation
     * @return the new indentation level
     *
     * @throws IOException in case of an I/O error
     */
    public int skipWhiteSpace(int indent) throws IOException {

        int c;
        for (;;) {
            switch (c = read()) {
                case ' ':
                case '\f':
                    indent++;
                    continue;
                case '\t':
                    indent += 4 - indent % 4;
                    continue;
                default:
            }
            break;
        }
        unget(c);
        return indent;
    }

    /**
     * This method pushes back a string to the input stream such that it is
     * read next in its original order. A {@code null} string is ignored.
     *
     * <p>
     * An overflow of the push-back buffer is silently ignored. In this case
     * the leading characters which did not fit are dropped.
     * </p>
     *
     * @param s the string
     *
     * @throws IOException declared for uniformity with the other variants
     */
    public void unget(CharSequence s) throws IOException {

        if (s == null) {
            return;
        }
        try {
            for (var i = s.length() - 1; i >= 0; i--) {
                unread(s.charAt(i));
            }
        } catch (IOException ignored) {
            // best effort: a full push-back buffer is not reported
        }
    }

    /**
     * This method pushes back a character to the input stream. A negative
     * character code &ndash; i.e. the end of input marker &ndash; is ignored.
     *
     * @param c the character code
     *
     * @throws IOException in case of an I/O error
     */
    public void unget(int c) throws IOException {

        if (c >= 0) {
            unread(c);
        }
    }

    /**
     * This method pushes back a character and a string to the input stream.
     * The arguments are pushed in the given order. Thus they are read back
     * in reverse order: first {@code s} and then {@code c}.
     *
     * @param c the character code read last
     * @param s the string read before it
     *
     * @throws IOException in case of an I/O error
     */
    public void unget(int c, CharSequence s) throws IOException {

        unget(c);
        unget(s);
    }

    /**
     * This method pushes back a character and two strings to the input
     * stream. The arguments are pushed in the given order. Thus they are read
     * back in reverse order: {@code s2}, {@code s}, and then {@code c}.
     *
     * @param c the character code
     * @param s the string
     * @param s2 the second string
     *
     * @throws IOException in case of an I/O error
     */
    public void unget(int c, CharSequence s, CharSequence s2)
                    throws IOException {

        unget(c);
        unget(s);
        unget(s2);
    }

    /**
     * This method pushes back two characters and two strings to the input
     * stream. The arguments are pushed in the given order. Thus they are read
     * back in reverse order: {@code c2}, {@code s2}, {@code s}, and then
     * {@code c}.
     *
     * @param c the character code
     * @param s the string
     * @param s2 the second string
     * @param c2 the terminating character
     *
     * @throws IOException in case of an I/O error
     */
    public void unget(int c, CharSequence s, CharSequence s2, int c2)
                    throws IOException {

        unget(c);
        unget(s);
        unget(s2);
        unget(c2);
    }

    /**
     * This method pushes back two characters and a string to the input
     * stream. The arguments are pushed in the given order. Thus they are read
     * back in reverse order: {@code c2}, {@code s}, and then {@code c}.
     *
     * @param c the character code
     * @param s the string
     * @param c2 the terminating character
     *
     * @throws IOException in case of an I/O error
     */
    public void unget(int c, CharSequence s, int c2) throws IOException {

        unget(c);
        unget(s);
        unget(c2);
    }
}