MarkdownScanner.java
/*
* Copyright © 2014-2025 The CTAN Team and individual authors
*
* This file is distributed under the 3-clause BSD license.
* See file LICENSE for details.
*/
package org.ctan.markup.markdown;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
/**
 * This class is the reader for the markdown parser. It contains additional
 * methods to support this task.
 *
 * <p>
 * This reader applies a filter operation to map the line endings \r, \n, and
 * \r\n to \n. Thus all major types of line endings are treated the same. The
 * mapping is performed by the single-character {@link #read()} only.
 * </p>
 *
 * <p>
 * Most of the {@code expect...} methods work by reading ahead and pushing
 * the characters back on failure. The push-back buffer size passed to the
 * constructor therefore limits how much lookahead can be undone.
 * </p>
 *
 * @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
 */
public class MarkdownScanner extends PushbackReader {

    /**
     * This is the constructor for <code>MarkdownScanner</code>.
     *
     * @param in the input reader
     * @param size the push-back buffer size
     */
    public MarkdownScanner(Reader in, int size) {
        super(in, size);
    }

    /**
     * This method checks that the next characters of the input are identical
     * to the expected characters. If so they are digested. Otherwise all
     * characters read are pushed back into the input.
     *
     * @param s the expected characters
     *
     * @return {@code true} iff all expected characters have been found
     *
     * @throws IOException in case of an I/O error
     */
    public boolean expect(int... s) throws IOException {
        var matched = 0;
        for (int expected : s) {
            var actual = read();
            if (expected != actual) {
                // Restore in reverse order: the offending character first,
                // then the already matched prefix from its end to its start.
                unget(actual);
                while (matched-- > 0) {
                    unread(s[matched]);
                }
                return false;
            }
            matched++;
        }
        return true;
    }

    /**
     * This method skips spaces and checks for an expected character. If the
     * character is found then it is skipped as well. Otherwise the character
     * found is pushed back to the input stream. The skipped whitespace is
     * not restored in either case.
     *
     * @param cc the expected character
     *
     * @return {@code true} iff the expected character has been found
     *
     * @throws IOException in case of an I/O error
     */
    public boolean expectAfterSpace(char cc) throws IOException {
        var c = readNonSpace();
        if (c >= 0 && c == cc) {
            return true;
        }
        unget(c);
        return false;
    }

    /**
     * This method checks that the line contains only whitespace and at least 2
     * instances of the given character. On success the line including its
     * terminating newline is digested. On failure everything read is pushed
     * back to the input stream.
     *
     * @param cc the character which has to occur at least twice
     *
     * @return {@code true} iff the line has the expected form
     *
     * @throws IOException in case of an I/O error
     */
    public boolean expectLineWith(int cc) throws IOException {
        var n = 0;
        var buffer = new StringBuilder();
        for (var c = read(); c >= 0; c = read()) {
            buffer.append((char) c);
            if (c == cc) {
                n++;
            } else if (c == '\n') {
                // The newline is already part of the buffer. It must not be
                // pushed back separately or it would show up twice.
                break;
            } else if (!Character.isWhitespace(c)) {
                unget(buffer);
                return false;
            }
        }
        if (n < 2) {
            unget(buffer);
            return false;
        }
        return true;
    }

    /**
     * The method <code>expectNumberPeriod</code> skips spaces and checks for at
     * most 9 digits followed by a period and an optional space. If the
     * characters are found then they are skipped as well. Otherwise the
     * digits found are pushed back to the input stream. This includes the
     * digit passed in as argument.
     *
     * @param cc the digit already read; if it is not positive then leading
     *        whitespace is skipped instead
     * @return the number found or {@code null}
     * @throws IOException in case of an I/O error
     */
    public String expectNumberPeriod(int cc) throws IOException {
        var buffer = new StringBuilder();
        if (cc > 0) {
            buffer.append((char) cc);
        } else {
            skipWhiteSpace(0); // TODO count spaces
        }
        int c;
        // Use read() and not in.read(): the underlying reader knows nothing
        // about characters which have been pushed back, e.g. by
        // skipWhiteSpace() just above.
        for (c = read(); Character.isDigit(c); c = read()) {
            buffer.append((char) c);
        }
        unget(c);
        if (buffer.length() > 0 && buffer.length() <= 9 && expect('.')) {
            expect(' ');
            return buffer.toString();
        }
        unget(buffer);
        return null;
    }

    /**
     * This method parses a URL and an optional quoted title up to the closing
     * parenthesis. It is assumed that the opening parenthesis has already
     * been digested. On failure the opening parenthesis and the items read
     * are pushed back to the input stream and {@code null} is returned.
     *
     * @return a pair of the items contained: the URL at index 0 and the
     *         title, or {@code null} if there is none, at index 1
     *
     * @throws IOException in case of an I/O error
     */
    public String[] expectParens() throws IOException {
        var result = new String[2];
        result[0] = expectUrl(')', false);
        if (result[0] == null) {
            unget('(');
            return null;
        }
        result[1] = expectString();
        if (!expect(')')) {
            // The push-back works like a stack. Thus the items have to be
            // pushed in reverse stream order: title, URL, and finally '('.
            if (result[1] != null) {
                unget('"');
                unget(result[1]); // TODO: re-escape
                unget('"');
                // A title is only recognized after at least one blank which
                // has been dropped by expectString().
                unget(' ');
            }
            unget(result[0]);
            unget('(');
            return null;
        }
        return result;
    }

    /**
     * This method expects a string enclosed in single or double quotes.
     * Leading spaces, tabs, and form feeds are skipped. A backslash makes
     * the following character a part of the string. If no opening quote is
     * found then everything read is pushed back.
     *
     * <p>
     * If the end of the input is reached before the closing quote then the
     * characters collected so far are returned.
     * </p>
     *
     * @return the string found or {@code null} if no opening quote follows
     *
     * @throws IOException in case of an I/O error
     */
    public String expectString() throws IOException {
        var buffer = new StringBuilder();
        var cc = skipSpace(buffer);
        if (cc != '"' && cc != '\'') {
            unget(cc, buffer);
            return null;
        }
        var s = new StringBuilder();
        int c;
        for (c = read(); c >= 0 && c != cc; c = read()) {
            if (c == '\\') {
                c = read();
                if (c < 0) {
                    break;
                }
            }
            s.append((char) c);
        }
        return s.toString();
    }

    /**
     * This method parses a URL and returns it. If no proper URL is found then
     * the characters read are pushed back and {@code null} is returned.
     *
     * <p>
     * Leading spaces, tabs, and form feeds are skipped. The URL ends before
     * the first whitespace character, the end character, or the end of the
     * input. The terminating character is pushed back. A backslash makes the
     * following character a part of the URL; the backslash itself is dropped
     * and is not restored when the characters are pushed back.
     * </p>
     *
     * @param endChar the additional end character – except whitespace
     * @param absolute indicate that only absolute URLs are allowed
     *
     * @return the URL found or {@code null} on failure
     *
     * @throws IOException in case of an I/O error
     */
    public String expectUrl(int endChar, boolean absolute) throws IOException {
        var spaces = new StringBuilder();
        var s = new StringBuilder();
        int c;
        for (c = skipSpace(spaces); c >= 0 && !Character.isWhitespace(c)
                && c != endChar; c = read()) {
            if (c == '\\') {
                c = read();
                if (c < 0) {
                    break;
                }
            }
            s.append((char) c);
        }
        unget(c);
        var url = s.toString();
        if ((c < 0 && url.length() == 0)
                || (absolute && !(url.startsWith("http://")
                        || url.startsWith("https://")
                        || url.startsWith("ftp://")
                        || url.startsWith("irc://")
                        || url.startsWith("mailto:")))) {
            // Reverse stream order: the URL characters first, then the
            // spaces which preceded them.
            unget(s);
            unget(spaces);
            url = null;
        }
        return url;
    }

    /**
     * This method looks at the next character without actually removing it from
     * the input.
     *
     * @return the next character or a negative value at the end of the input
     *
     * @throws IOException in case of an I/O error
     */
    public int lookahead() throws IOException {
        var c = read();
        if (c >= 0) {
            unread(c);
        }
        return c;
    }

    /**
     * Reads a single character and maps the line endings \r and \r\n to \n.
     *
     * {@inheritDoc}
     *
     * @see java.io.PushbackReader#read()
     */
    @Override
    public int read() throws IOException {
        var c = super.read();
        if (c == '\r') {
            // Swallow a directly following \n; anything else is put back.
            c = super.read();
            if (c >= 0 && c != '\n') {
                unread(c);
            }
            return '\n';
        }
        return c;
    }

    /**
     * This method reads text in brackets. It is assumed that the opening
     * bracket has already been digested. Nested brackets are taken into
     * account and end up in the result. If the end of the input is reached
     * before the closing bracket then the characters read are pushed back.
     *
     * @return the contents found in the brackets or {@code null}
     *
     * @throws IOException in case of an I/O error
     */
    public String readBrackets() throws IOException {
        var n = 1;
        int c;
        var buffer = new StringBuilder();
        for (c = read(); c >= 0; c = read()) {
            switch (c) {
                case '[':
                    n++;
                    break;
                case ']':
                    if (--n <= 0) {
                        return buffer.toString();
                    }
                    break;
                default: // fall-through
            }
            buffer.append((char) c);
        }
        unget(c, buffer);
        return null;
    }

    /**
     * This method eats up all characters till the next newline. The newline
     * is digested but not contained in the result.
     *
     * @return the line read
     *
     * @throws IOException in case of an I/O error
     */
    public String readLine() throws IOException {
        return readLineTo(new StringBuilder()).toString();
    }

    /**
     * This method collects characters in a target buffer until a newline or one
     * of two terminating characters is found. The terminating character is
     * neither stored nor pushed back.
     *
     * @param c the character already read
     * @param buffer the target buffer
     * @param c1 the first terminating character
     * @param c2 the second terminating character
     *
     * @return the character which caused the slurping to be terminated; this
     *         is a negative value at the end of the input
     *
     * @throws IOException in case of an I/O error
     */
    public int readLineTo(int c, StringBuilder buffer, char c1, char c2)
            throws IOException {
        while (c >= 0 && c != '\n' && c != c1 && c != c2) {
            buffer.append((char) c);
            c = read();
        }
        return c;
    }

    /**
     * This method eats up all characters till the next newline. The newline
     * is digested but not stored in the buffer.
     *
     * @param buffer the target buffer
     *
     * @return the target buffer
     *
     * @throws IOException in case of an I/O error
     */
    public StringBuilder readLineTo(StringBuilder buffer) throws IOException {
        for (var c = read(); c >= 0 && c != '\n'; c = read()) {
            buffer.append((char) c);
        }
        return buffer;
    }

    /**
     * This method eats up all characters till the next newline or the
     * terminating character is encountered. The terminating character is
     * neither stored nor pushed back.
     *
     * @param buffer the target buffer
     * @param cc the additional terminating character
     *
     * @return the character which terminated the reading; this is a negative
     *         value at the end of the input
     *
     * @throws IOException in case of an I/O error
     */
    public int readLineTo(StringBuilder buffer, char cc) throws IOException {
        int c;
        for (c = read(); c >= 0 && c != '\n' && c != cc; c = read()) {
            buffer.append((char) c);
        }
        return c;
    }

    /**
     * This method eats up all whitespace characters except the newline.
     *
     * @return the first character which is a newline or no whitespace; this
     *         character is digested as well. At the end of the input a
     *         negative value is returned
     *
     * @throws IOException in case of an I/O error
     */
    public int readNonSpace() throws IOException {
        int c;
        for (c = read(); c >= 0 && c != '\n' && Character.isWhitespace(c); c =
                read()) {
            // skip
        }
        return c;
    }

    /**
     * This method reads characters to the next whitespace or angle bracket
     * into a target buffer. The terminating character is pushed back.
     *
     * @param buffer the target buffer
     *
     * @return the buffer
     *
     * @throws IOException in case of an I/O error
     */
    public StringBuilder readToWhitespace(StringBuilder buffer)
            throws IOException {
        int c;
        for (c = read(); c >= 0 && !Character.isWhitespace(c) && c != '<'
                && c != '>'; c = read()) {
            buffer.append((char) c);
        }
        unget(c);
        return buffer;
    }

    /**
     * This method digests all hash marks found and returns their number.
     * Spaces and tabs between the hash marks are skipped. The character
     * terminating the scan is pushed back.
     *
     * <p>
     * The scan succeeds only if the last character seen before the
     * terminating one is a space or a tab. Otherwise hash marks are pushed
     * back and -1 is returned. NOTE(review): one more hash mark is pushed
     * back than has been read here. Presumably the caller has already
     * digested the first hash mark &ndash; to be confirmed against the caller.
     * </p>
     *
     * @param max the limit for the return value
     *
     * @return the number of hashes up to the limit or -1 on failure
     *
     * @throws IOException in case of an I/O error
     */
    public int scanSectionDepth(int max) throws IOException {
        var level = 0;
        int c;
        int last = '#';
        for (c = read(); level <= max; c = read()) {
            if (c == '#') {
                level++;
                last = '#';
            } else if (c != ' ' && c != '\t') {
                break;
            } else {
                last = c;
            }
        }
        if (c >= 0) {
            unget(c);
        }
        if (last == ' ' || last == '\t') {
            return level;
        }
        // pushes back level + 1 hash marks; see the note in the Javadoc
        while (level-- >= 0) {
            unget('#');
        }
        return -1;
    }

    /**
     * This method skips to EOL. The newline is digested as well.
     *
     * @throws IOException in case of an I/O error
     */
    public void skipLine() throws IOException {
        for (var c = read(); c >= 0 && c != '\n'; c = read()) {
            // skip
        }
    }

    /**
     * This method reads ahead and stores all spaces, tabs, and form feeds in
     * the provided buffer.
     *
     * @param buffer the target buffer
     *
     * @return the first character which is none of those; this character is
     *         digested as well. At the end of the input a negative value is
     *         returned
     *
     * @throws IOException in case of an I/O error
     */
    public int skipSpace(StringBuilder buffer) throws IOException {
        int c;
        for (c = read(); c == ' ' || c == '\t' || c == '\f'; c = read()) {
            buffer.append((char) c);
        }
        return c;
    }

    /**
     * This method reads ahead and skips all spaces, tabs, and form feeds. A
     * space or form feed counts as one column. A tab advances to the next
     * multiple of 4. The first other character is pushed back.
     *
     * @param indent the previous indentation
     * @return the new indentation level
     *
     * @throws IOException in case of an I/O error
     */
    public int skipWhiteSpace(int indent) throws IOException {
        int c;
        for (;;) {
            switch (c = read()) {
                case ' ':
                case '\f':
                    indent++;
                    continue;
                case '\t':
                    indent += 4 - indent % 4;
                    continue;
                default:
            }
            break;
        }
        unget(c);
        return indent;
    }

    /**
     * This method pushes back a string to the input stream. A {@code null}
     * argument is silently ignored.
     *
     * <p>
     * The characters are pushed back from the last to the first. An
     * {@link IOException} raised while doing so is dropped. In this case
     * only the characters pushed so far &ndash; a tail of the string &ndash;
     * are available for reading again.
     * </p>
     *
     * @param s the string
     *
     * @throws IOException declared for the callers; currently never thrown
     */
    public void unget(CharSequence s) throws IOException {
        if (s == null) {
            return;
        }
        try {
            for (var i = s.length() - 1; i >= 0; i--) {
                unread(s.charAt(i));
            }
        } catch (IOException e) {
            // ignored
        }
    }

    /**
     * This method pushes back a character to the input stream. Negative
     * values, i.e. the end-of-input marker, are silently ignored.
     *
     * @param c the character code
     *
     * @throws IOException in case of an I/O error
     */
    public void unget(int c) throws IOException {
        if (c >= 0) {
            unread(c);
        }
    }

    /**
     * This method pushes back a character and a string to the input stream.
     * The arguments are pushed in the given order. Thus they are read again
     * in the reverse order: the string first and then the character.
     *
     * @param c the character code
     * @param s the string
     *
     * @throws IOException in case of an I/O error
     */
    public void unget(int c, CharSequence s) throws IOException {
        unget(c);
        unget(s);
    }

    /**
     * This method pushes back a character and two strings to the input
     * stream. The arguments are pushed in the given order. Thus they are
     * read again in the reverse order.
     *
     * @param c the character code
     * @param s the string
     * @param s2 the second string
     *
     * @throws IOException in case of an I/O error
     */
    public void unget(int c, CharSequence s, CharSequence s2)
            throws IOException {
        unget(c);
        unget(s);
        unget(s2);
    }

    /**
     * This method pushes back a character, two strings, and another
     * character to the input stream. The arguments are pushed in the given
     * order. Thus they are read again in the reverse order.
     *
     * @param c the character code
     * @param s the string
     * @param s2 the second string
     * @param c2 the terminating character
     *
     * @throws IOException in case of an I/O error
     */
    public void unget(int c, CharSequence s, CharSequence s2, int c2)
            throws IOException {
        unget(c);
        unget(s);
        unget(s2);
        unget(c2);
    }

    /**
     * This method pushes back a character, a string, and another character
     * to the input stream. The arguments are pushed in the given order. Thus
     * they are read again in the reverse order.
     *
     * @param c the character code
     * @param s the string
     * @param c2 the terminating character
     *
     * @throws IOException in case of an I/O error
     */
    public void unget(int c, CharSequence s, int c2) throws IOException {
        unget(c);
        unget(s);
        unget(c2);
    }
}