CtanTokenizer.java

/*
 * Copyright (C) 2012-2025 The CTAN Team and individual authors
 *
 * This file is distributed under the 3-clause BSD license.
 * See file LICENSE for details.
 */

package org.ctan.site.services.search.base;

import java.io.IOException;
import java.io.Reader;
import java.util.Locale;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/**
 * This tokenizer knows a little bit of TeX.
 *
 * <p>
 * A token starts at a letter, a backslash, or an at sign. It is made up of
 * </p>
 * <ul>
 * <li>letters and at signs,</li>
 * <li>control words, i.e. a backslash followed by letters, which are kept
 * including the backslash,</li>
 * <li>control symbols, i.e. a backslash followed by a single non-letter,
 * which are kept as well &ndash; except for the accent commands
 * <code>\"</code>, <code>\'</code>, <code>\^</code>, <code>\`</code>, and
 * <code>\~</code>, which are dropped such that the accented letter is mapped
 * to its unaccented form.</li>
 * </ul>
 * <p>
 * Braces are skipped silently. Any other character terminates the token.
 * Tokens with less than two characters are discarded. The term text is
 * converted to lower case.
 * </p>
 *
 * @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
 */
public final class CtanTokenizer extends Tokenizer {

    /**
     * The constant <code>MIN_TOKEN_LENGTH</code> contains the minimal number
     * of characters a token must have to be reported.
     */
    private static final int MIN_TOKEN_LENGTH = 2;

    /**
     * The field <code>termAttribute</code> contains the attribute receiving
     * the text of the current token.
     */
    private final CharTermAttribute termAttribute =
            addAttribute(CharTermAttribute.class);

    /**
     * The field <code>offsetAttribute</code> contains the attribute receiving
     * the start and end offset of the current token.
     */
    private final OffsetAttribute offsetAttribute =
            addAttribute(OffsetAttribute.class);

    /**
     * The field <code>save</code> contains the character pushed back with
     * {@link #ungetc(int)} or a negative value if there is none.
     */
    private int save = -1;

    /**
     * The field <code>offset</code> contains the position in the input of the
     * character which will be returned by the next call to {@link #getc()}.
     */
    private int offset = 0;

    /**
     * Creates a new object.
     *
     * @param reader the reader
     */
    public CtanTokenizer(Reader reader) {

        setReader(reader);
        // Registered once here instead of once per token: the stream keeps
        // exposing a keyword attribute (with its default value) as before.
        addAttributeImpl(new KeywordAttributeImpl());
    }

    /**
     * {@inheritDoc} Sets the final offset to the number of characters
     * consumed.
     *
     * @see org.apache.lucene.analysis.TokenStream#end()
     */
    @Override
    public void end() throws IOException {

        super.end();
        var finalOffset = correctOffset(offset);
        offsetAttribute.setOffset(finalOffset, finalOffset);
    }

    /**
     * {@inheritDoc} Overwritten to make SpotBugs happy.
     *
     * @see org.apache.lucene.util.AttributeSource#equals(java.lang.Object)
     */
    @Override
    public boolean equals(Object obj) {

        return super.equals(obj);
    }

    /**
     * Retrieve the next character. A character pushed back with
     * {@link #ungetc(int)} is delivered first.
     *
     * @return the next character or -1
     *
     * @throws IOException in case of an I/O problem
     */
    private int getc() throws IOException {

        int c;
        if (save >= 0) {
            c = save;
            save = -1;
        } else {
            c = input.read();
        }
        if (c >= 0) {
            offset++;
        }
        return c;
    }

    /**
     * {@inheritDoc} Overwritten to make SpotBugs happy.
     *
     * @see org.apache.lucene.util.AttributeSource#hashCode()
     */
    @Override
    public int hashCode() {

        return super.hashCode();
    }

    /**
     * {@inheritDoc}
     *
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public boolean incrementToken() throws IOException {

        clearAttributes();

        var buffer = new StringBuilder();
        int start;
        int end;
        do {
            // skip everything which can not start a token
            int c;
            do {
                c = getc();
                if (c < 0) {
                    return false;
                }
            } while (!Character.isLetter(c) && c != '\\' && c != '@');

            start = offset - 1;
            end = offset;
            buffer.setLength(0);

            for (; c >= 0; c = getc()) {
                if (c == '\\') {
                    c = getc();
                    if (c < 0) {
                        // A dangling backslash at the end of the input
                        // must not discard the token collected so far.
                        break;
                    }
                    if (Character.isLetter(c)) {
                        buffer.append('\\');
                        do {
                            buffer.append((char) c);
                            c = getc();
                        } while (c >= 0 && Character.isLetter(c));
                        ungetc(c);
                    } else if (isAccent(c)) {
                        // map to unaccented
                        continue;
                    } else {
                        buffer.append('\\');
                        buffer.append((char) c);
                    }
                    end = offset;
                } else if (Character.isLetter(c) || c == '@') {
                    // One character per iteration: the character following
                    // the word is examined by the next iteration instead of
                    // being read ahead and lost.
                    buffer.append((char) c);
                    end = offset;
                } else if (c != '{' && c != '}') {
                    ungetc(c);
                    break;
                }
            }
        } while (buffer.length() < MIN_TOKEN_LENGTH);

        termAttribute.setEmpty()
            .append(buffer.toString().toLowerCase(Locale.ROOT));
        offsetAttribute.setOffset(correctOffset(start), correctOffset(end));
        return true;
    }

    /**
     * Check whether a character following a backslash denotes one of the TeX
     * accent commands which are dropped from the token.
     *
     * @param c the character after the backslash
     *
     * @return <code>true</code> iff the character is one of
     *     <code>" ' ^ ` ~</code>
     */
    private static boolean isAccent(int c) {

        return c == '"' || c == '\'' || c == '^' || c == '`' || c == '~';
    }

    /**
     * {@inheritDoc} Forgets a character pushed back while reading a previous
     * input and restarts the offset counting.
     *
     * @see org.apache.lucene.analysis.Tokenizer#reset()
     */
    @Override
    public void reset() throws IOException {

        super.reset();
        save = -1;
        offset = 0;
    }

    /**
     * Save away a character for later reading.
     *
     * @param c the character to push back. A negative value is also allowed,
     *     but will be ignored.
     */
    private void ungetc(int c) {

        save = c;
        if (c >= 0) {
            offset--;
        }
    }
}