CtanTokenizer.java
/*
* Copyright (C) 2012-2025 The CTAN Team and individual authors
*
* This file is distributed under the 3-clause BSD license.
* See file LICENSE for details.
*/
package org.ctan.site.services.search.base;
import java.io.IOException;
import java.io.Reader;
import java.util.Locale;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
 * This tokenizer knows a little bit of TeX.
 *
 * <p>
 * A token starts at the first letter, backslash, or {@code @} found in the
 * input and is assembled by the following rules:
 * </p>
 * <ul>
 * <li>Letters and {@code @} are collected as they are.</li>
 * <li>A backslash followed by letters is a macro name; it is collected
 * including the backslash.</li>
 * <li>A backslash followed by one of the accent characters {@code "},
 * {@code '}, {@code ^}, {@code `}, or {@code ~} is dropped, so that the
 * accented letter is mapped to its unaccented form.</li>
 * <li>A backslash followed by any other character is collected together with
 * that character.</li>
 * <li>Braces are skipped without ending the token.</li>
 * <li>Any other character ends the token.</li>
 * </ul>
 * <p>
 * Tokens with fewer than two collected characters are discarded. The term
 * text is converted to lower case.
 * </p>
 *
 * @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
 */
public final class CtanTokenizer extends Tokenizer {

    /**
     * The field <code>MIN_TOKEN_LENGTH</code> contains the minimal number of
     * collected characters a token must have to be reported.
     */
    private static final int MIN_TOKEN_LENGTH = 2;

    /**
     * The field <code>termAttribute</code> contains the attribute receiving
     * the text of the current token.
     */
    private final CharTermAttribute termAttribute =
            addAttribute(CharTermAttribute.class);

    /**
     * The field <code>offsetAttribute</code> contains the attribute receiving
     * the start and end offset of the current token.
     */
    private final OffsetAttribute offsetAttribute =
            addAttribute(OffsetAttribute.class);

    /**
     * The field <code>offset</code> contains the number of characters
     * consumed so far. A pushed back character is not counted.
     */
    private int offset = 0;

    /**
     * The field <code>save</code> contains the character pushed back or -1 if
     * there is none.
     */
    private int save = -1;

    /**
     * Creates a new object.
     *
     * @param reader the reader
     */
    public CtanTokenizer(Reader reader) {
        // Attributes have to be registered once at construction time and
        // not while tokens are produced.
        addAttributeImpl(new KeywordAttributeImpl());
        setReader(reader);
    }

    /**
     * Check whether a character following a backslash denotes a TeX accent.
     *
     * @param c the character to check
     *
     * @return {@code true} iff the character is one of the accent characters
     */
    private static boolean isAccent(int c) {
        return c == '"' || c == '\'' || c == '^' || c == '`' || c == '~';
    }

    /**
     * {@inheritDoc} Overwritten to report the final offset, i.e. the number
     * of characters consumed.
     *
     * @see org.apache.lucene.analysis.TokenStream#end()
     */
    @Override
    public void end() throws IOException {
        super.end();
        int finalOffset = correctOffset(offset);
        offsetAttribute.setOffset(finalOffset, finalOffset);
    }

    /**
     * {@inheritDoc} Overwritten to make SpotBugs happy.
     *
     * @see org.apache.lucene.util.AttributeSource#equals(java.lang.Object)
     */
    @Override
    public boolean equals(Object obj) {
        return super.equals(obj);
    }

    /**
     * Retrieve the next character. A character pushed back with
     * {@link #ungetc(int)} is delivered before the input is read again.
     *
     * @return the next character or -1
     *
     * @throws IOException in case of an I/O problem
     */
    private int getc() throws IOException {
        int c;
        if (save >= 0) {
            c = save;
            save = -1;
        } else {
            c = input.read();
        }
        if (c >= 0) {
            offset++;
        }
        return c;
    }

    /**
     * {@inheritDoc} Overwritten to make SpotBugs happy.
     *
     * @see org.apache.lucene.util.AttributeSource#hashCode()
     */
    @Override
    public int hashCode() {
        return super.hashCode();
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     * The collected token is stored in lower case in the term attribute and
     * its position in the input is stored in the offset attribute.
     * </p>
     *
     * @return {@code true} if a token has been produced and {@code false} at
     *         the end of the input
     *
     * @throws IOException in case of an I/O problem
     *
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        StringBuilder buffer = new StringBuilder();
        int start = 0;
        int end = 0;

        do {
            // skip everything which can not start a token
            int c;
            do {
                c = getc();
                if (c < 0) {
                    return false;
                }
            } while (!Character.isLetter(c) && c != '\\' && c != '@');

            start = offset - 1;
            end = start;
            buffer.setLength(0);

            for (; c >= 0; c = getc()) {
                if (c == '\\') {
                    c = getc();
                    if (c < 0) {
                        // A dangling backslash at the end of the input must
                        // not discard what has been collected so far.
                        break;
                    }
                    if (Character.isLetter(c)) {
                        // macro name
                        buffer.append('\\');
                        do {
                            buffer.append((char) c);
                            c = getc();
                        } while (c >= 0 && Character.isLetter(c));
                        ungetc(c);
                        end = offset;
                    } else if (!isAccent(c)) {
                        buffer.append('\\');
                        buffer.append((char) c);
                        end = offset;
                    }
                    // otherwise: accent dropped to map to unaccented
                } else if (Character.isLetter(c) || c == '@') {
                    do {
                        buffer.append((char) c);
                        c = getc();
                    } while (c >= 0 && (Character.isLetter(c) || c == '@'));
                    // The terminating character has to be examined by the
                    // enclosing loop; without the push back it would be
                    // lost and two words separated by a single character
                    // would be merged into one token.
                    ungetc(c);
                    end = offset;
                } else if (c != '{' && c != '}') {
                    // anything but a brace terminates the token
                    ungetc(c);
                    break;
                }
            }
        } while (buffer.length() < MIN_TOKEN_LENGTH);

        termAttribute.setEmpty()
            .append(buffer.toString().toLowerCase(Locale.ROOT));
        offsetAttribute.setOffset(correctOffset(start), correctOffset(end));
        return true;
    }

    /**
     * {@inheritDoc} Overwritten to forget the state left over from the
     * previous input: the pushed back character and the offset.
     *
     * @see org.apache.lucene.analysis.Tokenizer#reset()
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        save = -1;
        offset = 0;
    }

    /**
     * Save away a character for later reading.
     *
     * @param c the character to push back. A negative value is also allowed,
     *        but will be ignored.
     */
    private void ungetc(int c) {
        if (c >= 0) {
            save = c;
            offset--;
        }
    }
}