// CtanAnalyzer.java
/*
* Copyright (C) 2012-2025 The CTAN Team and individual authors
*
* This file is distributed under the 3-clause BSD license.
* See file LICENSE for details.
*/
package org.ctan.site.services.search.base;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
/**
* This is a special {@link Analyzer} which has an extended set of stop words
* for the CTAN search.
*
* @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
*/
public class CtanAnalyzer extends StopwordAnalyzerBase {

    /**
     * The field <code>STOP_WORDS</code> contains English function words,
     * articles from German, French, Italian, and Spanish, plus common TeX
     * primitives and LaTeX macros. These carry no meaning for the CTAN
     * search and are removed from the token stream.
     */
    private static final String[] STOP_WORDS = {
        "a",
        "an",
        "and",
        "are",
        "as",
        "at",
        "be",
        "but",
        "by",
        "for",
        "if",
        "in",
        "into",
        "is",
        "it",
        "no",
        "not",
        "of",
        "on",
        "or",
        "s",
        "such",
        "t",
        "that",
        "the",
        "their",
        "then",
        "there",
        "these",
        "they",
        "this",
        "to",
        "was",
        "will",
        "with", //
        "ein",
        "eine",
        "einer",
        "eines",
        "einen",
        "einem",
        "der",
        "die",
        "das", // German articles
        "le",
        "la",
        "les", // French articles
        "il", // Italian article
        "el", // Spanish article
        "\\begin",
        "\\end",
        "\\def",
        "\\xdef",
        "\\edef",
        "\\gdef",
        "\\global",
        "\\newcommand",
        "\\renewcommand",
        "\\newenvironment",
        "\\renewenvironment"};

    /**
     * The field <code>INSTANCE</code> contains the singleton instance to use.
     */
    public static final CtanAnalyzer INSTANCE = new CtanAnalyzer();

    /**
     * This is the constructor for <code>CtanAnalyzer</code>. It registers
     * {@link #STOP_WORDS} — case-insensitively, so that e.g. "The" is caught
     * as well — as the stop word set of the base class.
     */
    private CtanAnalyzer() {
        super(new CharArraySet(Arrays.asList(STOP_WORDS), true));
    }

    /**
     * {@inheritDoc}
     *
     * <p>The token stream splits the input on whitespace, drops tokens
     * shorter than three characters, and finally removes the stop words
     * registered in the constructor. The length filter alone would let
     * longer stop words ("such", "their", "\begin", …) through; the
     * {@link StopFilter} stage is what actually applies
     * {@link #STOP_WORDS}.</p>
     *
     * @see org.apache.lucene.analysis.Analyzer#createComponents(java.lang.String)
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer source = new WhitespaceTokenizer();
        TokenStream result = new LengthFilter(source, 3, Integer.MAX_VALUE);
        result = new StopFilter(result, getStopwordSet());
        return new TokenStreamComponents(source, result);
    }

}