// CtanAnalyzer.java

/*
 * Copyright (C) 2012-2025 The CTAN Team and individual authors
 *
 * This file is distributed under the 3-clause BSD license.
 * See file LICENSE for details.
 */

package org.ctan.site.services.search.base;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;

/**
 * This is a special {@link Analyzer} which has an extended set of stop words
 * for the CTAN search.
 * <p>
 * The analysis chain splits the input at whitespace, discards tokens shorter
 * than {@value #MIN_TOKEN_LENGTH} characters and then discards every token
 * contained in the stop word set. Stop words are matched ignoring case; the
 * tokens which pass the filters are emitted unchanged (no lower-casing or
 * folding is applied).
 * </p>
 *
 * @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
 */
public class CtanAnalyzer extends StopwordAnalyzerBase {

    /**
     * The field <code>MIN_TOKEN_LENGTH</code> contains the minimal number of
     * characters a token needs to have to be kept.
     */
    private static final int MIN_TOKEN_LENGTH = 3;

    /**
     * The field <code>STOP_WORDS</code> contains the words which are removed
     * from the token stream: common English words, articles of some other
     * languages, and a few TeX and LaTeX definition macros.
     * <p>
     * This field has to be declared before {@link #INSTANCE} since static
     * initializers run in textual order and the constructor reads it.
     * </p>
     */
    private static final String[] STOP_WORDS = {
        // English stop words
        "a",
        "an",
        "and",
        "are",
        "as",
        "at",
        "be",
        "but",
        "by",
        "for",
        "if",
        "in",
        "into",
        "is",
        "it",
        "no",
        "not",
        "of",
        "on",
        "or",
        "s",
        "such",
        "t",
        "that",
        "the",
        "their",
        "then",
        "there",
        "these",
        "they",
        "this",
        "to",
        "was",
        "will",
        "with",
        // German articles
        "ein",
        "eine",
        "einer",
        "eines",
        "einen",
        "einem",
        "der",
        "die",
        "das",
        // French articles
        "le",
        "la",
        "les",
        // Italian article
        "il",
        // Spanish article
        "el",
        // TeX and LaTeX macros
        "\\begin",
        "\\end",
        "\\def",
        "\\xdef",
        "\\edef",
        "\\gdef",
        "\\global",
        "\\newcommand",
        "\\renewcommand",
        "\\newenvironment",
        "\\renewenvironment"};

    /**
     * The field <code>INSTANCE</code> contains the instance to use.
     */
    public static final CtanAnalyzer INSTANCE = new CtanAnalyzer();

    /**
     * This is the constructor for <code>CtanAnalyzer</code>.
     * <p>
     * It hands the stop words to the base class as a case-insensitive set
     * which is then available via the inherited <code>stopwords</code>
     * field and {@link #getStopwordSet()}.
     * </p>
     */
    private CtanAnalyzer() {

        super(StopFilter.makeStopSet(STOP_WORDS, true));
    }

    /**
     * Build the analysis chain: whitespace tokenizer, length filter, and stop
     * word filter.
     *
     * @param fieldName the name of the field to analyze; it is not used since
     *        all fields are treated alike
     * @return the tokenizer together with the filtered token stream
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {

        final Tokenizer source = new WhitespaceTokenizer();
        // drop the short tokens first ...
        TokenStream result =
                new LengthFilter(source, MIN_TOKEN_LENGTH, Integer.MAX_VALUE);
        // ... and then the remaining stop words
        result = new StopFilter(result, stopwords);
        return new TokenStreamComponents(source, result);
    }
}