SearchService.java

/*
 * Copyright © 2012-2025 The CTAN Team and individual authors
 *
 * This file is distributed under the 3-clause BSD license.
 * See file LICENSE for details.
 */
package org.ctan.site.services.search;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TopScoreDocCollectorManager;
import org.ctan.site.CtanConfiguration.IndexConfig;
import org.ctan.site.services.search.QueryContainer.HitInfo;
import org.ctan.site.services.search.base.CtanAnalyzer;
import org.ctan.site.services.search.base.Fields;
import org.ctan.site.services.search.base.IndexType;
import org.ctan.site.services.search.base.IndexingBase;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;

/**
 * The class <code>SearchService</code> implements a service to perform a
 * full-text search.
 *
 * <h2>Configuration</h2> This service can be configured via the global
 * Dropwizard configuration file <code>src/main/resources/ctan.yml</code>. The
 * following configuration items are recognized:
 * <dl>
 * <dt>search.directory</dt>
 * <dd>This configuration item contains the absolute path to the directory in
 * which the sub-directories for the Lucene index files reside.</dd>
 * </dl>
 *
 * @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
 */
@Slf4j
public class SearchService extends IndexingBase {

    /**
     * The class <code>RecommendationTo</code> contains the definition of the
     * transport object for a recommendation.
     * <p>
     * The accessors, the builder and the all-arguments constructor are
     * generated by the Lombok annotations on the class.
     */
    @Data
    @Builder
    @AllArgsConstructor
    public static class RecommendationTo {

        /** The path to the package description. */
        String path;

        /**
         * The key of the recommended package. NOTE(review): presumably the
         * last segment of {@link #path} – confirm against the code filling
         * this object.
         */
        String key;

        /** The short name of the package. */
        String display;

        /** The title – currently in English. */
        String title;

        /** The score of the hit. */
        float score;
    }
    // @Data
    // @Builder
    // @AllArgsConstructor
    // static public class SearchTo {
    //
    // String path;
    //
    // String key;
    //
    // String display;
    //
    // String title;
    //
    // float score;
    //
    // String type;
    // }

    /**
     * Append a <code>*</code> to every word of a phrase.
     * <p>
     * The phrase is folded to ASCII first and then split at runs of spaces
     * and tabs. Each non-empty word is emitted followed by <code>"* "</code>;
     * the result therefore ends with a space whenever it contains a word.
     * Empty words – as they arise from an empty phrase or from leading
     * white-space – are skipped, since they would otherwise end up as a bare
     * <code>*</code> term in the query.
     *
     * @param phrase the phrase; must not be {@code null}
     * @return the adapted phrase, or the empty string if the phrase contains
     *     no word
     */
    private static String addWildcard(String phrase) {

        var chars = phrase.toCharArray();
        var len = chars.length;
        // folding may expand a single character to up to four characters
        var output = new char[len * 4 + 1];
        var n = ASCIIFoldingFilter.foldToASCII(chars, 0, output, 0, len);
        var buffer = new StringBuilder();
        for (var w : new String(output, 0, n).split("[ \t]+")) {
            if (w.isEmpty()) {
                // split() yields "" for an empty input or a leading separator
                continue;
            }
            buffer.append(w);
            buffer.append("* ");
        }
        return buffer.toString();
    }
    // /**
    // * Perform a query.
    // *
    // * @param query the query
    // *
    // * @return the result list
    // */
    // def findAllByPath(IndexType type, String path) {
    //
    // //TODO: Fixme
    // def result = []
    // IndexReader iReader = createReader(type)
    // Query query = new TermQuery(new Term(Fields.PATH, path))
    // TopScoreDocCollector collector = TopScoreDocCollector.create(1024, true)
    // IndexSearcher searcher = new IndexSearcher(iReader, true)
    //
    // try {
    // searcher.search(query, collector)
    // if (collector.getTotalHits() == 0 ) {
    // return result
    // }
    // TopDocs topDocs = collector.topDocs(0)
    // for (ScoreDoc x : topDocs.scoreDocs) {
    // Document doc = searcher.doc(x.doc)
    // long lastModified = doc.getField(Fields.MTIME).
    // numericValue().longValue()
    // result.add([path: doc.get(Fields.PATH),
    // type: doc.get(Fields.TYPE),
    // title: doc.get(Fields.TITLE),
    // display: doc.get(Fields.DISPLAY),
    // lastModified: lastModified])
    // }
    // } finally {
    // iReader.close()
    // }
    //
    // return result
    // }

    /**
     * This is the constructor for <code>SearchService</code>. It only passes
     * the configuration on to the constructor of the base class.
     *
     * @param config the index configuration; must not be {@code null}
     * @throws FileNotFoundException in case that one of the index directories
     *     does not exist
     */
    @SuppressFBWarnings(value = "CT_CONSTRUCTOR_THROW")
    public SearchService(@NonNull IndexConfig config)
        throws FileNotFoundException {

        super(config);
    }

    /**
     * Create the reader for an index and append it to a list of readers.
     * Nothing is appended when no reader could be created.
     *
     * @param list the list receiving the reader
     * @param type the index type
     * @param locale the locale
     * @return <code>true</code> iff a reader has been appended to the list
     * @throws CorruptIndexException in case of a problem with the index
     * @throws IOException in case of an I/O error
     */
    private boolean addReader(List<IndexReader> list, IndexType type,
        String locale)
        throws CorruptIndexException,
            IOException {

        IndexReader candidate = createReader(type, locale);
        boolean available = candidate != null;
        if (available) {
            list.add(candidate);
        }
        return available;
    }

    /**
     * Open a reader on the index for a given type and locale.
     *
     * @param type the index type
     * @param locale the locale
     *
     * @return the new index reader. The callers are prepared for a
     *     {@code null} result denoting an unavailable index, but the check
     *     producing it is not in place yet (see the TODO in the body).
     *
     * @throws CorruptIndexException in case of a problem with the index
     * @throws IOException in case of an I/O error
     */
    private IndexReader createReader(IndexType type, String locale)
        throws CorruptIndexException,
            IOException {

        // TODO return null if the index directory for the type and the
        // locale does not exist:
        // var index = getIndexDir(type, locale);
        // if (!index.toFile().exists()) {
        // return null;
        // }
        var directory = open(type, locale);
        return DirectoryReader.open(directory);
    }

    /**
     * Perform a query.
     * <p>
     * The maximal number of hits is capped at 512. If the query has no phrase
     * or no sections then the hits of the query are reset and the query is
     * returned immediately. Otherwise the trimmed phrase – with wildcards
     * added if requested – is parsed with <code>AND</code> as default
     * operator and searched in the indexes of all sections of the query for
     * the language of the query locale (falling back to English).
     * <p>
     * On success the query container receives the hit number, the textual
     * form of the parsed query as phrase, the hits, and the formatted
     * runtime. The transfer of the found documents into hits is still to be
     * done; thus the list of hits is always empty for now.
     * <p>
     * All readers opened for the search are closed before the method returns,
     * no matter whether it ends normally or with an exception.
     *
     * @param query the query; must not be {@code null}
     *
     * @return the query container passed in
     * @throws IOException in case of an I/O error while opening the indexes;
     *     an I/O error during the search itself is logged and the query is
     *     returned without the result attributes being set
     * @throws CorruptIndexException in case of a problem with an index
     * @throws ParseException in case that the phrase can not be parsed
     */
    public QueryContainer find(@NonNull QueryContainer query)
        throws CorruptIndexException,
            IOException,
            ParseException {

        if (query.getMax() > 512) {
            query.setMax(512);
        }
        if (query.getPhrase() == null || query.getSections().isEmpty()) {
            return query.resetHits();
        }
        var phrase = query.getPhrase().trim();
        if (query.isWildcard()) {
            phrase = addWildcard(phrase);
        }
        var t = System.currentTimeMillis();
        String language = query.getLocale(Locale.ENGLISH).getLanguage();
        List<IndexReader> readerList = new ArrayList<>();
        // Everything from opening the first reader on is guarded by the
        // finally clause below. Otherwise a failure while opening a later
        // reader or while parsing the phrase would leak the open readers.
        try {
            for (var it : query.getSections()) {
                addReader(readerList, it, language);
            }
            var parser =
                new QueryParser(Fields.DEFAULT, CtanAnalyzer.INSTANCE);
            parser.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query q = parser.parse(phrase);
            if (query.hasType("PKG")) {
                // TODO boost an exact match on Fields.PKG by combining a
                // phrase query on this field with q in a boolean query
            }
            // NOTE(review): the collector keeps query.getMax() hits only but
            // is read starting at query.getOffset() below. If the offset is
            // a hit index then any offset >= max yields no hits – confirm
            // the intended paging semantics.
            TopScoreDocCollector collector =
                new TopScoreDocCollectorManager(query.getMax(), null,
                    10000,
                    false).newCollector();
            var reader = new MultiReader(
                readerList.toArray(new IndexReader[0]), true);
            try {
                IndexSearcher searcher = new IndexSearcher(reader);
                searcher.search(q, collector);
                query.setHitNumber(collector.getTotalHits());
                query.setPhrase(q.toString());
                List<HitInfo> results = new ArrayList<>();
                TopDocs topDocs = collector.topDocs(query.getOffset());
                for (ScoreDoc it : topDocs.scoreDocs) {
                    // TODO load the stored document for it.doc and add a
                    // HitInfo with path, title, type, display, and
                    // lastModified (from Fields.MTIME) to the results
                }
                query.setHits(results);
                query.setRuntime(formatTime(System.currentTimeMillis() - t));
            } catch (IOException e) {
                // keep the stack trace; the message alone is rarely enough
                log.error("Search failed: {}", e.getMessage(), e);
            }
            return query;
        } finally {
            // The sub-readers are closed one by one such that the readers
            // opened before a failure are released as well.
            for (IndexReader opened : readerList) {
                try {
                    opened.close();
                } catch (IOException e) {
                    log.error("Closing an index reader failed: {}",
                        e.getMessage(), e);
                }
            }
        }
    }

    /**
     * The method <code>findByPath</code> is meant to look up the document
     * with a given path in an index. The lookup is not implemented yet: the
     * method opens and closes a reader for the index and always returns
     * <code>-1</code>.
     *
     * @param type the index type
     * @param locale the locale
     * @param path the path
     * @return -1
     * @throws CorruptIndexException in case of an error
     * @throws IOException in case of an I/O error
     */
    int findByPath(IndexType type, String locale, String path)
        throws CorruptIndexException,
            IOException {

        IndexReader reader = createReader(type, locale);
        if (reader != null) {
            try {
                // placeholder: the query is constructed but not executed yet
                new TermQuery(new Term(Fields.PATH, path));
            } finally {
                reader.close();
            }
        }
        return -1;
    }
    // /**
    // * The method <code>find</code> provides means to perform a search.
    // *
    // * @param q the search term
    // * @param sections the sections to search in
    // * @param page the current page
    // * @param size the page size
    // * @param lang the locale
    // * @return the search result
    // */
    // public QueryContainer find(@NonNull String q, IndexType[] sections,
    // int page,
    // int size, String lang) {
    //
    // // TODO Auto-generated method stub
    //
    // if (page > 0 || "x".equals(q)) {// TODO
    // return QueryContainer.builder()
    // .max(size)
    // .offset(page)
    // .phrase(q)
    // .build();
    // }
    // return QueryContainer.builder()
    // .max(size)
    // .offset(page)
    // .phrase(q)
    // .hits(List.of(HitInfo.builder()
    // .display("display display")// TODO
    // .path("/path/path")// TODO
    // .title("Title Title")// TODO
    // .lastModified(121212L)// TODO
    // .type("T")
    // .build(),
    // HitInfo.builder()
    // .display("display display display")// TODO
    // .path("/path/path/path")// TODO
    // .title("Title Title Title")// TODO
    // .lastModified(12121212L)// TODO
    // .type("A")
    // .build()))
    // .build();
    // }
    // /**
    // * This method retrieves the modification time for a path in an index.
    // *
    // * @param type the index type
    // * @param path the URL path
    // *
    // * @return the date of last modification or 0
    // */
    // long getLastModified(IndexType type, String path) {
    //
    // long lastModified = 0l
    // Directory directory = FSDirectory.open(getIndexDir(type))
    // try {
    // IndexSearcher searcher = new IndexSearcher(directory)
    // Query query = new TermQuery(new Term(Fields.PATH, path))
    // TopDocs docs = searcher.search(query, 1)
    // def d = docs.scoreDocs
    // if (d.length() >= 1) {
    // Document doc = searcher.doc(d[0].doc)
    // lastModified = doc.getField(Fields.MTIME).numericValue().longValue()
    // }
    // } finally {
    // directory.close()
    // }
    // }

    /**
     * Format a time span given in milliseconds.
     * <p>
     * The result is the concatenation of the non-zero parts for minutes
     * (<code>min</code>), seconds (<code>s</code>), and milliseconds
     * (<code>ms</code>) without separators, e.g. <code>1min2s3ms</code>.
     * A time of zero or less is reported as <code>0ms</code>; a negative
     * value can show up when the system clock is set back while a
     * measurement is running.
     *
     * @param t the time in milliseconds
     *
     * @return the time in readable format
     */
    private static String formatTime(long t) {

        if (t <= 0) {
            return "0ms";
        }
        StringBuilder buffer = new StringBuilder();
        long rest = t;
        if (rest >= 60000) {
            // no narrowing to int: the minutes of a huge value would wrap
            buffer.append(rest / 60000).append("min");
            rest %= 60000;
        }
        if (rest >= 1000) {
            buffer.append(rest / 1000).append("s");
            rest %= 1000;
        }
        if (rest > 0) {
            buffer.append(rest).append("ms");
        }
        return buffer.toString();
    }

    /**
     * This method retrieves a list of recommendations for a given package. The
     * list contains a transport object for each hit with the following entries:
     * <dl>
     * <dt>path</dt>
     * <dd>The path to the package description</dd>
     * <dt>display</dt>
     * <dd>The short name of the package</dd>
     * <dt>title</dt>
     * <dd>The title – currently in English</dd>
     * <dt>score</dt>
     * <dd>The score as float.</dd>
     * </dl>
     * <p>
     * The base package is looked up under the path
     * <code>/pkg/&lt;packageName&gt;</code> in the package index. Packages
     * with similar tags are then searched with Lucene's
     * <code>MoreLikeThis</code>. The empty list is returned if the base
     * package is not found, if the index is not available, or if
     * <code>maxNumber</code> is not positive.
     * <p>
     * The transfer of the found documents into transport objects is still to
     * be done; thus the returned list is always empty for now.
     *
     * @param packageName the name of the base package
     * @param locale the language
     * @param maxNumber the maximal number results to be returned
     *
     * @return the list of recommendations
     * @throws IOException in case of an I/O error
     * @throws CorruptIndexException in case of an error
     */
    public List<RecommendationTo> recommendations(String packageName,
        String locale,
        int maxNumber)
        throws CorruptIndexException,
            IOException {

        // plain concatenation: "${...}" is not interpolated in Java
        var pkgDocId =
            findByPath(IndexType.PKG, locale, "/pkg/" + packageName);
        if (pkgDocId < 0 || maxNumber <= 0) {
            return List.of();
        }
        // NOTE(review): pkgDocId has been determined with another reader
        // instance; Lucene document numbers are only meaningful for the
        // reader they come from – consider sharing one reader.
        try (var indexReader = createReader(IndexType.PKG, locale)) {
            if (indexReader == null) {
                return List.of();
            }
            var indexSearcher = new IndexSearcher(indexReader);
            var mlt = new MoreLikeThis(indexReader);
            mlt.setMinTermFreq(1);
            mlt.setMinDocFreq(1);
            mlt.setFieldNames(new String[]{Fields.TAGS});
            mlt.setAnalyzer(CtanAnalyzer.INSTANCE);
            var query = mlt.like(pkgDocId);
            var topDocs = indexSearcher.search(query, maxNumber);
            List<RecommendationTo> result = new ArrayList<>();
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                if (scoreDoc.doc == pkgDocId) {
                    // the base package is no recommendation for itself
                    continue;
                }
                // TODO load the stored document for scoreDoc.doc and add a
                // RecommendationTo with path, key (the path with everything
                // up to the last slash removed), display, title, and
                // scoreDoc.score to the result
            }
            return result;
        }
    }
}