SearchService.java
/*
* Copyright © 2012-2025 The CTAN Team and individual authors
*
* This file is distributed under the 3-clause BSD license.
* See file LICENSE for details.
*/
package org.ctan.site.services.search;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TopScoreDocCollectorManager;
import org.ctan.site.CtanConfiguration.IndexConfig;
import org.ctan.site.services.search.QueryContainer.HitInfo;
import org.ctan.site.services.search.base.CtanAnalyzer;
import org.ctan.site.services.search.base.Fields;
import org.ctan.site.services.search.base.IndexType;
import org.ctan.site.services.search.base.IndexingBase;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
/**
* The class <code>SearchService</code> implements a service to perform a
* full-text search.
*
* <h2>Configuration</h2> This service can be configured via the global
* Dropwizard configuration file <code>src/main/resources/ctan.yml</code>. The
* following configuration items are recognized:
* <dl>
* <dt>search.directory</dt>
* <dd>This configuration item contains the absolute path to the directory in
* which the sub-directories for the Lucene index files reside.</dd>
* </dl>
*
* @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
*/
@Slf4j
public class SearchService extends IndexingBase {
/**
* The class <code>RecommendationTo</code> contains the definition of the
* transport object for a recommendation.
* <p>
* Accessors, builder and the all-arguments constructor are generated by the
* Lombok annotations on the class. The order of the field declarations
* determines the parameter order of the generated all-arguments constructor,
* so the fields must not be reordered.
* </p>
*/
@Data
@Builder
@AllArgsConstructor
public static class RecommendationTo {
/** The path to the package description. */
String path;
/**
* The key of the recommended package. NOTE(review): presumably the last
* segment of {@code path} (the commented-out code in
* {@code recommendations} strips everything up to the last slash) – confirm.
*/
String key;
/** The short name of the package. */
String display;
/** The title of the package. */
String title;
/** The score of the hit. */
float score;
}
// @Data
// @Builder
// @AllArgsConstructor
// static public class SearchTo {
//
// String path;
//
// String key;
//
// String display;
//
// String title;
//
// float score;
//
// String type;
// }
/**
 * Fold a phrase to ASCII and append a {@code *} to each of its words.
 * <p>
 * Words are separated by runs of blanks and tabs. Empty words, which occur
 * for an empty phrase or a phrase starting with white-space, are skipped.
 * Otherwise they would turn into a stand-alone {@code *} term, which the
 * classic query parser rejects unless leading wildcards are enabled.
 * </p>
 *
 * @param phrase the phrase; must not be {@code null}
 * @return the adapted phrase in which every word is followed by
 *         <code>"* "</code>; the empty string if the phrase contains no
 *         words
 */
private static String addWildcard(String phrase) {
    var chars = phrase.toCharArray();
    var len = chars.length;
    // the folding may expand a single character to several ASCII
    // characters; the Lucene documentation asks for 4 times the length
    var output = new char[len * 4 + 1];
    var n = ASCIIFoldingFilter.foldToASCII(chars, 0, output, 0, len);
    var buffer = new StringBuilder();
    for (var w : new String(output, 0, n).split("[ \t]+")) {
        if (w.isEmpty()) {
            // never emit a bare "*"
            continue;
        }
        buffer.append(w).append("* ");
    }
    return buffer.toString();
}
// /**
// * Perform a query.
// *
// * @param query the query
// *
// * @return the result list
// */
// def findAllByPath(IndexType type, String path) {
//
// //TODO: Fixme
// def result = []
// IndexReader iReader = createReader(type)
// Query query = new TermQuery(new Term(Fields.PATH, path))
// TopScoreDocCollector collector = TopScoreDocCollector.create(1024, true)
// IndexSearcher searcher = new IndexSearcher(iReader, true)
//
// try {
// searcher.search(query, collector)
// if (collector.getTotalHits() == 0 ) {
// return result
// }
// TopDocs topDocs = collector.topDocs(0)
// for (ScoreDoc x : topDocs.scoreDocs) {
// Document doc = searcher.doc(x.doc)
// long lastModified = doc.getField(Fields.MTIME).
// numericValue().longValue()
// result.add([path: doc.get(Fields.PATH),
// type: doc.get(Fields.TYPE),
// title: doc.get(Fields.TITLE),
// display: doc.get(Fields.DISPLAY),
// lastModified: lastModified])
// }
// } finally {
// iReader.close()
// }
//
// return result
// }
/**
* This is the constructor for <code>SearchService</code>. It only passes the
* configuration on to the constructor of the base class
* <code>IndexingBase</code>.
* <p>
* The SpotBugs finding <code>CT_CONSTRUCTOR_THROW</code> is suppressed
* because this constructor is declared to throw an exception.
* </p>
*
* @param config the configuration; the parameter is annotated with
* <code>@NonNull</code>, thus <code>null</code> is not accepted
* @throws FileNotFoundException in case that one of the index directories
* does not exist
*/
@SuppressFBWarnings(value = "CT_CONSTRUCTOR_THROW")
public SearchService(@NonNull IndexConfig config)
throws FileNotFoundException {
super(config);
}
/**
 * Create the reader for an index and append it to a list of readers.
 * Nothing is appended when no reader could be created.
 *
 * @param list the list to augment
 * @param type the index type
 * @param locale the locale
 * @return <code>true</code> iff a reader has been appended to the list
 * @throws CorruptIndexException in case of a problem with the index
 * @throws IOException in case of an I/O error
 */
private boolean addReader(List<IndexReader> list, IndexType type,
        String locale)
        throws CorruptIndexException,
            IOException {
    final IndexReader candidate = createReader(type, locale);
    if (candidate != null) {
        list.add(candidate);
        return true;
    }
    return false;
}
/**
 * Open an index reader for the index of a given type and locale.
 * <p>
 * The check for a missing index directory is still disabled (see the TODO
 * below). Thus this method does not return {@code null} itself at the
 * moment; callers nevertheless guard against {@code null}.
 * </p>
 *
 * @param type the index type
 * @param locale the locale
 *
 * @return the new index reader or {@code null} if the index is not
 *         available
 *
 * @throws CorruptIndexException in case of a problem with the index
 * @throws IOException in case of an I/O error
 */
private IndexReader createReader(IndexType type, String locale)
        throws CorruptIndexException,
            IOException {
    // TODO
    // var index = getIndexDir(type, locale);
    // if (!index.toFile().exists()) {
    // return null;
    // }
    var directory = open(type, locale);
    return DirectoryReader.open(directory);
}
/**
 * Perform a query.
 * <p>
 * The maximal number of hits per request is limited to 512. If the query has
 * no phrase or no sections then the hits are reset and nothing is searched.
 * Otherwise one index reader per section is opened for the language of the
 * query (English as fallback), the phrase is parsed with AND as default
 * operator and the search is run across all readers.
 * </p>
 * <p>
 * The query container is updated in place: hit number, the parsed phrase,
 * the hits and the run time are stored in it. The extraction of the hits
 * from the index documents is still to be done (see the TODO below), so the
 * list of hits is empty at the moment.
 * </p>
 * <p>
 * All index readers opened here are closed before the method returns, also
 * when opening a later reader or parsing the phrase fails.
 * </p>
 *
 * @param query the query; it is modified and returned
 *
 * @return the query container
 * @throws IOException in case of an I/O error while opening the index; an
 *         I/O error during the search itself is logged and the query
 *         container is returned as it is
 * @throws CorruptIndexException in case of an error
 * @throws ParseException in case that the phrase can not be parsed
 */
public QueryContainer find(@NonNull QueryContainer query)
        throws CorruptIndexException,
            IOException,
            ParseException {
    if (query.getMax() > 512) {
        query.setMax(512);
    }
    if (query.getPhrase() == null || query.getSections().isEmpty()) {
        return query.resetHits();
    }
    var phrase = query.getPhrase().trim();
    if (query.isWildcard()) {
        phrase = addWildcard(phrase);
    }
    var t = System.currentTimeMillis();
    List<IndexReader> readerList = new ArrayList<>();
    try {
        String language = query.getLocale(Locale.ENGLISH).getLanguage();
        for (var it : query.getSections()) {
            addReader(readerList, it, language);
        }
        var parser = new QueryParser(Fields.DEFAULT, CtanAnalyzer.INSTANCE);
        // QueryParser parser = new QueryParser(Version.LATEST,
        // Fields.DEFAULT, ANALYZER)
        parser.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query q = parser.parse(phrase);
        if (query.hasType("PKG")) {
            // PhraseQuery pq = new PhraseQuery(Fields.PKG);
            // pq.add(new Term(Fields.PKG, query.phrase.trim()));
            // // pq.setBoost(128.0f)
            // BooleanQuery bq = new BooleanQuery()
            // bq.add(pq, BooleanClause.Occur.SHOULD)
            // bq.add(q, BooleanClause.Occur.SHOULD)
            // q = bq
        }
        // The collector has to gather the skipped hits as well; otherwise
        // any offset eats into the requested window of hits.
        int offset = Math.max(0, query.getOffset());
        int max = query.getMax();
        TopScoreDocCollector collector =
                new TopScoreDocCollectorManager(offset + max, null,
                    10000,
                    false).newCollector();
        IndexReader[] subReaders = readerList.toArray(new IndexReader[0]);
        // closeSubReaders == false: the sub-readers are owned by readerList
        // and closed in the finally block below.
        try (var reader = new MultiReader(subReaders, false)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            searcher.search(q, collector);
            query.setHitNumber(collector.getTotalHits());
            query.setPhrase(q.toString());
            List<HitInfo> results = new ArrayList<HitInfo>();
            TopDocs topDocs = collector.topDocs(offset, max);
            for (ScoreDoc it : topDocs.scoreDocs) {
                // TODO
                // Document doc = searcher.doc(it.doc);
                // results.add(HitInfo.builder()
                // .path(doc.get(Fields.PATH))
                // .title(doc.get(Fields.TITLE))
                // .type(doc.get(Fields.TYPE))
                // .display(doc.get(Fields.DISPLAY))
                // .lastModified(
                // doc.getField(Fields.MTIME).numericValue().longValue())
                // .build());
            }
            query.setHits(results);
            query.setRuntime(formatTime(System.currentTimeMillis() - t));
        } catch (IOException e) {
            // best effort: report the problem and hand back what we have
            log.error("Search failed for query '{}'", q, e);
            return query;
        }
    } finally {
        for (IndexReader it : readerList) {
            try {
                it.close();
            } catch (IOException e) {
                log.error("Failed to close index reader", e);
            }
        }
    }
    return query;
}
/**
 * The method <code>findByPath</code> is meant to look up the document for a
 * path in an index. The lookup is not implemented yet: the reader is opened
 * and closed again and a term query for the path is constructed but never
 * executed. Thus the result is always -1.
 *
 * @param type the type
 * @param locale the locale
 * @param path the path
 * @return -1
 * @throws CorruptIndexException in case of an error
 * @throws IOException in case of an I/O error
 */
int findByPath(IndexType type, String locale, String path)
        throws CorruptIndexException,
            IOException {
    final IndexReader indexReader = createReader(type, locale);
    if (indexReader != null) {
        try {
            // the query is built but not run yet
            new TermQuery(new Term(Fields.PATH, path));
        } finally {
            indexReader.close();
        }
    }
    return -1;
}
// /**
// * The method <code>find</code> provides means to perform a search.
// *
// * @param q the search term
// * @param sections the sections to search in
// * @param page the current page
// * @param size the page size
// * @param lang the locale
// * @return the search result
// */
// public QueryContainer find(@NonNull String q, IndexType[] sections,
// int page,
// int size, String lang) {
//
// // TODO Auto-generated method stub
//
// if (page > 0 || "x".equals(q)) {// TODO
// return QueryContainer.builder()
// .max(size)
// .offset(page)
// .phrase(q)
// .build();
// }
// return QueryContainer.builder()
// .max(size)
// .offset(page)
// .phrase(q)
// .hits(List.of(HitInfo.builder()
// .display("display display")// TODO
// .path("/path/path")// TODO
// .title("Title Title")// TODO
// .lastModified(121212L)// TODO
// .type("T")
// .build(),
// HitInfo.builder()
// .display("display display display")// TODO
// .path("/path/path/path")// TODO
// .title("Title Title Title")// TODO
// .lastModified(12121212L)// TODO
// .type("A")
// .build()))
// .build();
// }
// /**
// * This method retrieves the modification time for a path in an index.
// *
// * @param type the index type
// * @param path the URL path
// *
// * @return the date of last modification or 0
// */
// long getLastModified(IndexType type, String path) {
//
// long lastModified = 0l
// Directory directory = FSDirectory.open(getIndexDir(type))
// try {
// IndexSearcher searcher = new IndexSearcher(directory)
// Query query = new TermQuery(new Term(Fields.PATH, path))
// TopDocs docs = searcher.search(query, 1)
// def d = docs.scoreDocs
// if (d.length() >= 1) {
// Document doc = searcher.doc(d[0].doc)
// lastModified = doc.getField(Fields.MTIME).numericValue().longValue()
// }
// } finally {
// directory.close()
// }
// }
/**
 * Format a time span given in milliseconds.
 * <p>
 * The result is the concatenation of the non-zero parts for minutes
 * (<code>min</code>), seconds (<code>s</code>) and milliseconds
 * (<code>ms</code>), e.g. <code>1min1s1ms</code>. A time span of zero or
 * less, as it can result from a clock adjustment, is reported as
 * <code>0ms</code>. The parts are computed as <code>long</code> values, so
 * very large time spans do not overflow.
 * </p>
 *
 * @param t the time in milliseconds
 *
 * @return the time in readable format
 */
private static String formatTime(long t) {
    if (t <= 0) {
        return "0ms";
    }
    var buffer = new StringBuilder();
    long rest = t;
    if (rest >= 60000) {
        buffer.append(rest / 60000).append("min");
        rest %= 60000;
    }
    if (rest >= 1000) {
        buffer.append(rest / 1000).append("s");
        rest %= 1000;
    }
    if (rest > 0) {
        buffer.append(rest).append("ms");
    }
    return buffer.toString();
}
/**
 * This method retrieves a list of recommendations for a given package. The
 * list contains a transport object for each hit with the following entries:
 * <dl>
 * <dt>path</dt>
 * <dd>The path to the package description</dd>
 * <dt>display</dt>
 * <dd>The short name of the package</dd>
 * <dt>title</dt>
 * <dd>The title – currently in English</dd>
 * <dt>score</dt>
 * <dd>The score as float.</dd>
 * </dl>
 * <p>
 * The base package is looked up under the path <code>/pkg/</code> followed
 * by the package name. If it can not be found or the index is not available
 * then the empty list is returned. The index reader used for the search is
 * closed before the method returns.
 * </p>
 * <p>
 * The conversion of the hits into transport objects is still to be done (see
 * the TODO below), so the resulting list is empty at the moment.
 * </p>
 *
 * @param packageName the name of the base package
 * @param locale the language
 * @param maxNumber the maximal number results to be returned
 *
 * @return the list of recommendations
 * @throws IOException in case of an I/O error
 * @throws CorruptIndexException in case of an error
 */
public List<RecommendationTo> recommendations(String packageName,
        String locale,
        int maxNumber)
        throws CorruptIndexException,
            IOException {
    // Java has no string interpolation; the path has to be concatenated
    var pkgDocId = findByPath(IndexType.PKG, locale, "/pkg/" + packageName);
    if (pkgDocId < 0) {
        return List.of();
    }
    try (var indexReader = createReader(IndexType.PKG, locale)) {
        if (indexReader == null) {
            return List.of();
        }
        var indexSearcher = new IndexSearcher(indexReader);
        var mlt = new MoreLikeThis(indexReader);
        mlt.setMinTermFreq(1);
        mlt.setMinDocFreq(1);
        mlt.setFieldNames(new String[]{Fields.TAGS});
        mlt.setAnalyzer(CtanAnalyzer.INSTANCE);
        var query = mlt.like(pkgDocId);
        var topDocs = indexSearcher.search(query, maxNumber);
        List<RecommendationTo> result = new ArrayList<>();
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            if (scoreDoc.doc == pkgDocId) {
                // the base package is not a recommendation for itself
                continue;
            }
            // TODO
            // var similar = indexReader.document(scoreDoc.doc);
            // result.add(RecommendationTo.builder()
            // .path(similar.get(Fields.PATH))
            // .key(similar.get(Fields.PATH).replaceAll(".*/", ""))
            // .display(similar.get(Fields.DISPLAY))
            // .title(similar.get(Fields.TITLE))
            // .score(scoreDoc.score)
            // .build());
        }
        return result;
    }
}
}