PostingCache.java
/*
* Copyright (C) 2016-2025 The CTAN Team and individual authors
*
* This file is distributed under the 3-clause BSD license.
* See file LICENSE for details.
*/
package org.ctan.site.services.postings;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import jakarta.validation.constraints.NotNull;
import lombok.NonNull;
/**
* This class provides a container for {@link Posting}s.
*
* @author <a href="gene@ctan.org">Gerd Neugebauer</a>
*/
public class PostingCache {
/**
* The field <code>PKG_PATTERN</code> contains the pattern which recognises a
* reference to a package page, e.g. a URL containing
* <code>ctan.org/pkg/</code> or one of the older Catalogue or
* <code>tug.ctan.org</code> addresses. It is meant to be applied with
* {@link Matcher#matches()} to the whole body of a posting; group 2 then
* holds the package name found after the recognised prefix.
* <p>
* The leading <code>.*</code> is greedy, thus the last reference in the text
* is the one which is reported.
* </p>
*/
static final Pattern PKG_PATTERN = Pattern
.compile(//
".*([./]ctan[.]org/pkg/"
+ "|/help/Catalogue/entries/"
+ "|http://tug.ctan.org/info/[?]id="
+ "|http://tug.ctan.org/cgi-bin/ctanPackageInformation.py[?]id="
+ ")([a-zA-Z0-9_-]+).*", //
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE
| Pattern.DOTALL);
/**
 * The field <code>BODY_PATTERN</code> contains the pattern which recognises
 * a package by the CTAN directory or by a "Name of contribution" or
 * "Location on CTAN" line in the body of a posting. It is meant to be
 * applied with {@link Matcher#matches()} to the whole body; group 2 then
 * holds the package name found after the recognised prefix.
 * <p>
 * The alternatives are tried from left to right and the first one which
 * lets the whole expression match wins. Therefore an alternative has to be
 * placed <em>before</em> any other alternative which is a prefix of it;
 * otherwise the shorter one reports the next directory name as package
 * (e.g. <code>ps-type1</code> instead of the package below it).
 * </p>
 */
static final Pattern BODY_PATTERN = Pattern.compile(
    ".*(macros/latex/contrib/supported/"
        + "|macros/latex/contrib/"
        + "|macros/latex/contributed/"
        + "|/graphics/metapost/contrib/macros/"
        + "|graphics/pstricks/contrib/"
        + "|graphics/pgf/contrib/"
        + "|tex-archive/tools/"
        + "|ctan.org/info/[?]id="
        + "|Name of contribution: +"
        + "|Location on CTAN: /biblio/bibtex/utils/"
        + "|Location on CTAN: biblio/bibtex/contrib/"
        + "|Location on CTAN: /dviware/"
        + "|Location on CTAN: /info/"
        + "|Location on CTAN: /support/"
        // the two specific font directories must precede the generic one
        + "|Location on CTAN: /fonts/ps-type1/"
        + "|Location on CTAN: /?fonts/utilities/"
        + "|Location on CTAN: /fonts/"
        + "|Location on CTAN: /macro/generic/"
        + "|Location on CTAN: /macros/plain/"
        + "|Location on CTAN: /graphics/metapost/contrib/tools/"
        + "|Location on CTAN: /graphics/"
        + "|Location on CTAN: /language/"
        + ")([a-zA-Z0-9_-]+).*",
    Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
/**
 * The field <code>SUBJECT_PATTERN</code> contains the pattern which
 * recognises a package name in the subject line of an announcement. It is
 * meant to be applied with {@link Matcher#matches()} to the whole subject;
 * group 2 then holds the package name which follows one of the known
 * phrases and a separator made of blanks, tabs, colons, semicolons or
 * hyphens. An optional opening double quote before the name is skipped.
 * <p>
 * The alternatives are tried from left to right. A phrase which extends
 * another phrase (e.g. "package upgrade of" versus "package upgrade") has to
 * come first; otherwise the trailing word ("of") is reported as package.
 * </p>
 */
static final Pattern SUBJECT_PATTERN = Pattern.compile(".*(upload:"
    + "|CTAN +upload"
    + "|CTAN +upgrade"
    + "|CTAN +documentation update"
    + "|CTAN +update to"
    + "|CTAN +update package"
    + "|CTAN +update?d?"
    + "|CTAN +sub[mi]*ss?ion"
    + "|CTAN +change"
    + "|[op]acka?ge +updatg?ed?"
    // longer phrase first, see the remark on the order above
    + "|package +upgrade of"
    + "|package +upgrade"
    + "|package +bugfix"
    + "|package +submiss?ion"
    + "|new package"
    + "|new version of"
    + "|update [tof]*"
    + "|update notification"
    + "|has a new bundle"
    + "|upload notification"
    + "|new CTAN +documentation"
    + "|new CTAN item"
    + "|new CTAN package"
    + "|new LaTeX package"
    + "|new upload of"
    + "|new on CTAN)[ \t:;-]+"
    + "\"?([a-zA-Z0-9_-]+).*",
    Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
/**
 * The field <code>monthMap</code> maps the three-letter English abbreviation
 * of a month to its number, where January is 1 and December is 12.
 */
private static Map<String, Integer> monthMap;
static {
    // the position in this table determines the month number
    final String[] abbreviations = {
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    };
    monthMap = new HashMap<>();
    for (int index = 0; index < abbreviations.length; index++) {
        monthMap.put(abbreviations[index], index + 1);
    }
}
/**
* The field <code>base</code> contains the base directory containing all
* mail archive files. It is {@code null} until a base directory has been
* set.
*/
private File base = null;
/**
* The field <code>mailByPkg</code> contains the mapping of the package name
* in lower case to the postings associated with this package.
*/
private Map<String, List<Posting>> mailByPkg = new HashMap<>();
/**
* The field <code>mailByDate</code> contains the list of postings which is
* sorted at the end of each update. Only postings for which a package could
* be determined are added to this list.
*/
private List<Posting> mailByDate = new ArrayList<>();
/**
* The field <code>fingerprints</code> contains the mapping from file name
* to file length. It is used to detect new or changed archive files.
*/
private Map<String, Long> fingerprints = new HashMap<>();
/**
* The field <code>mail</code> contains the mapping from the id of a posting
* to the posting itself. It covers all postings read, with or without an
* associated package.
*/
private Map<String, Posting> mail = new HashMap<>();
/**
* The field <code>mailNewest</code> contains the cached list of newest
* postings or {@code null} if this list has not been computed yet.
*/
private List<Posting> mailNewest = null;
/**
* The field <code>mailNewestMax</code> contains the maximal length for
* which the cached list of newest postings has been composed.
*/
private int mailNewestMax = -1;
/**
* This is the constructor for <code>PostingCache</code>. It creates an empty
* cache without a base directory. It is private and serves as common
* starting point for the public constructor.
*/
private PostingCache() {
}
/**
* This is the constructor for <code>PostingCache</code>. The argument is
* considered as directory which is scanned for appropriate files. The
* postings found are read into the cache immediately.
* <p>
* The argument is annotated as non-null; thus {@code null} is not a legal
* value for the base directory.
* </p>
*
* @param base the base directory; must not be {@code null}
*
* @throws FileNotFoundException in case the argument is not an existing
*     directory
* @throws IOException in case of an I/O error while reading the archive
*     files
*/
@SuppressFBWarnings(value = "CT_CONSTRUCTOR_THROW")
public PostingCache(@NonNull File base)
throws FileNotFoundException,
IOException {
this();
setBase(base);
}
/**
 * This method collects the archive files, i.e. the files with the extension
 * <code>.txt.gz</code>, in the base directory. The result is ordered by the
 * first four characters of the file name and for equal prefixes by the
 * month number derived from the characters starting at index 5 (presumably
 * the names follow the scheme <code>yyyy-Month.txt.gz</code>).
 *
 * @return the sorted files or an empty array if the directory can not be
 *     listed
 */
private File[] fileList() {
    FilenameFilter archivesOnly = (dir, name) -> name.endsWith(".txt.gz");
    File[] archives = base.listFiles(archivesOnly);
    if (archives == null) {
        return new File[0];
    }
    Arrays.sort(archives, (left, right) -> {
        String leftName = left.getName();
        String rightName = right.getName();
        int byYear = leftName.substring(0, 4)
            .compareTo(rightName.substring(0, 4));
        if (byYear != 0) {
            return byYear;
        }
        // same year: fall back to the month
        return mapMonth(leftName.substring(5))
            - mapMonth(rightName.substring(5));
    });
    return archives;
}
/**
 * This method looks up a posting by its id.
 *
 * @param key the id of the posting
 *
 * @return the posting stored for the id or {@code null} if the id is not
 *     known
 */
public Posting get(String key) {
    final Posting posting = mail.get(key);
    return posting;
}
/**
 * The method <code>getBase</code> is the getter for the directory which
 * contains the mail archive files.
 *
 * @return the base directory
 */
protected File getBase() {
    return this.base;
}
/**
 * This method tells whether at least one posting has been cached for a
 * package. The package key is compared ignoring case since it is converted
 * to lower case before the lookup.
 *
 * @param pkg the key of the package; must not be {@code null}
 *
 * @return {@code true} iff a posting for the package has been encountered
 */
public boolean hasPostingForPackage(String pkg) {
    final String key = pkg.toLowerCase();
    // entries are created with a list only; thus presence means non-null
    return mailByPkg.containsKey(key);
}
/**
 * The method <code>isEmpty</code> tells whether the cache is void of
 * postings.
 *
 * @return {@code true} iff no posting has been read into the cache
 */
public boolean isEmpty() {
    return mail.size() == 0;
}
/**
 * The method <code>listByDate</code> returns postings from the date-sorted
 * list. The result starts with the first posting dated strictly after the
 * given start, which is interpreted as UTC, and contains at most the given
 * number of items.
 * <p>
 * The result is a new list. It stays valid when the cache is updated later
 * on and modifications to it do not affect the cache.
 * </p>
 *
 * @param start the start date; must not be {@code null}
 * @param max the maximal number of items; must be positive
 *
 * @return a new list of postings; empty if no posting is newer than start
 *
 * @throws IllegalArgumentException in case max is less than 1
 */
public List<Posting> listByDate(@NonNull LocalDateTime start, int max) {
    if (max < 1) {
        throw new IllegalArgumentException(
            "max is supposed to be positive but is " + max);
    }
    final Instant startInstant = start.toInstant(ZoneOffset.UTC);
    final int size = mailByDate.size();
    // binary search for the first posting dated after the start
    int lo = 0;
    int hi = size;
    while (lo < hi) {
        int mid = (lo + hi) >>> 1;
        if (startInstant.isBefore(mailByDate.get(mid).getDate())) {
            hi = mid;
        } else {
            lo = mid + 1;
        }
    }
    // clamp the length first; lo + max may overflow for a huge max
    int end = lo + Math.min(max, size - lo);
    return new ArrayList<>(mailByDate.subList(lo, end));
}
/**
 * This method retrieves postings associated to a package. The package key
 * is converted to lower case before the lookup. If more postings than
 * requested are stored then the last ones in the order of storing are
 * taken.
 *
 * @param pkg the key of the package; must not be {@code null}
 * @param max the maximal length; must be positive
 *
 * @return the requested list of postings, sorted increasingly. For a known
 *     package this is a new list; for an unknown package it is an
 *     immutable empty list.
 *
 * @throws IllegalArgumentException in case max is less than 1
 */
public List<Posting> listByPackage(@NonNull String pkg, int max) {
    if (max < 1) {
        throw new IllegalArgumentException(
            "max is supposed to be positive but is " + max);
    }
    var stored = mailByPkg.get(pkg.toLowerCase());
    if (stored == null) {
        return List.of();
    }
    var count = stored.size();
    // keep the tail of at most max elements
    var from = Math.max(count - max, 0);
    var selection = new ArrayList<>(stored.subList(from, count));
    Collections.sort(selection);
    return selection;
}
/**
 * This method retrieves the newest postings, newest first, up to a maximal
 * number. Older postings beyond this limit are left out. The result is
 * cached and the very same list is returned again as long as the same limit
 * is requested.
 *
 * @param max the truncation limit; must be positive
 *
 * @return the requested list of mails
 *
 * @throws IllegalArgumentException in case max is less than 1
 */
@SuppressFBWarnings(value = "EI_EXPOSE_REP")
public synchronized Iterable<Posting> listNewest(int max) {
    if (max < 1) {
        throw new IllegalArgumentException(
            "max is assumed to be positive instead of " + max);
    }
    if (mailNewest == null || mailNewestMax != max) {
        var total = mailByDate.size();
        var newest = new ArrayList<>(
            mailByDate.subList(Math.max(total - max, 0), total));
        Collections.reverse(newest);
        mailNewest = newest;
        mailNewestMax = max;
    }
    return mailNewest;
}
/**
 * This method retrieves the newest postings for a given package up to a
 * maximal number. The limit applies to the postings of the package, not to
 * the number of postings inspected. If the limit is reached then additional
 * older postings are not contained in the result.
 * <p>
 * The package name is converted to lower case since package names are
 * stored in lower case. The selected postings are returned in the order of
 * the internal sorted list, i.e. the oldest of the selected postings comes
 * first.
 * </p>
 *
 * @param pkg the CTAN name of the package; must not be {@code null}
 * @param max the truncation limit; must be positive
 *
 * @return the requested list of postings
 *
 * @throws IllegalArgumentException in case max is less than 1
 */
public Iterable<Posting> listNewest(@NonNull String pkg, int max) {
    if (max < 1) {
        throw new IllegalArgumentException(
            "max is assumed to be positive instead of " + max);
    }
    var key = pkg.toLowerCase();
    var mails = new ArrayList<Posting>();
    // walk from the newest posting backwards and count matches only
    for (int i = mailByDate.size() - 1;
            i >= 0 && mails.size() < max;
            i--) {
        Posting it = mailByDate.get(i);
        if (it.getPkg().contains(key)) {
            mails.add(it);
        }
    }
    return mails.reversed();
}
/**
 * The method <code>listPaged</code> provides means to retrieve one page of
 * the postings in reverse order of the internal sorted list, i.e. page 0
 * starts with the last posting of this list.
 * <p>
 * The result is a new list; it is empty if the page lies behind the last
 * posting.
 * </p>
 *
 * @param page the number of the page, starting with 0
 * @param size the maximal number of items on a page; must be positive
 *
 * @return a list of postings
 *
 * @throws IllegalArgumentException in case page is negative or size is less
 *     than 1
 */
public List<Posting> listPaged(int page, int size) {
    if (page < 0) {
        throw new IllegalArgumentException(
            "page is assumed to be not negative instead of " + page);
    }
    if (size < 1) {
        throw new IllegalArgumentException(
            "size is assumed to be positive instead of " + size);
    }
    final int total = mailByDate.size();
    // long arithmetic: page * size may exceed the range of int
    final long from = (long) page * size;
    if (from >= total) {
        return List.of();
    }
    // the end index of subList is exclusive, thus total is the upper bound
    final int to = (int) Math.min(from + size, (long) total);
    return new ArrayList<>(mailByDate.reversed().subList((int) from, to));
}
/**
 * This method provides a mapping from month name to the respective numeric
 * representation. Only the first three characters of the argument are taken
 * into account; they have to match one of the abbreviations "Jan" to "Dec"
 * exactly. Thus full names like "January" are accepted as well.
 *
 * @param s the name of the month, possibly followed by other characters
 *
 * @return a number 1 to 12 for the month January to December or 0 for an
 *     illegal argument, including {@code null} and strings shorter than
 *     three characters
 */
private int mapMonth(String s) {
    if (s == null || s.length() < 3) {
        return 0;
    }
    // the keys of the month map are three letters long
    var mon = monthMap.get(s.substring(0, 3));
    return mon != null ? mon : 0;
}
/**
 * This method is a getter for the keys of the packages for which postings
 * have been cached. The keys are in lower case.
 * <p>
 * The result is a read-only view. It reflects later updates of the cache
 * but it can not be used to modify the cache.
 * </p>
 *
 * @return the unmodifiable set of keys of packages
 */
public Set<String> packages() {
    return Collections.unmodifiableSet(mailByPkg.keySet());
}
/**
 * This method associates a posting with a package. The package name is
 * converted to lower case. The name is recorded in the posting, the posting
 * is appended to the list of postings for the package and to the list of
 * postings by date.
 *
 * @param pkg the name of the package
 * @param mail the mail
 */
private void putPkg(String pkg, Posting mail) {
    final String key = pkg.toLowerCase();
    mail.addPkg(key);
    // create the list for the package on first use
    mailByPkg.computeIfAbsent(key, k -> new ArrayList<>()).add(mail);
    mailByDate.add(mail);
}
/**
 * This method is the setter for the base directory. All postings cached so
 * far are dropped, including the cached list of newest postings, and the
 * archive files in the new base directory are read.
 *
 * @param dir the new base directory
 *
 * @throws FileNotFoundException in case the argument is not a directory
 * @throws IOException in case of an error while reading the files
 */
private synchronized void setBase(@NotNull File dir) throws IOException {
    if (dir.isDirectory()) {
        base = dir;
        // start over with fresh containers
        mail = new HashMap<>();
        mailByPkg = new HashMap<>();
        mailByDate = new ArrayList<>();
        fingerprints = new HashMap<>();
        mailNewest = null;
        update();
        return;
    }
    throw new FileNotFoundException("not a directory: " + dir.toString());
}
/**
 * This method reports how many postings have been read into the cache. All
 * postings are counted, with or without an associated package.
 *
 * @return the number of mails in the cache
 */
public int size() {
    final int count = mail.size();
    return count;
}
/**
 * This method collects the dates of the postings for a package. For each
 * posting one array of length 3 is produced which contains the year, the
 * month (1 to 12), and the day of the month of the posting in the default
 * time zone of the system.
 * <p>
 * The package key is converted to lower case before the lookup since the
 * keys are stored in lower case.
 * </p>
 *
 * @param pkg the key of the package
 *
 * @return the new list of dates; it is empty if the package is not known or
 *     the key is {@code null}
 */
public List<Integer[]> summaryCountByPkg(String pkg) {
    var result = new ArrayList<Integer[]>();
    if (pkg == null) {
        return result;
    }
    var list = mailByPkg.get(pkg.toLowerCase());
    if (list == null) {
        return result;
    }
    for (Posting p : list) {
        LocalDate date =
            p.getDate().atZone(ZoneId.systemDefault()).toLocalDate();
        result.add(new Integer[] {
            date.getYear(),
            date.getMonthValue(),
            date.getDayOfMonth()
        });
    }
    return result;
}
/**
 * The method <code>total</code> reports the length of the list of postings
 * by date.
 *
 * @return total number of mails in the list by date
 */
public int total() {
    final int count = mailByDate.size();
    return count;
}
/**
 * This method scans all files in the base directory and reads in the new or
 * changed mail archive files. A file is considered as changed if its length
 * differs from the length seen in the previous scan.
 * <p>
 * If at least one file has been read then the list of postings by date is
 * sorted again and the cached list of newest postings is discarded. This
 * happens even if reading fails with an exception.
 * </p>
 * <p>
 * The method is synchronized since it modifies the state which is used by
 * the other synchronized methods of this class.
 * </p>
 *
 * @throws IOException in case of an I/O error
 */
public synchronized void update() throws IOException {
    var changed = false;
    try {
        for (File file : fileList()) {
            var fileName = file.getName();
            var len = fingerprints.get(fileName);
            var fileLength = file.length();
            if (len == null || fileLength != len) {
                // recorded up front: a broken file is not retried until
                // its length changes
                fingerprints.put(fileName, fileLength);
                changed = true;
                updateFile(file);
            }
        }
    } finally {
        if (changed) {
            Collections.sort(mailByDate);
            // the cached list might miss the postings just read
            mailNewest = null;
        }
    }
}
/**
 * This method reads the postings from a single compressed archive file.
 * Empty files are skipped. A posting with an id which is already known is
 * ignored. Any other posting is stored under its id and, if a package can
 * be determined, associated with this package.
 *
 * @param file the input file
 *
 * @throws IOException in case of an I/O error
 */
private void updateFile(File file)
        throws IOException {
    if (file.length() == 0L) {
        return;
    }
    // each stream is a resource of its own; thus the file is closed even
    // if one of the wrapping constructors fails
    try (var in = new FileInputStream(file);
            var gzip = new GzipCompressorInputStream(in);
            var pr = new PostingReader(
                new InputStreamReader(gzip, StandardCharsets.UTF_8))) {
        for (var post = pr.read(); post != null; post = pr.read()) {
            var id = post.getId();
            if (mail.get(id) != null) {
                continue;
            }
            mail.put(id, post);
            var pkg = findPackage(post);
            if (pkg != null) {
                putPkg(pkg, post);
            }
        }
    }
}

/**
 * This method tries to determine the package a posting is about. The body
 * is checked against the package pattern first, then against the body
 * pattern. Finally the subject is checked against the subject pattern.
 * <p>
 * A missing body or subject is skipped. Whether the reader can deliver such
 * a posting is not visible here; the check is purely defensive.
 * </p>
 *
 * @param post the posting to analyse
 *
 * @return the package name as found in the posting or {@code null} if none
 *     of the patterns matches
 */
private static String findPackage(Posting post) {
    var body = post.getBody();
    if (body != null) {
        Matcher matcher = PKG_PATTERN.matcher(body);
        if (matcher.matches()) {
            return matcher.group(2);
        }
        matcher = BODY_PATTERN.matcher(body);
        if (matcher.matches()) {
            return matcher.group(2);
        }
    }
    var subject = post.get("Subject");
    if (subject != null) {
        Matcher matcher = SUBJECT_PATTERN.matcher(subject);
        if (matcher.matches()) {
            return matcher.group(2);
        }
    }
    return null;
}
}