PostingCache.java

/*
 * Copyright (C) 2016-2025 The CTAN Team and individual authors
 *
 * This file is distributed under the 3-clause BSD license.
 * See file LICENSE for details.
 */
package org.ctan.site.services.postings;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import jakarta.validation.constraints.NotNull;
import lombok.NonNull;

/**
 * This class provides a container for {@link Posting}s.
 *
 * @author <a href="gene@ctan.org">Gerd Neugebauer</a>
 */
public class PostingCache {

    /**
     * The field <code>PKG_PATTERN</code> contains the pattern to identify a
     * package.
     */
    static final Pattern PKG_PATTERN = Pattern
        .compile(//
            ".*([./]ctan[.]org/pkg/"
                + "|/help/Catalogue/entries/"
                + "|http://tug.ctan.org/info/[?]id="
                + "|http://tug.ctan.org/cgi-bin/ctanPackageInformation.py[?]id="
                + ")([a-zA-Z0-9_-]+).*", //
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE
                | Pattern.DOTALL);

    /**
     * The field <code>BODY_PATTERN</code> contains the pattern to identify a
     * package.
     */
    static final Pattern BODY_PATTERN = Pattern.compile(
        ".*(macros/latex/contrib/supported/"
            + "|macros/latex/contrib/"
            + "|macros/latex/contributed/"
            + "|/graphics/metapost/contrib/macros/"
            + "|graphics/pstricks/contrib/"
            + "|graphics/pgf/contrib/"
            + "|tex-archive/tools/"
            + "|ctan.org/info/[?]id="
            + "|Name of contribution: +"
            + "|Location on CTAN: /biblio/bibtex/utils/"
            + "|Location on CTAN: biblio/bibtex/contrib/"
            + "|Location on CTAN: /dviware/"
            + "|Location on CTAN: /info/"
            + "|Location on CTAN: /support/"
            + "|Location on CTAN: /fonts/"
            + "|Location on CTAN: /fonts/ps-type1/"
            + "|Location on CTAN: /?fonts/utilities/"
            + "|Location on CTAN: /macro/generic/"
            + "|Location on CTAN: /macros/plain/"
            + "|Location on CTAN: /graphics/metapost/contrib/tools/"
            + "|Location on CTAN: /graphics/"
            + "|Location on CTAN: /language/"
            + ")([a-zA-Z0-9_-]+).*", //
        Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    /**
     * The field <code>SUBJECT_PATTERN</code> contains the pattern to identify a
     * package.
     */
    static final Pattern SUBJECT_PATTERN = Pattern.compile(".*(upload:"
        + "|CTAN +upload" //
        + "|CTAN +upgrade" //
        + "|CTAN +documentation update" //
        + "|CTAN +update to" //
        + "|CTAN +update package" //
        + "|CTAN +update?d?" //
        + "|CTAN +sub[mi]*ss?ion" //
        + "|CTAN +change" //
        + "|[op]acka?ge +updatg?ed?" //
        + "|package +upgrade" //
        + "|package +upgrade of" //
        + "|package +bugfix" //
        + "|package +submiss?ion" //
        + "|new package" //
        + "|new version of" //
        + "|update [tof]*" //
        + "|update notification" //
        + "|has a new bundle" //
        + "|upload notification" //
        + "|new CTAN +documentation" //
        + "|new CTAN item" //
        + "|new CTAN package" //
        + "|new LaTeX package" //
        + "|new upload of" //
        + "|new on CTAN)[ \t:;-]+" //
        + "\"?([a-zA-Z0-9_-]+).*", //
        Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    /**
     * The field <code>monthMap</code> contains a map from month name to int.
     */
    private static Map<String, Integer> monthMap;

    static {
        monthMap = new HashMap<>();
        monthMap.put("Jan", 1);
        monthMap.put("Feb", 2);
        monthMap.put("Mar", 3);
        monthMap.put("Apr", 4);
        monthMap.put("May", 5);
        monthMap.put("Jun", 6);
        monthMap.put("Jul", 7);
        monthMap.put("Aug", 8);
        monthMap.put("Sep", 9);
        monthMap.put("Oct", 10);
        monthMap.put("Nov", 11);
        monthMap.put("Dec", 12);
    }

    /**
     * The field <code>base</code> contains the base directory containing all
     * mail archive files.
     */
    private File base = null;

    /**
     * The field <code>mailByPkg</code> contains the mapping of package make to
     * associated postings.
     */
    private Map<String, List<Posting>> mailByPkg = new HashMap<>();

    /**
     * The field <code>mailByDate</code> contains the list of postings sorted by
     * date.
     */
    private List<Posting> mailByDate = new ArrayList<>();

    /**
     * The field <code>fingerprints</code> contains the mapping from file name
     * to file length.
     */
    private Map<String, Long> fingerprints = new HashMap<>();

    /**
     * The field <code>mail</code> contains the header fields.
     */
    private Map<String, Posting> mail = new HashMap<>();

    /**
     * The field <code>mailNewest</code> contains the list of newest postings.
     */
    private List<Posting> mailNewest = null;

    /**
     * The field <code>mailNewestMax</code> contains the max value for the
     * newest list is composed.
     */
    private int mailNewestMax = -1;

    /**
     * This is the constructor for <code>PostingCache</code>.
     */
    private PostingCache() {

    }

    /**
     * This is the constructor for <code>PostingCache</code>. The argument is
     * considered as directory which is scanned for appropriate files. If the
     * directory is {@code null} then the scanning is omitted and the cache is
     * left empty.
     *
     * @param base the base directory
     *
     * @throws FileNotFoundException in case the file does not exist
     * @throws IOException in case of an I/O error
     */
    @SuppressFBWarnings(value = "CT_CONSTRUCTOR_THROW")
    public PostingCache(@NonNull File base)
        throws FileNotFoundException,
            IOException {

        this();
        setBase(base);
    }

    /**
     * This method scans the base directory for all files contained.
     *
     * @return the file list
     */
    private File[] fileList() {

        var files = base.listFiles(
            (FilenameFilter) (dir, name) -> name.endsWith(".txt.gz"));
        if (files == null) {
            return new File[0];
        }
        Arrays.sort(files, (f1, f2) -> {
            var n1 = f1.getName();
            var n2 = f2.getName();
            var cmp = n1.substring(0, 4).compareTo(n2.substring(0, 4));
            return cmp != 0
                ? cmp
                : mapMonth(n1.substring(5))
                    - mapMonth(n2.substring(5));
        });
        return files;
    }

    /**
     * This method is the getter for the postings according to its id.
     *
     * @param key the mail id
     *
     * @return the mail found or {@code null} for none
     */
    public Posting get(String key) {

        return mail.get(key);
    }

    /**
     * The method <code>getBase</code> provides means to access the base
     * directory.
     *
     * @return the base directory
     */
    protected File getBase() {

        return base;
    }

    /**
     * This method checks whether postings for a package key have been read and
     * cached.
     *
     * @param pkg the key of the package
     *
     * @return {@code true} iff a posting for the package has been encountered
     */
    public boolean hasPostingForPackage(String pkg) {

        return mailByPkg.get(pkg.toLowerCase()) != null;
    }

    /**
     * The method <code>isEmpty</code> checks whether postings are contained.
     *
     * @return {@code true} iff no postings are contained
     */
    public boolean isEmpty() {

        return mail.isEmpty();
    }

    /**
     * The method <code>listByDate</code> returns a list sorted by date which
     * starts at a given date and contains at most a given number of items.
     *
     * @param start the start date
     * @param max the maximal number of items
     *
     * @return a list of postings
     */
    public List<Posting> listByDate(@NonNull LocalDateTime start, int max) {

        if (max < 1) {
            throw new IllegalArgumentException(
                "max is supposed to be positive but is " + max);
        }
        var l = 0;
        var r = mailByDate.size();
        Instant startInstant = start.toInstant(ZoneOffset.UTC);
        while (l < r) {
            var i = (l + r) / 2;
            if (startInstant.isBefore(mailByDate.get(i).getDate())) {
                r = i;
            } else {
                l = i + 1;
            }
        }
        r = l + max;
        if (r > mailByDate.size()) {
            r = mailByDate.size();
        }
        return mailByDate.subList(l, r);
    }

    /**
     * This method retrieves a list of postings associated to a package.
     *
     * @param pkg the key of the package
     * @param max the maximal length
     *
     * @return the requested list of postings. The list is a new list. It is
     *     sorted increasingly.
     */
    public List<Posting> listByPackage(@NonNull String pkg, int max) {

        if (max < 1) {
            throw new IllegalArgumentException(
                "max is supposed to be positive but is " + max);
        }
        var list = mailByPkg.get(pkg.toLowerCase());
        if (list == null) {
            return List.of();
        }
        var n = list.size();
        if (n > max) {
            list = new ArrayList<>(list.subList(n - max, n));
        } else {
            list = new ArrayList<>(list);
        }
        Collections.sort(list);
        return list;
    }

    /**
     * This method retrieves the newest postings up to a maximal number. If the
     * maximal number is reached then additional older postings are not
     * contained in the result.
     *
     * @param max the truncation limit
     *
     * @return the requested list of mails
     */
    @SuppressFBWarnings(value = "EI_EXPOSE_REP")
    public synchronized Iterable<Posting> listNewest(int max) {

        if (max < 1) {
            throw new IllegalArgumentException(
                "max is assumed to be positive instead of " + max);
        }
        if (mailNewest != null && mailNewestMax == max) {
            return mailNewest;
        }
        var size = mailByDate.size();
        var start = size - max;
        mailNewest = new ArrayList<>(mailByDate.subList(
            start > 0 ? start : 0, size));
        Collections.reverse(mailNewest);
        mailNewestMax = max;
        return mailNewest;
    }

    /**
     * This method retrieves the newest postings up to a maximal number for a
     * given package. If the maximal number is reached then additional older
     * postings are not contained in the result.
     *
     * @param pkg the CTAN name of the package
     * @param max the truncation limit
     *
     * @return the requested list of postings
     */
    public Iterable<Posting> listNewest(@NonNull String pkg, int max) {

        if (max < 1) {
            throw new IllegalArgumentException(
                "max is assumed to be positive instead of " + max);
        }
        var mails = new ArrayList<Posting>();
        int i = mailByDate.size();
        while (--i >= 0 && --max >= 0) {
            Posting it = mailByDate.get(i);
            if (!it.getPkg().contains(pkg)) {
                continue;
            }
            mails.add(it);
        }
        return mails.reversed();
    }

    /**
     * The method <code>listPaged</code> provides means to retrieve a paged list
     * of postings.
     *
     * @param page the start page
     * @param size the maximal number of items
     *
     * @return a list of postings
     */
    public List<Posting> listPaged(int page, int size) {

        if (page < 0) {
            throw new IllegalArgumentException(
                "page is assumed to be not negative instead of " + page);
        }
        if (size < 1) {
            throw new IllegalArgumentException(
                "size is assumed to be positive instead of " + size);
        }
        var l = page * size;
        return l >= mailByDate.size() - 1
            ? List.of()
            : mailByDate.reversed()
                .subList(l,
                    Math.min(l + size, mailByDate.size() - 1));
    }

    /**
     * This method provides a mapping from month name to the respective numeric
     * representation.
     *
     * @param s the name of the month
     *
     * @return a number 1 to 12 for the month January to December or 0 for an
     *     illegal argument
     */
    private int mapMonth(String s) {

        var mon = monthMap.get(s.substring(0, 2));
        return (mon != null ? mon : 0);
    }

    /**
     * This method is a getter for the keys of the packages.
     *
     * @return the set of keys of packages
     */
    public Set<String> packages() {

        return mailByPkg.keySet();
    }

    /**
     * This method add a mailing to the list of mailings for a package.
     *
     * @param pkg the name of the package
     * @param mail the mail
     */
    private void putPkg(String pkg, Posting mail) {

        pkg = pkg.toLowerCase();
        mail.addPkg(pkg);
        var list = mailByPkg.get(pkg);
        if (list == null) {
            list = new ArrayList<>();
            mailByPkg.put(pkg, list);
        }
        list.add(mail);
        mailByDate.add(mail);
    }

    /**
     * This method is the setter for the base directory. As a side effect all
     * previously cached mails are discarded and the new base directory is read.
     *
     * @param dir the new base directory
     * @throws IOException in case of an error
     */
    private synchronized void setBase(@NotNull File dir) throws IOException {

        if (!dir.isDirectory()) {
            throw new FileNotFoundException(
                "not a directory: " + dir.toString());
        }
        this.base = dir;
        mailByPkg = new HashMap<>();
        mailByDate = new ArrayList<>();
        fingerprints = new HashMap<>();
        mail = new HashMap<>();
        mailNewest = null;
        update();
    }

    /**
     * This method is a getter for the number of mails in the cache.
     *
     * @return the number of mails in the cache
     */
    public int size() {

        return mail.size();
    }

    /**
     * This method is a getter for the monthly count by package.
     *
     * @param pkg the key of the package
     *
     * @return the list of counts
     */
    public List<Integer[]> summaryCountByPkg(String pkg) {

        var result = new ArrayList<Integer[]>();
        var list = mailByPkg.get(pkg);
        if (list == null) {
            return result;
        }
        for (Posting p : list) {
            LocalDate date =
                p.getDate().atZone(ZoneId.systemDefault()).toLocalDate();
            var item = new Integer[3];
            item[0] = date.getYear();
            item[1] = date.getMonth().getValue();
            item[2] = date.getDayOfMonth();
            result.add(item);
        }
        return result;
    }

    /**
     * The method <code>total</code> provides means to retrieve the total number
     * of mails.
     *
     * @return total number of mails
     */
    public int total() {

        return mailByDate.size();
    }

    /**
     * This method scans all files in the base directory and reads in the new or
     * changed mail archive files.
     *
     * @throws IOException in case of an I/O error
     */
    public void update() throws IOException {

        for (File file : fileList()) {
            var fileName = file.getName();
            var len = fingerprints.get(fileName);
            var fileLength = file.length();
            if (len == null || fileLength != len) {
                fingerprints.put(fileName, fileLength);
                updateFile(file);
            }
        }
        Collections.sort(mailByDate);
    }

    /**
     * This method tries to update a single file.
     *
     * @param file the input file
     *
     * @throws IOException in case of an I/O error
     */
    private void updateFile(File file)
        throws IOException {

        if (file.length() == 0L) {
            return;
        }
        try (var pr = new PostingReader(
            new InputStreamReader(
                new GzipCompressorInputStream(
                    new FileInputStream(file)),
                StandardCharsets.UTF_8))) {
            for (var post = pr.read(); post != null; post = pr.read()) {
                var id = post.getId();
                if (mail.get(id) != null) {
                    continue;
                }
                mail.put(id, post);
                Matcher matcher;
                var body = post.getBody();
                if ((matcher = PKG_PATTERN.matcher(body)).matches()) {
                    putPkg(matcher.group(2), post);
                } else if ((matcher = BODY_PATTERN.matcher(body)).matches()) {
                    putPkg(matcher.group(2), post);
                } else if ((matcher =
                    SUBJECT_PATTERN.matcher(post.get("Subject")))
                        .matches()) {
                    putPkg(matcher.group(2), post);
                }
            }
        }
    }
}