ContentService.java

/*
 * Copyright © 2022-2025 The CTAN Team and individual authors
 *
 * This file is distributed under the 3-clause BSD license.
 * See file LICENSE for details.
 */
package org.ctan.site.services.content;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.ctan.markup.markdown.MarkdownRenderer;
import org.ctan.site.CtanConfiguration.ContentConfig;
import org.ctan.site.CtanConfiguration.CtanConfig;
import org.ctan.site.services.util.ConfigUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Builder.Default;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.NonNull;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;

/**
 * The class <code>ContentService</code> provides access to the content area,
 * where the pages and assets are stored.
 *
 * <h2>Pages</h2>
 *
 * <p>
 * The pages are HTML files. Only the title section and the body are extracted.
 * </p>
 *
 * <pre>
 * &lt;title&gt;the title&lt;/title&gt;
 * &lt;body&gt;
 *  the body
 * &lt;/body&gt;
 * </pre>
 *
 * <h2>Images</h2>
 *
 * <p>
 * The images are taken from the sub-directory <code>images/</code> of the base
 * directory.
 * </p>
 *
 * <h2>Configuration</h2>
 *
 * <pre>
 * ctan:
 *   languages:
 *     - en
 *     - de
 *  content:
 *    directory: /serv/www/www.ctan.org/ctan-content
 * </pre>
 *
 * @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
 */
@Slf4j
public class ContentService {

    /**
     * The class <code>ContentPageListTo</code> contains the transport object
     * for an entry in the page list. An entry describes either a single page
     * or a sub-directory. Getters, constructors, and a builder are generated
     * by Lombok.
     */
    @Getter
    @AllArgsConstructor
    @NoArgsConstructor
    @Builder
    @SuppressFBWarnings(value = "EI_EXPOSE_REP")
    public static class ContentPageListTo {

        /**
         * The field <code>name</code> contains the name of the entry. For a
         * page this is the file name without the trailing language extension;
         * depending on the producing method it is prefixed with the path of the
         * containing directory.
         */
        private String name;

        /**
         * The field <code>title</code> contains the page title. It is used as
         * the sort key when the entries of a directory are ordered.
         */
        private String title;

        /**
         * The field <code>lang</code> contains the locale, i.e. the language
         * code which is used as the extension of the page file.
         */
        private String lang;

        /**
         * The field <code>classification</code> contains the classification.
         * It is left out of the JSON representation if it is {@code null}.
         */
        @JsonInclude(Include.NON_NULL)
        private String classification;

        /**
         * The field <code>isDirectory</code> contains the indicator for
         * directories. It defaults to {@code false}, also when the builder is
         * used.
         *
         * <p>
         * NOTE(review): the field is a primitive and can never be
         * {@code null}; the <code>NON_NULL</code> inclusion rule presumably has
         * no effect here -- to be confirmed.
         * </p>
         */
        @Default
        @JsonInclude(Include.NON_NULL)
        private boolean isDirectory = false;
    }

    /**
     * The class <code>ContentPageTo</code> contains the transport object for a
     * page. It carries the extracted title and body of a page together with
     * optional meta information. Getters, constructors, and a builder are
     * generated by Lombok. The optional attributes are left out of the JSON
     * representation if they are {@code null}.
     */
    @Getter
    @Builder
    @NoArgsConstructor
    @AllArgsConstructor
    @SuppressFBWarnings(value = "EI_EXPOSE_REP")
    public static class ContentPageTo {

        /**
         * The field <code>name</code> contains the page name. This is the path
         * of the page starting with a slash; for a directory the name of the
         * index page is appended.
         */
        private String name;

        /**
         * The field <code>content</code> contains the content body as HTML.
         */
        private String content;

        /**
         * The field <code>title</code> contains the page title.
         */
        private String title;

        /**
         * The field <code>lang</code> contains the locale of the file which
         * has actually been used. This may differ from the requested locale if
         * a fallback language had to be used.
         */
        private String lang;

        /**
         * The field <code>icon</code> contains the optional icon.
         */
        @JsonInclude(Include.NON_NULL)
        private String icon;

        /**
         * The field <code>banner</code> contains the optional banner. The map
         * has the key <code>src</code> and optionally the key <code>by</code>
         * for the author of the banner.
         */
        @JsonInclude(Include.NON_NULL)
        private Map<String, String> banner;

        /**
         * The field <code>classification</code> contains the classification.
         */
        @JsonInclude(Include.NON_NULL)
        private String classification;

        /**
         * The field <code>children</code> contains the sub-pages. It is only
         * filled for directories or page lists. This is the only attribute
         * which can be modified after construction.
         */
        @Setter
        @JsonInclude(Include.NON_NULL)
        private List<ContentPageListTo> children;
    }

    /**
     * The class <code>ContentPageTreeTo</code> contains the transport object
     * for a page tree. A node represents either a directory -- possibly
     * described by its index page -- or a single page. Getters, constructors,
     * and a builder are generated by Lombok. The optional attributes are left
     * out of the JSON representation if they are {@code null}.
     */
    @Getter
    @Builder
    @NoArgsConstructor
    @AllArgsConstructor
    @SuppressFBWarnings(value = "EI_EXPOSE_REP")
    public static class ContentPageTreeTo {

        /**
         * The field <code>name</code> contains the page name. This is the path
         * relative to the directory of the pages; for a page the language
         * extension is removed.
         */
        private String name;

        /**
         * The field <code>title</code> contains the page title. For a
         * directory without an index page the path is used instead. The title
         * is used as the sort key for the children of a node.
         */
        private String title;

        /**
         * The field <code>description</code> contains the description. It is
         * taken from the meta element named <code>description</code>.
         */
        @JsonInclude(Include.NON_NULL)
        private String description;

        /**
         * The field <code>lang</code> contains the locale.
         */
        private String lang;

        /**
         * The field <code>icon</code> contains the optional icon.
         */
        @JsonInclude(Include.NON_NULL)
        private String icon;

        /**
         * The field <code>banner</code> contains the optional banner. The map
         * has the key <code>src</code> and optionally the key <code>by</code>
         * for the author of the banner.
         */
        @JsonInclude(Include.NON_NULL)
        private Map<String, String> banner;

        /**
         * The field <code>children</code> contains the sub-pages and
         * sub-directories. This is the only attribute which can be modified
         * after construction.
         */
        @Setter
        @JsonInclude(Include.NON_NULL)
        private List<ContentPageTreeTo> children;
    }

    /**
     * The enum <code>TeaserType</code> enumerates the kinds of teasers which
     * are supported. The string representation of a constant is a lower-case
     * label which is used as directory name when a teaser image is located.
     */
    public enum TeaserType {
        /**
         * The indicator for a package.
         */
        PKG("pkg"),
        /**
         * The indicator for a topic.
         */
        TOPIC("topic");

        /**
         * The field <code>label</code> contains the string representation of
         * the constant.
         */
        private final String label;

        /**
         * This is the constructor for the enum <code>TeaserType</code>.
         *
         * @param label the string representation
         */
        TeaserType(String label) {

            this.label = label;
        }

        @Override
        public String toString() {

            return label;
        }
    }

    /**
     * The field <code>FILE_TYPE</code> contains the mapping from extension to
     * MIME type. The extensions are given in lower case and without the
     * leading dot. The empty extension is contained as well; it is used for
     * names without an extension.
     */
    private static final Map<String, String> FILE_TYPE = Map.of(
        "png", "image/png",
        "gif", "image/gif",
        "jpg", "image/jpeg",
        "jpeg", "image/jpeg",
        "tif", "image/tiff",
        "tiff", "image/tiff",
        "eps", "application/postscript",
        "ps", "application/postscript",
        "", "text/plain");

    /**
     * The field <code>base</code> contains the content directory. It ends with
     * a slash. The sub-directories <code>page/</code>, <code>text/</code>,
     * <code>images/</code>, and <code>teasers/</code> are located beneath it.
     */
    private String base;

    /**
     * The field <code>languages</code> contains the supported languages. If the
     * given language has no associated file then the languages are tried in
     * turn until a file is found. The list is initialized from the CTAN
     * configuration in the constructor.
     */
    private List<String> languages;

    /**
     * The field <code>ctanConfig</code> contains the configuration. It is used
     * to determine the fallback language for a requested language.
     */
    private @NonNull CtanConfig ctanConfig;

    /**
     * This is the constructor for the class <code>ContentService</code>.
     *
     * <p>
     * The configured content directory is checked and stored in a normalized
     * form which ends with exactly one slash. The normalization is required
     * since relative page names are computed by cutting off a prefix of the
     * length of the base directory from normalized file paths. A configured
     * directory with a trailing slash would otherwise lead to names with a
     * missing first character.
     * </p>
     *
     * @param config the content configuration providing the content directory
     * @param ctanConfig the CTAN configuration
     * @throws IllegalArgumentException in case that the configured content
     *     directory is missing, empty, or not an existing directory
     */
    @SuppressFBWarnings(value = "CT_CONSTRUCTOR_THROW")
    public ContentService(@NonNull ContentConfig config,
        @NonNull CtanConfig ctanConfig) {

        var directory = config.getDirectory();
        if (directory == null || directory.isEmpty()
            || !new File(directory).isDirectory()) {
            throw new IllegalArgumentException(
                "ctan.content is not a directory");
        }
        // File.getPath() collapses duplicate separators and drops a trailing
        // one. Thus the base has the same shape as the paths of the files
        // which are constructed from it later on.
        var normalized = new File(directory).getPath();
        base = normalized.endsWith("/") || normalized.endsWith(File.separator)
            ? normalized // a file system root keeps its separator
            : normalized + "/";
        this.ctanConfig = ctanConfig;
        languages = Arrays.asList(ConfigUtils.languages(ctanConfig));
    }

    /**
     * The method <code>collectBanner</code> assembles the description of the
     * banner of a document. The source is stored under the key
     * <code>src</code>. If the document names an author of the banner then it
     * is stored under the key <code>by</code>.
     *
     * @param doc the Document
     * @return a Map for the banner or {@code null} if the document has no
     *     banner
     */
    private Map<String, String> collectBanner(Document doc) {

        var source = getRel(doc, "banner");
        if (source == null) {
            return null;
        }
        var author = getRel(doc, "banner-author");
        if (author != null) {
            return Map.of(
                "by", author,
                "src", source);
        }
        return Map.of("src", source);
    }

    /**
     * The method <code>findLocalizedFile</code> determines the language for
     * which a file exists. The requested language is preferred. If no file
     * exists for it then the supported languages are tried in their
     * configured order and the first one with an existing file wins.
     *
     * @param path the path to search for; it is relative to the base directory
     *     and does not contain the language extension
     * @param lang the language to try first
     * @return the language found or {@code null} for none
     */
    private String findLocalizedFile(String path, String lang) {

        if (new File(base + path + "." + lang).isFile()) {
            return lang;
        }
        return languages.stream()
            .filter(candidate -> !candidate.equals(lang))
            .filter(candidate -> new File(base + path + "." + candidate)
                .isFile())
            .findFirst()
            .orElse(null);
    }

    /**
     * The method <code>findLocalizedIndex</code> locates the index file of a
     * directory. The index file for the target locale is preferred. If it
     * does not exist then the supported languages are tried in their
     * configured order.
     *
     * @param path the path containing the index file
     * @param lang the target locale
     * @return the file or {@code null}
     */
    private File findLocalizedIndex(File path, String lang) {

        var preferred = new File(path, "index." + lang);
        if (preferred.isFile()) {
            return preferred;
        }
        return languages.stream()
            .filter(candidate -> !candidate.equals(lang))
            .map(candidate -> new File(path, "index." + candidate))
            .filter(File::isFile)
            .findFirst()
            .orElse(null);
    }

    /**
     * Read an image file from the sub-directory <code>images/</code> of the
     * base directory.
     *
     * @param path the path of the image relative to the images directory
     * @return the content of the image file
     * @throws IOException in case of an I/O error
     * @throws IllegalArgumentException in case that the path contains a
     *     segment <code>..</code>
     */
    public byte[] getImage(String path) throws IOException {

        if (!hasRelativePath(path)) {
            var image = Paths.get(base + "images/" + path);
            return Files.readAllBytes(image);
        }
        throw new IllegalArgumentException("relative path");
    }

    /**
     * Map the image name to the file type.
     *
     * <p>
     * The extension is taken from the last segment of the path only. Thus a
     * dot in a directory name is not mistaken for the start of the extension.
     * The extension is compared case-insensitively. A name without an
     * extension is mapped via the empty extension.
     * </p>
     *
     * @param path the path
     * @return the file type or {@code null} if the extension is unknown
     */
    public String getImageType(String path) {

        // lastIndexOf returns -1 if there is no slash; then the whole path
        // is the file name.
        var fileName = path.substring(path.lastIndexOf('/') + 1);
        int dot = fileName.lastIndexOf('.');
        var extension = dot < 0
            ? ""
            : fileName.substring(dot + 1).toLowerCase(Locale.ROOT);
        return FILE_TYPE.get(extension);
    }

    /**
     * The method <code>getMeta</code> looks up a meta header in a document.
     * The meta elements are inspected in document order and the content
     * attribute of the first one with the given name is returned.
     *
     * @param doc the document
     * @param name name of the attribute
     * @return the content or {@code null}
     */
    private String getMeta(Document doc, String name) {

        return doc.getElementsByTag("meta").stream()
            .map(meta -> meta.getElementsByAttributeValue("name", name))
            .filter(matches -> matches != null && matches.first() != null)
            .findFirst()
            .map(matches -> matches.first().attr("content"))
            .orElse(null);
    }

    /**
     * The method <code>getNamedValue</code> looks up the first element of a
     * document which has the given value in its attribute <code>name</code>
     * and returns the value of its attribute <code>content</code>.
     *
     * @param doc the document
     * @param name name of the element
     * @return the content or {@code null}
     */
    private String getNamedValue(Document doc, String name) {

        var matches = doc.getElementsByAttributeValue("name", name);
        if (matches == null) {
            return null;
        }
        var first = matches.first();
        if (first == null) {
            return null;
        }
        return first.attr("content");
    }

    /**
     * The method <code>getPage</code> loads a page from the sub-directory
     * <code>page/</code> of the base directory.
     *
     * <p>
     * If the path denotes a directory then the index page of the directory is
     * loaded and the entries of the directory are attached as children,
     * ordered by their title ignoring case. The page is searched for the
     * requested language first and then for the other supported languages.
     * </p>
     *
     * @param lang the ISO language code consisting of two lower-case letters
     * @param path the path in the content workspace; i.e. relative to the base
     *     directory
     * @return the page or {@code null}
     * @throws IOException in case of an I/O error
     * @throws IllegalArgumentException in case that the path contains a
     *     segment <code>..</code>
     */
    public ContentPageTo getPage(String lang, @NonNull String path)
        throws IOException {

        if (hasRelativePath(path)) {
            throw new IllegalArgumentException("relative path");
        }
        String preferred = ConfigUtils.fallbackLanguage(ctanConfig, lang);
        var pagePath = path.startsWith("/") ? path : "/" + path;
        List<ContentPageListTo> children = null;
        var directory = new File(base + "page" + pagePath);
        if (directory.isDirectory()) {
            if (!pagePath.endsWith("/")) {
                pagePath += "/";
            }
            children = getPageItems(directory, pagePath, preferred);
            children.sort((a, b) -> String.CASE_INSENSITIVE_ORDER
                .compare(a.getTitle(), b.getTitle()));
            // a directory is represented by its index page
            pagePath += "index";
        }
        var found = findLocalizedFile("page" + pagePath, preferred);
        if (found == null) {
            return null;
        }
        var doc = Jsoup.parse(new File(base + "page" + pagePath + "." + found));
        // the TeX logo entities are escaped during serialization; restore them
        var content = processHtml(doc)
            .html()
            .replaceAll(
                "&amp;(TeX|LaTeX|TeXLaTeX|LaTeXTeX|BibTeX|LaTeX2e|LaTeXe);",
                "&$1;");
        var title = doc.getElementsByTag("title").text();
        return ContentPageTo.builder()
            .name(pagePath)
            .lang(found)
            .title(title)
            .content(content)
            .icon(getRel(doc, "icon"))
            .banner(collectBanner(doc))
            .classification(getNamedValue(doc, "classification"))
            .children(children)
            .build();
    }

    /**
     * The method <code>getPageItems</code> provides means to retrieve the list
     * of entries in a directory for a given language.
     *
     * <p>
     * The result contains the pages first, i.e. the regular files with the
     * language as extension except the index page. Then the sub-directories
     * follow. The title of a sub-directory is taken from its index page for
     * the language; if there is none then the name of the directory is used.
     * A sub-directory whose index page can not be parsed is skipped with a
     * warning.
     * </p>
     *
     * <p>
     * Only regular files are parsed as pages. A directory whose name happens
     * to end with the language extension is treated as sub-directory only.
     * </p>
     *
     * @param file the directory to scan
     * @param path the path which is used as prefix for the names of the
     *     entries
     * @param lang the locale
     * @return the list of entries
     * @throws IOException in case of an I/O error; especially a
     *     {@link FileNotFoundException} if the directory can not be listed
     */
    private List<ContentPageListTo> getPageItems(File file, String path,
        String lang)
        throws IOException {

        // one listing serves both passes
        File[] entries = file.listFiles();
        if (entries == null) {
            throw new FileNotFoundException(file.getPath());
        }
        var suffix = "." + lang;
        var index = "index" + suffix;
        List<ContentPageListTo> items = new ArrayList<>();
        // first pass: the pages
        for (File f : entries) {
            var name = f.getName();
            if (!name.endsWith(suffix) || index.equals(name) || !f.isFile()) {
                continue;
            }
            var doc = Jsoup.parse(f);
            items.add(ContentPageListTo.builder()
                .name(
                    path + name.substring(0, name.length() - suffix.length()))
                .lang(lang)
                .title(doc.getElementsByTag("title").text())
                .build());
        }
        // second pass: the sub-directories
        for (File f : entries) {
            if (!f.isDirectory()) {
                continue;
            }
            var title = f.getName();
            var subIndex = new File(f, index);
            if (subIndex.exists()) {
                Document doc;
                try {
                    doc = Jsoup.parse(subIndex);
                } catch (IOException e) {
                    log.warn("Warning: Parsing failed for {}: {}", f.getName(),
                        e.getMessage());
                    continue;
                }
                title = doc.getElementsByTag("title").text();
            }
            items.add(ContentPageListTo.builder()
                .name(path + f.getName())
                .lang(lang)
                .title(title)
                .isDirectory(true)
                .build());
        }
        return items;
    }

    /**
     * The method <code>getPageList</code> produces the listing of a directory
     * beneath the sub-directory <code>page/</code> of the base directory.
     *
     * <p>
     * The entries are determined from the files with the extension
     * <code>.en</code>, except the index page, ordered by their name ignoring
     * case. For each entry the file for the requested language or for one of
     * the other supported languages is read. The result is the index page of
     * the directory -- or an empty page if there is none -- with the entries
     * attached as children.
     * </p>
     *
     * @param lang the ISO language code consisting of two lower-case letters
     * @param path the path in the content workspace; i.e. relative to the base
     *     directory
     * @return the page or {@code null}
     * @throws IOException in case of an I/O error
     * @throws IllegalArgumentException in case that the path contains a
     *     segment <code>..</code> or the language is malformed
     */
    public ContentPageTo getPageList(@NonNull String lang,
        @NonNull String path)
        throws IOException {

        if (hasRelativePath(path)) {
            throw new IllegalArgumentException("relative path");
        }
        if (!lang.matches("\\p{Lower}\\p{Lower}")) {
            throw new IllegalArgumentException("illegal lang");
        }
        var dir = new File(base + "page/" + path);
        if (!dir.isDirectory()) {
            return null;
        }
        // a non-empty path is used as prefix and has to end with a slash
        var prefix = path.isEmpty() || path.endsWith("/")
            ? path
            : path + "/";
        var candidates = dir.listFiles(f -> {
            var n = f.getName();
            return n.endsWith(".en") && !n.startsWith("index.");
        });
        if (candidates == null) {
            return null;
        }
        Arrays.sort(candidates, (a, b) -> String.CASE_INSENSITIVE_ORDER
            .compare(a.getName(), b.getName()));
        List<ContentPageListTo> entries = new ArrayList<>();
        for (var candidate : candidates) {
            var fileName = candidate.getName();
            // cut off the extension ".en"
            var stem = fileName.substring(0, fileName.length() - 3);
            var locale = findLocalizedFile("page/" + prefix + stem, lang);
            if (locale != null) {
                var doc = Jsoup.parse(new File(dir, stem + "." + locale));
                entries.add(ContentPageListTo.builder()
                    .name(stem)
                    .title(doc.getElementsByTag("title").text())
                    .lang(locale)
                    .classification(getNamedValue(doc, "classification"))
                    .build());
            }
        }
        var index = getPage(lang, prefix);
        var result = index != null ? index : new ContentPageTo();
        result.setChildren(entries);
        return result;
    }

    /**
     * The method <code>getPageTree</code> provides means to retrieve the tree
     * of pages for a directory.
     *
     * <p>
     * The children of the node are the sub-directories -- as long as the
     * maximal depth is not reached -- and the pages of the directory for the
     * language except the index page. The children are ordered by their title
     * ignoring case. The node itself is described by the index page of the
     * directory. If there is none then the path is used as title.
     * </p>
     *
     * <p>
     * A sub-directory which can not be listed is skipped with a warning.
     * Otherwise the missing sub-tree would end up as {@code null} in the list
     * of children and break the sorting.
     * </p>
     *
     * @param lang the language
     * @param dir the directory
     * @param depth the maximal depth; sub-directories are only visited if it
     *     is larger than 1
     * @return the tree or {@code null} if the directory can not be listed
     * @throws IOException in case of an I/O error
     */
    private ContentPageTreeTo getPageTree(String lang, File dir, int depth)
        throws IOException {

        List<ContentPageTreeTo> children = new ArrayList<>();
        if (depth > 1) {
            var subDirs = dir.listFiles(File::isDirectory);
            if (subDirs == null) {
                return null;
            }
            for (var d : subDirs) {
                var subTree = getPageTree(lang, d, depth - 1);
                if (subTree == null) {
                    log.warn("Skipping directory {} which can not be listed",
                        d.getPath());
                    continue;
                }
                children.add(subTree);
            }
        }
        // the names are relative to the directory "page" beneath the base;
        // thus the base and the four characters of "page" are cut off
        var prefixLength = base.length() + 4;
        var dotLang = "." + lang;
        var files = dir
            .listFiles((x) -> x.isFile()
                && x.getName().endsWith(dotLang)
                && !x.getName().equals("index" + dotLang));
        if (files == null) {
            return null;
        }
        for (var f : files) {
            var name = f.getPath()
                .substring(prefixLength)
                .replaceAll("\\.[a-z][a-z]$", "");
            var doc = Jsoup.parse(f);
            children.add(
                ContentPageTreeTo.builder()
                    .lang(lang)
                    .title(doc.getElementsByTag("title").text())
                    .description(getMeta(doc, "description"))
                    .icon(getRel(doc, "icon"))
                    .banner(collectBanner(doc))
                    .name(name)
                    .build());
        }
        Collections.sort(children,
            (a, b) -> a.title.compareToIgnoreCase(b.title));
        var path = dir.getPath().substring(prefixLength);
        var index = findLocalizedIndex(dir, lang);
        if (index != null) {
            var doc = Jsoup.parse(index);
            return ContentPageTreeTo.builder()
                .lang(lang)
                .title(doc.getElementsByTag("title").text())
                .description(getMeta(doc, "description"))
                .icon(getRel(doc, "icon"))
                .banner(collectBanner(doc))
                .name(path)
                .children(children)
                .build();
        }
        // without an index page the path has to serve as title
        return ContentPageTreeTo.builder()
            .name(path)
            .title(path)
            .lang(lang)
            .children(children)
            .build();
    }

    /**
     * The method <code>getPageTree</code> retrieves the tree of pages which
     * is rooted at a path beneath the sub-directory <code>page/</code> of the
     * base directory. The requested language is mapped to a supported language
     * before the tree is collected.
     *
     * @param lang the locale
     * @param path the path
     * @param depth the maximal depth
     * @return the tree
     * @throws IOException in case of an I/O error
     * @throws IllegalArgumentException in case that the path contains a
     *     segment <code>..</code>
     */
    public ContentPageTreeTo getPageTree(String lang, @NonNull String path,
        int depth)
        throws IOException {

        if (hasRelativePath(path)) {
            throw new IllegalArgumentException(
                "illegal relative path: " + path);
        }
        String language = ConfigUtils.fallbackLanguage(ctanConfig, lang);
        var rooted = path.startsWith("/") ? path : "/" + path;
        var root = new File(base + "page" + rooted);
        return getPageTree(language, root, depth);
    }

    /**
     * The method <code>getRel</code> looks up the first element of a document
     * which has the given value in its attribute <code>rel</code> and returns
     * the value of its attribute <code>content</code>.
     *
     * @param doc the document
     * @param name the value of the attribute <code>rel</code>
     * @return the content or {@code null}
     */
    private String getRel(Document doc, String name) {

        var matches = doc.getElementsByAttributeValue("rel", name);
        if (matches == null) {
            return null;
        }
        var first = matches.first();
        if (first == null) {
            return null;
        }
        return first.attr("content");
    }

    /**
     * The method <code>getTeaser</code> reads a teaser image. The image is
     * taken from the sub-directory <code>teasers/</code> of the base
     * directory, where it is stored as PNG file under the type and the fixed
     * size <code>600x200</code>.
     *
     * @param type the type of the teaser
     * @param key the key of the package or topic
     * @return the teaser image
     * @throws IOException in case of an I/O error
     * @throws IllegalArgumentException in case that the key contains a slash
     */
    public byte[] getTeaser(@NonNull TeaserType type, @NonNull String key)
        throws IOException {

        if (key.contains("/")) {
            throw new IllegalArgumentException(
                "illegal char '/' in key " + key);
        }
        var teaser = Paths
            .get(base + "teasers/" + type + "/600x200/" + key + ".png");
        return Files.readAllBytes(teaser);
    }

    /**
     * The method <code>getText</code> loads a text fragment from the
     * sub-directory <code>text/</code> of the base directory. The fragment is
     * searched for the requested language first and then for the other
     * supported languages. The first top-level heading is removed from the
     * body before it is delivered.
     *
     * @param path the path in the content workspace; i.e. relative to the base
     *     directory
     * @param lang the ISO language code consisting of two lower-case letters
     * @return the page or {@code null}
     * @throws IOException in case of an I/O error
     * @throws IllegalArgumentException in case that the path contains a
     *     segment <code>..</code>
     */
    public ContentPageTo getText(@NonNull String path, String lang)
        throws IOException {

        if (hasRelativePath(path)) {
            throw new IllegalArgumentException(
                "illegal relative path: " + path);
        }
        String preferred = ConfigUtils.fallbackLanguage(ctanConfig, lang);
        var textPath = path.startsWith("/") ? path : "/" + path;
        var found = findLocalizedFile("text" + textPath, preferred);
        if (found == null) {
            return null;
        }
        var doc = Jsoup.parse(new File(base + "text" + textPath + "." + found));
        var body = doc.getElementsByTag("body").get(0);
        var title = doc.getElementsByTag("title").html();
        var headings = body.getElementsByTag("h1");
        if (!headings.isEmpty()) {
            headings.get(0).remove();
        }
        // the TeX logo entities are escaped during serialization; restore them
        var content = body.html()
            .replaceAll(
                "&amp;(TeX|LaTeX|TeXLaTeX|LaTeXTeX|BibTeX|LaTeX2e|LaTeXe);",
                "&$1;");
        return ContentPageTo.builder()
            .name(textPath)
            .lang(found)
            .title(title)
            .content(content)
            .icon(getRel(doc, "icon"))
            .banner(collectBanner(doc))
            .classification(getNamedValue(doc, "classification"))
            .build();
    }

    /**
     * The method <code>hasRelativePath</code> recognizes a relative path. This
     * is a path which has a segment <code>..</code> -- delimited by slashes
     * or the ends of the path -- in order to navigate to a parent directory.
     *
     * <p>
     * Such a path must be rejected. Otherwise an attacker would be able to
     * leave the content directory and access the complete file system.
     * </p>
     *
     * @param path the path to check
     * @return {@code true} iff the path contains a segment ..
     */
    private boolean hasRelativePath(String path) {

        // the negative limit keeps trailing empty segments
        return Arrays.asList(path.split("/", -1)).contains("..");
    }

    /**
     * The method <code>hasTeaser</code> checks whether a teaser image exists
     * as regular file. The same location is inspected from which the teaser
     * images are read.
     *
     * @param type the type
     * @param key the key of the package or topic
     * @return {@code true} iff the teaser image could be found
     * @throws IllegalArgumentException in case that the key contains a slash
     */
    public boolean hasTeaser(TeaserType type, @NonNull String key) {

        if (key.contains("/")) {
            throw new IllegalArgumentException(
                "illegal char '/' in key " + key);
        }
        var teaser = new File(
            base + "teasers/" + type.toString() + "/600x200/" + key + ".png");
        return teaser.isFile();
    }

    /**
     * The method <code>processHtml</code> provides means to render the
     * document. Each element <code>markdown:render</code> is replaced by a
     * <code>div</code> which contains the HTML rendered from the text of the
     * element as Markdown.
     *
     * <p>
     * The attribute <code>class</code> is carried over to the
     * <code>div</code> only if the original element has one. The attribute
     * lookup returns the empty string for a missing attribute and never
     * {@code null}. Thus a test for {@code null} would attach an empty class
     * attribute to each rendered block.
     * </p>
     *
     * <p>
     * If the rendering fails then the error is logged and the element is left
     * untouched.
     * </p>
     *
     * @param doc the document
     * @return the body element
     */
    private Element processHtml(Document doc) {

        // the result of the lookup is a list of its own; thus the document
        // can be modified during the iteration
        var nodes = doc.getElementsByTag("markdown:render");
        for (var node : nodes) {
            var text = node.wholeText();
            var w = new StringWriter();
            try {
                new MarkdownRenderer(new StringReader(text), "").render(w);
                var to = Jsoup.parse(w.toString())
                    .getElementsByTag("body")
                    .get(0);
                to.tagName("div");
                if (node.hasAttr("class")) {
                    to.attr("class", node.attr("class"));
                }
                node.replaceWith(to);
            } catch (IOException e) {
                log.error("processsing skipped", e);
            }
        }
        return doc.body();
    }
}