ContentService.java
/*
* Copyright © 2022-2025 The CTAN Team and individual authors
*
* This file is distributed under the 3-clause BSD license.
* See file LICENSE for details.
*/
package org.ctan.site.services.content;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.ctan.markup.markdown.MarkdownRenderer;
import org.ctan.site.CtanConfiguration.ContentConfig;
import org.ctan.site.CtanConfiguration.CtanConfig;
import org.ctan.site.services.util.ConfigUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Builder.Default;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.NonNull;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
/**
* The class <code>ContentService</code> provides access to the content area,
* where the pages and assets are stored.
*
* <h2>Pages</h2>
*
* <p>
* The pages are HTML files. Only the title section and the body are extracted.
* </p>
*
* <pre>
* <title>the title</title>
* <body>
* the body
* </body>
* </pre>
*
* <h2>Images</h2>
*
* <p>
* The images are taken from the sub-directory <code>images/</code> of the base
* directory.
* </p>
*
* <h2>Configuration</h2>
*
* <pre>
* ctan:
* languages:
* - en
* - de
* content:
* directory: /serv/www/www.ctan.org/ctan-content
* </pre>
*
* @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
*/
@Slf4j
public class ContentService {
/**
* The class <code>ContentPageListTo</code> contains the transport object
* for an entry in the page list. An entry describes either a single page or
* a sub-directory of the page area. The accessors, the constructors, and the
* builder are generated by the Lombok annotations.
*
* <p>
* NOTE: the order of the fields determines the parameter order of the
* generated all-args constructor; do not reorder them casually.
* </p>
*/
@Getter
@AllArgsConstructor
@NoArgsConstructor
@Builder
@SuppressFBWarnings(value = "EI_EXPOSE_REP")
public static class ContentPageListTo {
/**
* The field <code>name</code> contains the name of the entry, i.e. the file
* name without the language extension. Depending on the producing method
* it is prefixed with the path of the containing directory.
*/
private String name;
/**
* The field <code>title</code> contains the page title. It is taken from
* the <code>title</code> element of the page. For a directory without an
* index page in the requested language the directory name is used.
*/
private String title;
/**
* The field <code>lang</code> contains the language code of the entry.
*/
private String lang;
/**
* The field <code>classification</code> contains the optional
* classification. It is omitted from the JSON output when it is
* {@code null}.
*/
@JsonInclude(Include.NON_NULL)
private String classification;
/**
* The field <code>isDirectory</code> contains the indicator for
* directories. It is {@code true} for an entry denoting a sub-directory
* and defaults to {@code false}.
*
* NOTE(review): the field is a primitive and thus never {@code null}; the
* NON_NULL inclusion presumably has no effect here -- confirm.
*/
@Default
@JsonInclude(Include.NON_NULL)
private boolean isDirectory = false;
}
/**
* The class <code>ContentPageTo</code> contains the transport object for a
* page or a text fragment. The accessors, the constructors, and the builder
* are generated by the Lombok annotations. Fields annotated with
* {@code @JsonInclude(Include.NON_NULL)} are omitted from the JSON output
* when they are {@code null}.
*
* <p>
* NOTE: the order of the fields determines the parameter order of the
* generated all-args constructor; do not reorder them casually.
* </p>
*/
@Getter
@Builder
@NoArgsConstructor
@AllArgsConstructor
@SuppressFBWarnings(value = "EI_EXPOSE_REP")
public static class ContentPageTo {
/**
* The field <code>name</code> contains the page name. This is the path of
* the page with a leading slash and without the language extension.
*/
private String name;
/**
* The field <code>content</code> contains the content body as HTML.
*/
private String content;
/**
* The field <code>title</code> contains the page title taken from the
* <code>title</code> element of the page.
*/
private String title;
/**
* The field <code>lang</code> contains the locale of the file which has
* actually been used. It may differ from the requested language if a
* fallback language had to be used.
*/
private String lang;
/**
* The field <code>icon</code> contains the optional icon. It is read from
* the <code>content</code> attribute of the first element with the
* attribute <code>rel="icon"</code>.
*/
@JsonInclude(Include.NON_NULL)
private String icon;
/**
* The field <code>banner</code> contains the optional banner. The map
* has the key <code>src</code> for the banner itself and optionally the
* key <code>by</code> for the author of the banner.
*/
@JsonInclude(Include.NON_NULL)
private Map<String, String> banner;
/**
* The field <code>classification</code> contains the optional
* classification.
*/
@JsonInclude(Include.NON_NULL)
private String classification;
/**
* The field <code>children</code> contains the sub-pages. This is the
* only field with a setter; the list is attached after the page itself
* has been created.
*/
@Setter
@JsonInclude(Include.NON_NULL)
private List<ContentPageListTo> children;
}
/**
* The class <code>ContentPageTreeTo</code> contains the transport object
* for a node in a page tree. A node is either a single page or a directory
* with its children. The accessors, the constructors, and the builder are
* generated by the Lombok annotations. Fields annotated with
* {@code @JsonInclude(Include.NON_NULL)} are omitted from the JSON output
* when they are {@code null}.
*
* <p>
* NOTE: the order of the fields determines the parameter order of the
* generated all-args constructor; do not reorder them casually.
* </p>
*/
@Getter
@Builder
@NoArgsConstructor
@AllArgsConstructor
@SuppressFBWarnings(value = "EI_EXPOSE_REP")
public static class ContentPageTreeTo {
/**
* The field <code>name</code> contains the page name. This is the path
* relative to the page area without the language extension.
*/
private String name;
/**
* The field <code>title</code> contains the page title. For a directory
* without an index page the path is used as title.
*/
private String title;
/**
* The field <code>description</code> contains the optional description.
* It is read from the <code>meta</code> element named
* <code>description</code>.
*/
@JsonInclude(Include.NON_NULL)
private String description;
/**
* The field <code>lang</code> contains the locale.
*/
private String lang;
/**
* The field <code>icon</code> contains the optional icon.
*/
@JsonInclude(Include.NON_NULL)
private String icon;
/**
* The field <code>banner</code> contains the optional banner. The map
* has the key <code>src</code> for the banner itself and optionally the
* key <code>by</code> for the author of the banner.
*/
@JsonInclude(Include.NON_NULL)
private Map<String, String> banner;
/**
* The field <code>children</code> contains the sub-pages and
* sub-directories sorted by title.
*/
@Setter
@JsonInclude(Include.NON_NULL)
private List<ContentPageTreeTo> children;
}
/**
 * The enum <code>TeaserType</code> lists the kinds of teasers which are
 * supported. The string representation of a constant is the lower-case key
 * which is used as directory name for the teaser images.
 */
public enum TeaserType {
    /**
     * The indicator for a package.
     */
    PKG("pkg"),
    /**
     * The indicator for a topic.
     */
    TOPIC("topic");

    /**
     * The field <code>key</code> contains the external name of the type.
     */
    private final String key;

    /**
     * This is the constructor for the enum <code>TeaserType</code>.
     *
     * @param key the external name of the type
     */
    TeaserType(String key) {
        this.key = key;
    }

    @Override
    public String toString() {
        return key;
    }
}
/**
* The field <code>FILE_TYPE</code> contains the mapping from extension to
* MIME type. The extension is given in lower case and without the leading
* dot. The empty extension is mapped to plain text. The map is created with
* {@code Map.of} and thus can not be modified.
*/
private static final Map<String, String> FILE_TYPE = Map.of(
"png", "image/png",
"gif", "image/gif",
"jpg", "image/jpeg",
"jpeg", "image/jpeg",
"tif", "image/tiff",
"tiff", "image/tiff",
"eps", "application/postscript",
"ps", "application/postscript",
"", "text/plain");
/**
* The field <code>base</code> contains the content directory. It ends with
* a slash. The sub-directories <code>page/</code>, <code>text/</code>,
* <code>images/</code>, and <code>teasers/</code> are addressed by
* appending to this value.
*/
private String base;
/**
* The field <code>languages</code> contains the supported languages. If the
* given language has no associated file then the languages are tried in
* turn until a file is found.
*/
private List<String> languages;
/**
* The field <code>ctanConfig</code> contains the configuration. It is used
* to determine the supported languages and the fallback language.
*/
private @NonNull CtanConfig ctanConfig;
/**
 * This is the constructor for the class <code>ContentService</code>.
 *
 * <p>
 * The configured content directory is normalised before it is stored:
 * duplicate and trailing separators are removed and exactly one slash is
 * appended. Other methods strip the base from {@link File#getPath()}
 * values by its length, so the stored base must have the same form as the
 * paths produced by {@link File}.
 * </p>
 *
 * @param config the content configuration
 * @param ctanConfig the CTAN configuration
 * @throws IllegalArgumentException if the configured content directory is
 *     missing, empty, or not an existing directory
 */
@SuppressFBWarnings(value = "CT_CONSTRUCTOR_THROW")
public ContentService(@NonNull ContentConfig config,
        @NonNull CtanConfig ctanConfig) {
    String directory = config.getDirectory();
    if (directory == null || directory.isEmpty()
            || !new File(directory).isDirectory()) {
        throw new IllegalArgumentException(
            "ctan.content is not a directory");
    }
    // File.getPath() yields the normalised form, e.g. "/a//b/" -> "/a/b".
    // Only the file system root keeps its trailing slash.
    var normalized = new File(directory).getPath();
    base = normalized.endsWith("/") ? normalized : normalized + "/";
    this.ctanConfig = ctanConfig;
    languages = Arrays.asList(ConfigUtils.languages(ctanConfig));
}
/**
 * Collects the banner information of a document.
 *
 * <p>
 * The banner is read from the element with <code>rel="banner"</code> and
 * the optional author from the element with
 * <code>rel="banner-author"</code>.
 * </p>
 *
 * @param doc the document
 * @return an immutable map with the key <code>src</code> and optionally
 *     the key <code>by</code>, or {@code null} if the document has no
 *     banner
 */
private Map<String, String> collectBanner(Document doc) {
    var source = getRel(doc, "banner");
    if (source == null) {
        return null;
    }
    var author = getRel(doc, "banner-author");
    if (author != null) {
        return Map.of("by", author, "src", source);
    }
    return Map.of("src", source);
}
/**
 * Determines the language in which a file is available. The requested
 * language is checked first. If no file exists for it then the supported
 * languages are checked in their configured order and the first one with
 * an existing file wins.
 *
 * @param path the path relative to the base directory, without the
 *     language extension
 * @param lang the language to try first
 * @return the language found or {@code null} for none
 */
private String findLocalizedFile(String path, String lang) {
    var prefix = base + path + ".";
    if (new File(prefix + lang).isFile()) {
        return lang;
    }
    return languages.stream()
        // the requested language has already been checked
        .filter(candidate -> !candidate.equals(lang))
        .filter(candidate -> new File(prefix + candidate).isFile())
        .findFirst()
        .orElse(null);
}
/**
 * Searches the index file of a directory. The file
 * <code>index.&lt;lang&gt;</code> for the target locale is preferred.
 * Otherwise the supported languages are tried in their configured order.
 *
 * @param path the directory containing the index file
 * @param lang the target locale
 * @return the index file or {@code null} if none exists
 */
private File findLocalizedIndex(File path, String lang) {
    var preferred = new File(path, "index." + lang);
    if (preferred.isFile()) {
        return preferred;
    }
    for (var candidate : languages) {
        // the target locale has already been checked
        if (!candidate.equals(lang)) {
            var fallback = new File(path, "index." + candidate);
            if (fallback.isFile()) {
                return fallback;
            }
        }
    }
    return null;
}
/**
 * Reads an image from the sub-directory <code>images/</code> of the base
 * directory.
 *
 * @param path the path of the image relative to the image directory
 * @return the bytes of the image
 * @throws IllegalArgumentException if the path contains a segment ..
 * @throws IOException in case of an I/O error
 */
public byte[] getImage(String path) throws IOException {
    if (!hasRelativePath(path)) {
        var location = Paths.get(base + "images/" + path);
        return Files.readAllBytes(location);
    }
    throw new IllegalArgumentException("relative path");
}
/**
 * Map the image name to the file type.
 *
 * <p>
 * The extension is taken from the last segment of the path only, so a dot
 * in a directory name is not mistaken for an extension. The extension is
 * compared case-insensitively. A file name without a dot is treated as
 * having the empty extension.
 * </p>
 *
 * @param path the path
 * @return the MIME type or {@code null} if the extension is unknown
 */
public String getImageType(String path) {
    var fileName = path.substring(path.lastIndexOf('/') + 1);
    int dot = fileName.lastIndexOf('.');
    var extension = dot < 0
        ? ""
        // Locale.ROOT avoids locale-specific case mappings
        : fileName.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);
    return FILE_TYPE.get(extension);
}
/**
 * Retrieves the content of a named <code>meta</code> element of a
 * document. The <code>meta</code> elements are visited in document order
 * and the first one matching the name wins.
 *
 * @param doc the document
 * @param name the value of the <code>name</code> attribute to look for
 * @return the value of the <code>content</code> attribute or {@code null}
 *     if no matching element exists
 */
private String getMeta(Document doc, String name) {
    for (var meta : doc.getElementsByTag("meta")) {
        var matches = meta.getElementsByAttributeValue("name", name);
        var hit = matches == null ? null : matches.first();
        if (hit != null) {
            return hit.attr("content");
        }
    }
    return null;
}
/**
 * Retrieves the content of the first element of a document carrying a
 * given <code>name</code> attribute. In contrast to <code>getMeta</code>
 * the element is not restricted to a certain tag.
 *
 * @param doc the document
 * @param name the value of the <code>name</code> attribute to look for
 * @return the value of the <code>content</code> attribute or {@code null}
 *     if no matching element exists
 */
private String getNamedValue(Document doc, String name) {
    var matches = doc.getElementsByAttributeValue("name", name);
    if (matches == null) {
        return null;
    }
    var hit = matches.first();
    return hit != null ? hit.attr("content") : null;
}
/**
 * The method <code>getPage</code> provides means to retrieve a page.
 *
 * <p>
 * If the path denotes a directory then the index page of the directory is
 * delivered and the entries of the directory are attached as children
 * sorted by title. If the page is not available in the requested language
 * then the supported languages are tried in turn.
 * </p>
 *
 * @param lang the ISO language code consisting of two lower-case letters
 * @param path the path in the content workspace; i.e. relative to the base
 *     directory
 * @return the page or {@code null} if no file exists in any supported
 *     language
 * @throws IllegalArgumentException if the path contains a segment ..
 * @throws IOException in case of an I/O error
 */
public ContentPageTo getPage(String lang, @NonNull String path)
        throws IOException {
    if (hasRelativePath(path)) {
        throw new IllegalArgumentException("relative path");
    }
    lang = ConfigUtils.fallbackLanguage(ctanConfig, lang);
    List<ContentPageListTo> items = null;
    if (!path.startsWith("/")) {
        path = "/" + path;
    }
    var name = base + "page" + path;
    var file = new File(name);
    if (file.isDirectory()) {
        if (!path.endsWith("/")) {
            path = path + "/";
        }
        items = getPageItems(file, path, lang);
        items.sort((a, b) -> a.title.compareToIgnoreCase(b.title));
        // a directory is represented by its index page
        path = path + "index";
        name = base + "page" + path;
    }
    lang = findLocalizedFile("page" + path, lang);
    if (lang == null) {
        return null;
    }
    name = name + "." + lang;
    var doc = Jsoup.parse(new File(name));
    // The TeX entities are unknown to jsoup. Thus the ampersand is kept as
    // text and escaped on output, e.g. as "&amp;TeX;". This escaping is
    // undone here so that the client receives "&TeX;" again.
    var body = processHtml(doc)
        .html()
        .replaceAll(
            "&amp;(TeX|LaTeX|TeXLaTeX|LaTeXTeX|BibTeX|LaTeX2e|LaTeXe);",
            "&$1;");
    return ContentPageTo.builder()
        .name(path)
        .lang(lang)
        .title(doc.getElementsByTag("title").text())
        .content(body)
        .icon(getRel(doc, "icon"))
        .banner(collectBanner(doc))
        .classification(getNamedValue(doc, "classification"))
        .children(items)
        .build();
}
/**
 * The method <code>getPageItems</code> provides means to retrieve the list
 * of entries in a directory for a given language.
 *
 * <p>
 * The result contains one entry for each regular file with the extension
 * of the language -- except the index file -- and one entry for each
 * sub-directory. The title of a sub-directory is taken from its index file
 * in the given language; if there is none the directory name is used. A
 * sub-directory with an unreadable index file is skipped with a warning.
 * </p>
 *
 * @param file the directory to scan
 * @param path the path used as prefix for the names of the entries
 * @param lang the locale
 * @return the list of entries
 * @throws FileNotFoundException if the directory can not be listed
 * @throws IOException in case of an I/O error
 */
private List<ContentPageListTo> getPageItems(File file, String path,
        String lang)
        throws IOException {
    List<ContentPageListTo> items = new ArrayList<>();
    var suffix = "." + lang;
    var index = "index" + suffix;
    File[] pages = file.listFiles((d, f) -> f.endsWith(suffix));
    if (pages == null) {
        throw new FileNotFoundException(file.getPath());
    }
    for (File page : pages) {
        var name = page.getName();
        // A directory may carry a name ending with the language suffix. It
        // must not be parsed as a page; the second pass lists it.
        if (index.equals(name) || !page.isFile()) {
            continue;
        }
        var doc = Jsoup.parse(page);
        items.add(ContentPageListTo.builder()
            .name(path + name.substring(0, name.length() - suffix.length()))
            .lang(lang)
            .title(doc.getElementsByTag("title").text())
            .build());
    }
    File[] entries = file.listFiles();
    if (entries == null) {
        throw new FileNotFoundException(file.getPath());
    }
    for (File entry : entries) {
        if (!entry.isDirectory()) {
            continue;
        }
        var title = entry.getName();
        var subIndex = new File(entry, index);
        if (subIndex.exists()) {
            Document doc;
            try {
                doc = Jsoup.parse(subIndex);
            } catch (IOException e) {
                log.warn("Warning: Parsing failed for {}: {}",
                    entry.getName(), e.getMessage());
                continue;
            }
            title = doc.getElementsByTag("title").text();
        }
        items.add(ContentPageListTo.builder()
            .name(path + entry.getName())
            .lang(lang)
            .title(title)
            .isDirectory(true)
            .build());
    }
    return items;
}
/**
 * Retrieves the listing of a directory in the page area.
 *
 * <p>
 * The pages of the directory are determined from the files with the
 * extension <code>.en</code>; index files are left out. For each of them
 * the best matching localized file is consulted for title and
 * classification. The entries are ordered by file name ignoring case. They
 * are attached as children to the page of the directory itself. If the
 * directory has no page then an empty page carries the children.
 * </p>
 *
 * @param lang the ISO language code consisting of two lower-case letters
 * @param path the path in the content workspace; i.e. relative to the base
 *     directory
 * @return the page with the listing or {@code null} if the path does not
 *     denote a directory which can be listed
 * @throws IllegalArgumentException if the path contains a segment .. or
 *     the language does not consist of two lower-case letters
 * @throws IOException in case of an I/O error
 */
public ContentPageTo getPageList(@NonNull String lang,
        @NonNull String path)
        throws IOException {
    if (hasRelativePath(path)) {
        throw new IllegalArgumentException("relative path");
    }
    if (!lang.matches("\\p{Lower}\\p{Lower}")) {
        throw new IllegalArgumentException("illegal lang");
    }
    var dirName = base + "page/" + path;
    var directory = new File(dirName);
    if (!directory.isDirectory()) {
        return null;
    }
    // a non-empty path is used as prefix and thus has to end with a slash
    var prefix = path.isEmpty() || path.endsWith("/") ? path : path + "/";
    File[] candidates = directory.listFiles(f -> {
        var n = f.getName();
        return n.endsWith(".en") && !n.startsWith("index.");
    });
    if (candidates == null) {
        return null;
    }
    Arrays.sort(candidates,
        (a, b) -> a.getName().compareToIgnoreCase(b.getName()));
    List<ContentPageListTo> entries = new ArrayList<>();
    for (File candidate : candidates) {
        var fullName = candidate.getName();
        var stem = fullName.substring(0, fullName.length() - ".en".length());
        var locale = findLocalizedFile("page/" + prefix + stem, lang);
        if (locale != null) {
            var localized = new File(dirName + "/" + stem + "." + locale);
            var doc = Jsoup.parse(localized);
            entries.add(ContentPageListTo.builder()
                .name(stem)
                .title(doc.getElementsByTag("title").text())
                .lang(locale)
                .classification(getNamedValue(doc, "classification"))
                .build());
        }
    }
    var page = getPage(lang, prefix);
    if (page == null) {
        page = new ContentPageTo();
    }
    page.setChildren(entries);
    return page;
}
/**
 * The method <code>getPageTree</code> provides means to retrieve the tree
 * of pages below a directory.
 *
 * <p>
 * The children of the node are the sub-directories -- as long as the
 * maximal depth is not exhausted -- and the pages of the directory in the
 * given language except the index page. The children are sorted by title.
 * The node itself is described by the localized index page of the
 * directory. If there is none then the path serves as title. A
 * sub-directory which can not be listed is left out.
 * </p>
 *
 * @param lang the language
 * @param dir the directory
 * @param depth the maximal depth; sub-directories are only visited if the
 *     value is larger than 1
 * @return the tree or {@code null} if the directory can not be listed
 * @throws IOException in case of an I/O error
 */
private ContentPageTreeTo getPageTree(String lang, File dir, int depth)
        throws IOException {
    List<ContentPageTreeTo> children = new ArrayList<>();
    if (depth > 1) {
        var subDirs = dir.listFiles((f) -> f.isDirectory());
        if (subDirs == null) {
            return null;
        }
        for (var d : subDirs) {
            var child = getPageTree(lang, d, depth - 1);
            // null signals an unreadable directory; adding it would break
            // the sorting below
            if (child != null) {
                children.add(child);
            }
        }
    }
    var dotLang = "." + lang;
    var files = dir
        .listFiles((x) -> x.isFile()
            && x.getName().endsWith(dotLang)
            && !x.getName().equals("index" + dotLang));
    if (files == null) {
        return null;
    }
    for (var f : files) {
        // strip the base directory and the leading "page" (4 characters)
        // as well as the trailing language extension
        var name = f.getPath()
            .substring(base.length() + 4)
            .replaceAll("\\.[a-z][a-z]$", "");
        var doc = Jsoup.parse(f);
        children.add(
            ContentPageTreeTo.builder()
                .lang(lang)
                .title(doc.getElementsByTag("title").text())
                .description(getMeta(doc, "description"))
                .icon(getRel(doc, "icon"))
                .banner(collectBanner(doc))
                .name(name)
                .build());
    }
    children.sort((a, b) -> a.title.compareToIgnoreCase(b.title));
    var path = dir.getPath().substring(base.length() + 4);
    var index = findLocalizedIndex(dir, lang);
    if (index != null) {
        var doc = Jsoup.parse(index);
        return ContentPageTreeTo.builder()
            .lang(lang)
            .title(doc.getElementsByTag("title").text())
            .description(getMeta(doc, "description"))
            .icon(getRel(doc, "icon"))
            .banner(collectBanner(doc))
            .name(path)
            .children(children)
            .build();
    }
    return ContentPageTreeTo.builder()
        .name(path)
        .title(path)
        .lang(lang)
        .children(children)
        .build();
}
/**
 * Retrieves the tree of pages below a path in the page area. The language
 * is mapped to a supported one and the path is interpreted relative to
 * the page directory.
 *
 * @param lang the locale
 * @param path the path
 * @param depth the maximal depth
 * @return the tree
 * @throws IllegalArgumentException if the path contains a segment ..
 * @throws IOException in case of an I/O error
 */
public ContentPageTreeTo getPageTree(String lang, @NonNull String path,
        int depth)
        throws IOException {
    if (hasRelativePath(path)) {
        throw new IllegalArgumentException(
            "illegal relative path: " + path);
    }
    String locale = ConfigUtils.fallbackLanguage(ctanConfig, lang);
    String absolute = path.startsWith("/") ? path : "/" + path;
    File root = new File(base + "page" + absolute);
    return getPageTree(locale, root, depth);
}
/**
 * Retrieves the content of the first element of a document carrying a
 * given <code>rel</code> attribute.
 *
 * @param doc the document
 * @param name the value of the <code>rel</code> attribute to look for
 * @return the value of the <code>content</code> attribute or {@code null}
 *     if no matching element exists
 */
private String getRel(Document doc, String name) {
    var matches = doc.getElementsByAttributeValue("rel", name);
    if (matches == null) {
        return null;
    }
    var hit = matches.first();
    return hit != null ? hit.attr("content") : null;
}
/**
 * The method <code>getTeaser</code> reads a teaser image. The image is
 * taken from the file
 * <code>teasers/&lt;type&gt;/600x200/&lt;key&gt;.png</code> below the base
 * directory.
 *
 * @param type the type of the teaser
 * @param key the key of the package or topic
 * @return the bytes of the teaser image
 * @throws IllegalArgumentException if the key contains a slash
 * @throws IOException in case of an I/O error
 */
public byte[] getTeaser(@NonNull TeaserType type, @NonNull String key)
        throws IOException {
    if (key.contains("/")) {
        throw new IllegalArgumentException(
            "illegal char '/' in key " + key);
    }
    var image = Paths.get(
        base + "teasers/" + type.toString() + "/600x200/" + key + ".png");
    return Files.readAllBytes(image);
}
/**
 * The method <code>getText</code> provides means to retrieve a text
 * fragment from the sub-directory <code>text/</code> of the base
 * directory.
 *
 * <p>
 * If the fragment is not available in the requested language then the
 * supported languages are tried in turn. The first <code>h1</code> element
 * is removed from the body before it is delivered.
 * </p>
 *
 * @param path the path in the content workspace; i.e. relative to the base
 *     directory
 * @param lang the ISO language code consisting of two lower-case letters
 * @return the page or {@code null} if no file exists in any supported
 *     language
 * @throws IllegalArgumentException if the path contains a segment ..
 * @throws IOException in case of an I/O error
 */
public ContentPageTo getText(@NonNull String path, String lang)
        throws IOException {
    if (hasRelativePath(path)) {
        throw new IllegalArgumentException(
            "illegal relative path: " + path);
    }
    lang = ConfigUtils.fallbackLanguage(ctanConfig, lang);
    if (!path.startsWith("/")) {
        path = "/" + path;
    }
    var name = base + "text" + path;
    lang = findLocalizedFile("text" + path, lang);
    if (lang == null) {
        return null;
    }
    name = name + "." + lang;
    var doc = Jsoup.parse(new File(name));
    var body = doc.getElementsByTag("body").get(0);
    var title = doc.getElementsByTag("title").html();
    var h1 = body.getElementsByTag("h1");
    if (!h1.isEmpty()) {
        h1.get(0).remove();
    }
    // The TeX entities are unknown to jsoup. Thus the ampersand is kept as
    // text and escaped on output, e.g. as "&amp;TeX;". This escaping is
    // undone here so that the client receives "&TeX;" again.
    var content = body.html()
        .replaceAll(
            "&amp;(TeX|LaTeX|TeXLaTeX|LaTeXTeX|BibTeX|LaTeX2e|LaTeXe);",
            "&$1;");
    return ContentPageTo.builder()
        .name(path)
        .lang(lang)
        .title(title)
        .content(content)
        .icon(getRel(doc, "icon"))
        .banner(collectBanner(doc))
        .classification(getNamedValue(doc, "classification"))
        .build();
}
/**
 * The method <code>hasRelativePath</code> provides means to identify a
 * relative path. This is a path containing .. in order to navigate to
 * parent directories.
 *
 * <p>
 * Allowing such a path would open a security hole which would allow the
 * attacker to access the complete file system.
 * </p>
 *
 * <p>
 * The path is split into segments at slashes as well as backslashes, since
 * a backslash acts as separator on some platforms.
 * </p>
 *
 * @param path the path to check
 * @return {@code true} iff the path contains a segment ..
 */
private boolean hasRelativePath(String path) {
    // the limit -1 keeps trailing empty segments; they are harmless
    for (String segment : path.split("[/\\\\]", -1)) {
        if ("..".equals(segment)) {
            return true;
        }
    }
    return false;
}
/**
 * Checks whether a teaser image exists. The image is expected in the file
 * <code>teasers/&lt;type&gt;/600x200/&lt;key&gt;.png</code> below the base
 * directory.
 *
 * @param type the type
 * @param key the key of the package or topic
 * @return {@code true} iff the teaser image could be found
 * @throws IllegalArgumentException if the key contains a slash
 */
public boolean hasTeaser(TeaserType type, @NonNull String key) {
    if (key.contains("/")) {
        throw new IllegalArgumentException(
            "illegal char '/' in key " + key);
    }
    var location =
        base + "teasers/" + type.toString() + "/600x200/" + key + ".png";
    return new File(location).isFile();
}
/**
 * Renders the embedded Markdown sections of a document. Each element
 * <code>markdown:render</code> is replaced by a <code>div</code> element
 * containing the rendered text. The <code>class</code> attribute of the
 * original element is carried over. If the rendering fails with an I/O
 * error then the element is left untouched and the error is logged.
 *
 * @param doc the document; it is modified in place
 * @return the body element of the document
 */
private Element processHtml(Document doc) {
    for (var markdown : doc.getElementsByTag("markdown:render")) {
        var rendered = new StringWriter();
        try {
            var source = new StringReader(markdown.wholeText());
            new MarkdownRenderer(source, "").render(rendered);
            // the rendered fragment is parsed as a document of its own;
            // its body becomes the replacement element
            var replacement = Jsoup.parse(rendered.toString())
                .getElementsByTag("body")
                .get(0);
            replacement.tagName("div");
            var cssClass = markdown.attr("class");
            if (cssClass != null) {
                replacement.attr("class", cssClass);
            }
            markdown.replaceWith(replacement);
        } catch (IOException e) {
            log.error("processsing skipped", e);
        }
    }
    return doc.body();
}
}