ArchiveFilesUpdateService.java
/*
* Copyright © 2025 The CTAN Team and individual authors
*
* This file is distributed under the 3-clause BSD license.
* See file LICENSE for details.
*/
package org.ctan.site.services.texarchive;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.ctan.site.CtanConfiguration;
import org.ctan.site.CtanConfiguration.TexArchiveConfig;
import org.ctan.site.domain.archive.ArchiveFile;
import org.ctan.site.domain.archive.ArchiveFile.FileType;
import org.ctan.site.services.upload.util.archive.Archive;
import org.ctan.site.services.upload.util.archive.Archive.Entry;
import org.ctan.site.stores.ArchiveFileStore;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import minitex.InsParser;
import minitex.MiniTexParser;
/**
* The class <code>ArchiveFilesUpdateService</code> contains the service for
* scanning the TeX archive directory and updating the database and search index
* accordingly.
*
* @author <a href="mailto:gene@ctan.org">Gerd Neugebauer</a>
*/
@Slf4j
public class ArchiveFilesUpdateService {
/**
 * The field <code>base</code> contains the base directory of the TeX
 * archive. It is assigned once in the constructor.
 */
private final File base;
/**
 * The field <code>strip</code> contains the length of the initial segment
 * to be stripped from a file's path to obtain the path relative to the
 * base directory.
 */
private final int strip;
/**
 * The field <code>store</code> contains the store used to read, save, and
 * remove archive file entries.
 */
private final ArchiveFileStore store;
/**
 * This is the constructor for <code>ArchiveFilesUpdateService</code>.
 * <p>
 * The configured directory is converted to its absolute form. The paths of
 * the files found below it are later shortened with
 * <code>getPath().substring(strip)</code>; this is only correct if those
 * paths start with the very same prefix whose length is stored in
 * <code>strip</code>. With a relative directory the prefix would be shorter
 * than the absolute path and the substring would be wrong or fail.
 *
 * @param cfg the configuration; its TeX archive section provides the
 *     directory
 * @param store the store
 */
@SuppressFBWarnings(value = "EI_EXPOSE_REP2")
public ArchiveFilesUpdateService(CtanConfiguration cfg,
        ArchiveFileStore store) {
    this.store = store;
    TexArchiveConfig texArchive = cfg.getTexArchive();
    // Children returned by listFiles() inherit the parent's path, thus an
    // absolute base guarantees that every path starts with this prefix.
    File directory = new File(texArchive.getDirectory()).getAbsoluteFile();
    this.base = directory;
    this.strip = directory.getPath().length();
}
/**
 * The method <code>addUpdate</code> provides means to save or update an
 * archive file entry with the attributes taken from a file on disk: the
 * file name, the path with the base prefix stripped, and the modification
 * time.
 *
 * @param archiveFile the archive file entry or <code>null</code> to create
 *     a new one
 * @param file the file to take the attributes from
 * @param type the type
 */
private void addUpdate(ArchiveFile archiveFile, File file,
        FileType type) {
    String fileName = file.getName();
    String relativePath = file.getPath().substring(strip);
    long modified = file.lastModified();
    addUpdate(archiveFile, fileName, relativePath, type, modified);
}
/**
 * The method <code>addUpdate</code> provides means to save or update an
 * archive file entry from attributes.
 * <p>
 * If an entry is passed in then only its modification time is changed;
 * type, name, and path are used exclusively when a new entry has to be
 * created. In both cases the entry is saved in the store.
 *
 * @param archiveFile the archive file entry or <code>null</code> to create
 *     a new one
 * @param name the name for a new entry
 * @param path the path for a new entry
 * @param type the type for a new entry
 * @param mtime the modification time
 */
private void addUpdate(ArchiveFile archiveFile, String name, String path,
        FileType type,
        long mtime) {
    ArchiveFile target = archiveFile;
    if (target != null) {
        target.setMtime(mtime);
    } else {
        target = ArchiveFile.builder()
            .type(type)
            .name(name)
            .path(path)
            .mtime(mtime)
            .build();
    }
    store.save(target);
}
/**
 * The method <code>find</code> provides means to look up the first element
 * of a list which carries a given name.
 *
 * @param list the list to search in
 * @param name the name to find
 * @return the first element with this name or <code>null</code> if there
 *     is none
 */
private ArchiveFile find(List<ArchiveFile> list, String name) {
    return list.stream()
        .filter(candidate -> candidate.getName().equals(name))
        .findFirst()
        .orElse(null);
}
/**
 * The method <code>removeDirectory</code> provides means to delete a
 * directory from the archive file database table. The entry of the
 * directory is removed first; afterwards all entries are removed which the
 * store reports for the path of the directory.
 *
 * @param dir the directory to be removed
 */
private void removeDirectory(ArchiveFile dir) {
    store.remove(dir);
    var contained = store.findAllByPath(dir.getPath());
    for (var entry : contained) {
        store.remove(entry);
    }
}
/**
 * The method <code>update</code> provides means to traverse the directory
 * tree and update the database accordingly.
 * <p>
 * All known directories and generated files are collected up front. The
 * traversal takes out those which are still present; what remains in the
 * lists afterwards is removed from the store.
 *
 * @throws IOException in case of an I/O error, e.g. when the base
 *     directory can not be listed
 */
public void update() throws IOException {
    // The traversal removes elements from both lists. Work on private
    // copies, as is done for the other query results in this class, to
    // avoid modifying a list owned by the store.
    List<ArchiveFile> directories =
        new ArrayList<>(store.findAllByType(FileType.DIRECTORY));
    List<ArchiveFile> generated =
        new ArrayList<>(store.findAllByType(FileType.GENERATED));
    update(base, directories, generated);
    for (var del : directories) {
        removeDirectory(del);
    }
    store.remove(generated);
}
/**
 * The method <code>update</code> provides means to update a certain
 * directory and, recursively, the directories contained in it.
 * <p>
 * Dot files and symbolic links are ignored. For every other entry the
 * stored archive file is looked up by name. New or modified entries are
 * analysed according to their kind and saved; stored entries which have no
 * counterpart on disk any more are removed at the end.
 * <p>
 * Directories are always descended into, even if their own modification
 * time is unchanged: the modification time of a directory does not reflect
 * changes further down the tree, and the directories below have to be taken
 * out of <code>directories</code> to protect them from deletion by the
 * caller.
 * <p>
 * NOTE(review): the stored entries are looked up with the path of the
 * directory whereas new entries are stored with the path of the file
 * itself; verify that <code>findAllByPath</code> yields the entries
 * contained in the directory.
 *
 * @param dir the directory to update
 * @param directories the known directory entries not seen so far; every
 *     directory encountered is removed from this list
 * @param generated the known generated entries not accounted for so far;
 *     the entries belonging to an ins file encountered are removed from
 *     this list
 * @throws FileNotFoundException in case that the directory can not be
 *     listed
 */
private void update(@NonNull File dir, List<ArchiveFile> directories,
        List<ArchiveFile> generated)
        throws FileNotFoundException {
    String path = dir.toString().substring(strip);
    List<ArchiveFile> toDelete = new ArrayList<>(store.findAllByPath(path));
    File[] listFiles = dir.listFiles();
    if (listFiles == null) {
        throw new FileNotFoundException(dir.getPath());
    }
    for (File file : listFiles) {
        String name = file.getName();
        // ignore dot files and symlinks
        if (name.startsWith(".") || isSymlink(file)) {
            continue;
        }
        boolean directory = file.isDirectory();
        boolean archive = !directory && isArchiveName(name);
        // addUpdate() uses the type for new entries only. Thus a new entry
        // has to be created with its final type right away.
        FileType type = directory
            ? FileType.DIRECTORY
            : archive ? FileType.ARCHIVE : FileType.FILE;
        String relativePath = file.getPath().substring(strip);
        long mtime = file.lastModified();

        ArchiveFile known = find(toDelete, name);
        boolean changed = known == null || known.getMtime() != mtime;
        if (known != null) {
            toDelete.remove(known);
        }
        ArchiveFile archiveFile = known != null
            ? known
            : ArchiveFile.builder()
                .type(type)
                .name(name)
                .path(relativePath)
                .mtime(mtime)
                .build();
        try {
            if (directory) {
                if (changed) {
                    addUpdate(archiveFile, file, FileType.DIRECTORY);
                }
                update(file, directories, generated);
                // The directory exists; protect it from deletion. The
                // comparison of name and path covers the case that the
                // instance loaded by type is not equal to the one found
                // via the parent directory.
                directories.removeIf(d -> archiveFile.equals(d)
                    || (name.equals(d.getName())
                        && relativePath.equals(d.getPath())));
            } else if (file.isFile()) {
                int i = name.lastIndexOf('.');
                String extension = i >= 0 ? name.substring(i) : "";
                if (".ins".equals(extension)) {
                    // The ins file exists, hence the entries generated
                    // from it must survive, modified or not. updateIns()
                    // looks them up with the path of the ins file followed
                    // by '#'; the same key is used here.
                    String generatedPath = relativePath + "#";
                    generated.removeIf(
                        g -> generatedPath.equals(g.getPath()));
                }
                if (!changed) {
                    continue;
                }
                if (archive) {
                    addUpdate(archiveFile, file, FileType.ARCHIVE);
                    updateArchive(file, archiveFile);
                    continue;
                }
                switch (extension) {
                    case ".dtx" -> updateDtx(file, archiveFile);
                    case ".ins" -> updateIns(file, archiveFile);
                    case ".latex", ".LaTeX", ".ltx", ".tex", ".TeX" ->
                        updateTex(file, archiveFile);
                    case ".pdf" -> updatePdf(file, archiveFile);
                    default -> {
                        // no analysis for other kinds of files
                    }
                }
                addUpdate(archiveFile, file, FileType.FILE);
            }
        } catch (IOException e) {
            log.warn("Problem analysing file: {} {}",
                file.getAbsolutePath(),
                e.getMessage());
        }
    }
    store.remove(toDelete);
}

/**
 * The method <code>isSymlink</code> provides means to detect files which
 * are to be skipped because their canonical path differs from their
 * absolute path. A file whose canonical path can not be determined is
 * treated alike.
 *
 * @param file the file to check
 * @return <code>true</code> iff the file is to be skipped
 */
private static boolean isSymlink(File file) {
    try {
        return !file.getCanonicalPath().equals(file.getAbsolutePath());
    } catch (IOException ignored) {
        // an unresolvable path is skipped just like a symlink
        return true;
    }
}

/**
 * The method <code>isArchiveName</code> provides means to recognise the
 * file names which are treated as archives.
 *
 * @param name the file name
 * @return <code>true</code> iff the name ends in <code>.zip</code>,
 *     <code>.tgz</code>, or <code>.tar.gz</code>
 */
private static boolean isArchiveName(String name) {
    return name.endsWith(".zip")
        || name.endsWith(".tgz")
        || name.endsWith(".tar.gz");
}
/**
 * The method <code>updateArchive</code> provides means to analyse an
 * archive file and update the search index accordingly.
 * <p>
 * Currently the entries of the archive are only read through; the indexing
 * itself is not implemented yet.
 *
 * @param file the current file
 * @param archiveFile the archive file
 * @throws IOException in case of an I/O error
 */
private void updateArchive(File file, ArchiveFile archiveFile)
        throws IOException {
    try (var in = new FileInputStream(file)) {
        Archive arc = Archive.of(file.getName(), in);
        if (arc == null) {
            // NOTE(review): presumably null signals an archive type which
            // is not supported -- confirm. Without the guard the loop
            // below fails with a NullPointerException.
            log.debug("Cannot open archive: {}", file.getPath());
            return;
        }
        for (Entry entry = arc.getNextEntry();
                entry != null;
                entry = arc.getNextEntry()) {
            // TODO add the entry to the search index
        }
    }
}
/**
 * The method <code>updateDtx</code> provides means to analyse a dtx file
 * and update the search index accordingly.
 * <p>
 * This is a placeholder; currently nothing is done.
 *
 * @param file the current file
 * @param archiveFile the archive file
 */
private void updateDtx(File file, ArchiveFile archiveFile) {
    // TODO not implemented yet: analyse the dtx file
}
/**
 * The method <code>updateIns</code> provides means to analyse an ins file
 * and update the entries for the files generated from it.
 * <p>
 * The ins file is parsed for the names of the generated files. For each of
 * them an entry of type <code>GENERATED</code> is saved which carries the
 * modification time of the ins file. Entries stored under the path of the
 * ins file followed by <code>#</code> which are not reported by the parser
 * any more are removed.
 *
 * @param file the current file
 * @param archiveFile the archive file
 * @throws FileNotFoundException in case that the file can not be opened
 * @throws IOException in case of an I/O error
 */
private void updateIns(File file, ArchiveFile archiveFile)
        throws FileNotFoundException,
        IOException {
    String path = file.getPath().substring(strip);
    List<String> generated;
    // close the reader -- and the stream below it -- after parsing
    try (var reader = new InputStreamReader(new FileInputStream(file),
        StandardCharsets.UTF_8)) {
        generated = new InsParser().parse(path + "#", reader);
    }
    List<ArchiveFile> toDelete =
        new ArrayList<>(store.findAllByPath(path + "#"));
    long lastModified = file.lastModified();
    for (var gen : generated) {
        // the name is the part after the last '/', the path is everything
        // up to and including the last '#'
        String name = gen.substring(gen.lastIndexOf('/') + 1);
        String generatedPath = gen.substring(0, gen.lastIndexOf('#') + 1);
        ArchiveFile af = find(toDelete, name);
        if (af != null) {
            toDelete.remove(af);
        }
        addUpdate(af, name, generatedPath, FileType.GENERATED, lastModified);
    }
    store.remove(toDelete);
}
/**
 * The method <code>updatePdf</code> provides means to analyse a PDF file
 * and to transfer the title from the document information to the archive
 * file.
 * <p>
 * The archive file is not saved here; this is left to the caller in the
 * same way as for TeX files. Saving a second, freshly built entry in this
 * method would duplicate the entry and leave the caller's entry without
 * title.
 *
 * @param file the current file
 * @param archiveFile the archive file to receive the title; the title is
 *     set to <code>null</code> if the document has none
 * @throws IOException in case of an I/O error
 */
private void updatePdf(File file, ArchiveFile archiveFile)
        throws IOException {
    // the document holds resources and must be closed after use
    try (PDDocument doc = Loader.loadPDF(file)) {
        PDDocumentInformation info = doc.getDocumentInformation();
        archiveFile.setTitle(info.getTitle());
    }
}
/**
 * The method <code>updateTex</code> provides means to analyse a TeX file
 * and to transfer the title found to the archive file.
 * <p>
 * The file is read as UTF-8. If the parser reports values for
 * <code>title</code> then they are concatenated and stored as title;
 * otherwise the archive file is left untouched. The archive file is not
 * saved here.
 *
 * @param file the current file
 * @param archiveFile the archive file to receive the title
 * @throws IOException in case of an I/O error
 */
private void updateTex(File file, ArchiveFile archiveFile)
        throws IOException {
    // close the reader after parsing
    try (FileReader reader = new FileReader(file, StandardCharsets.UTF_8)) {
        MiniTexParser parser = new MiniTexParser(reader);
        Map<String, List<String>> map = parser.parse();
        List<String> title = map.get("title");
        if (title != null) {
            archiveFile.setTitle(String.join("", title));
        }
    }
}
}