restore OCRMyPDF and ghostscript compression

This commit is contained in:
Anthony Stirling 2025-06-30 22:27:45 +01:00
parent 11e3ccd19f
commit 782c30f934
13 changed files with 646 additions and 143 deletions

View File

@ -51,7 +51,6 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
tini \ tini \
bash \ bash \
curl \ curl \
qpdf \
shadow \ shadow \
su-exec \ su-exec \
openssl \ openssl \
@ -69,9 +68,11 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
tesseract-ocr-data-deu \ tesseract-ocr-data-deu \
tesseract-ocr-data-fra \ tesseract-ocr-data-fra \
tesseract-ocr-data-por \ tesseract-ocr-data-por \
unpaper \
# CV # CV
py3-opencv \ py3-opencv \
python3 \ python3 \
ocrmypdf \
py3-pip \ py3-pip \
py3-pillow@testing \ py3-pillow@testing \
py3-pdf2image@testing && \ py3-pdf2image@testing && \

View File

@ -76,16 +76,17 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
# pdftohtml # pdftohtml
poppler-utils \ poppler-utils \
# OCR MY PDF (unpaper for deskew and other advanced features) # OCR MY PDF (unpaper for deskew and other advanced features)
qpdf \
tesseract-ocr-data-eng \ tesseract-ocr-data-eng \
tesseract-ocr-data-chi_sim \ tesseract-ocr-data-chi_sim \
tesseract-ocr-data-deu \ tesseract-ocr-data-deu \
tesseract-ocr-data-fra \ tesseract-ocr-data-fra \
tesseract-ocr-data-por \ tesseract-ocr-data-por \
unpaper \
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \ font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \
# CV # CV
py3-opencv \ py3-opencv \
python3 \ python3 \
ocrmypdf \
py3-pip \ py3-pip \
py3-pillow@testing \ py3-pillow@testing \
py3-pdf2image@testing && \ py3-pdf2image@testing && \

View File

@ -545,6 +545,8 @@ public class ApplicationProperties {
private int calibreSessionLimit; private int calibreSessionLimit;
private int qpdfSessionLimit; private int qpdfSessionLimit;
private int tesseractSessionLimit; private int tesseractSessionLimit;
private int ghostscriptSessionLimit;
private int ocrMyPdfSessionLimit;
public int getQpdfSessionLimit() { public int getQpdfSessionLimit() {
return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2; return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2;
@ -577,6 +579,14 @@ public class ApplicationProperties {
public int getCalibreSessionLimit() { public int getCalibreSessionLimit() {
return calibreSessionLimit > 0 ? calibreSessionLimit : 1; return calibreSessionLimit > 0 ? calibreSessionLimit : 1;
} }
/** Returns the configured Ghostscript session limit, or 8 when no positive value is set. */
public int getGhostscriptSessionLimit() {
    if (ghostscriptSessionLimit > 0) {
        return ghostscriptSessionLimit;
    }
    return 8;
}
/** Returns the configured OCRmyPDF session limit, or 2 when no positive value is set. */
public int getOcrMyPdfSessionLimit() {
    if (ocrMyPdfSessionLimit > 0) {
        return ocrMyPdfSessionLimit;
    }
    return 2;
}
} }
@Data @Data
@ -589,6 +599,8 @@ public class ApplicationProperties {
private long calibreTimeoutMinutes; private long calibreTimeoutMinutes;
private long tesseractTimeoutMinutes; private long tesseractTimeoutMinutes;
private long qpdfTimeoutMinutes; private long qpdfTimeoutMinutes;
private long ghostscriptTimeoutMinutes;
private long ocrMyPdfTimeoutMinutes;
public long getTesseractTimeoutMinutes() { public long getTesseractTimeoutMinutes() {
return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30; return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30;
@ -621,6 +633,14 @@ public class ApplicationProperties {
public long getCalibreTimeoutMinutes() { public long getCalibreTimeoutMinutes() {
return calibreTimeoutMinutes > 0 ? calibreTimeoutMinutes : 30; return calibreTimeoutMinutes > 0 ? calibreTimeoutMinutes : 30;
} }
/** Returns the configured Ghostscript timeout in minutes, or 30 when no positive value is set. */
public long getGhostscriptTimeoutMinutes() {
    if (ghostscriptTimeoutMinutes > 0) {
        return ghostscriptTimeoutMinutes;
    }
    return 30;
}
/** Returns the configured OCRmyPDF timeout in minutes, or 30 when no positive value is set. */
public long getOcrMyPdfTimeoutMinutes() {
    if (ocrMyPdfTimeoutMinutes > 0) {
        return ocrMyPdfTimeoutMinutes;
    }
    return 30;
}
} }
} }
} }

View File

@ -308,7 +308,7 @@ public class TempFileCleanupService {
} }
java.util.List<Path> subdirectories = new java.util.ArrayList<>(); java.util.List<Path> subdirectories = new java.util.ArrayList<>();
try (Stream<Path> pathStream = Files.list(directory)) { try (Stream<Path> pathStream = Files.list(directory)) {
pathStream.forEach( pathStream.forEach(
path -> { path -> {
@ -347,7 +347,7 @@ public class TempFileCleanupService {
} }
}); });
} }
for (Path subdirectory : subdirectories) { for (Path subdirectory : subdirectories) {
try { try {
cleanupDirectoryStreaming( cleanupDirectoryStreaming(

View File

@ -84,6 +84,16 @@ public class ProcessExecutor {
.getProcessExecutor() .getProcessExecutor()
.getSessionLimit() .getSessionLimit()
.getCalibreSessionLimit(); .getCalibreSessionLimit();
case GHOSTSCRIPT ->
applicationProperties
.getProcessExecutor()
.getSessionLimit()
.getGhostscriptSessionLimit();
case OCR_MY_PDF ->
applicationProperties
.getProcessExecutor()
.getSessionLimit()
.getOcrMyPdfSessionLimit();
}; };
long timeoutMinutes = long timeoutMinutes =
@ -128,6 +138,16 @@ public class ProcessExecutor {
.getProcessExecutor() .getProcessExecutor()
.getTimeoutMinutes() .getTimeoutMinutes()
.getCalibreTimeoutMinutes(); .getCalibreTimeoutMinutes();
case GHOSTSCRIPT ->
applicationProperties
.getProcessExecutor()
.getTimeoutMinutes()
.getGhostscriptTimeoutMinutes();
case OCR_MY_PDF ->
applicationProperties
.getProcessExecutor()
.getTimeoutMinutes()
.getOcrMyPdfTimeoutMinutes();
}; };
return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes); return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes);
}); });
@ -278,7 +298,9 @@ public class ProcessExecutor {
INSTALL_APP, INSTALL_APP,
CALIBRE, CALIBRE,
TESSERACT, TESSERACT,
QPDF QPDF,
GHOSTSCRIPT,
OCR_MY_PDF
} }
public class ProcessExecutorResult { public class ProcessExecutorResult {

View File

@ -9,7 +9,6 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
@ -24,11 +23,10 @@ import lombok.extern.slf4j.Slf4j;
@Component @Component
public class TempFileRegistry { public class TempFileRegistry {
private final ConcurrentMap<Path, Instant> registeredFiles = new ConcurrentHashMap<>(); private final ConcurrentMap<Path, Instant> registeredFiles = new ConcurrentHashMap<>();
private final Set<Path> thirdPartyTempFiles = private final Set<Path> thirdPartyTempFiles =
Collections.newSetFromMap(new ConcurrentHashMap<>()); Collections.newSetFromMap(new ConcurrentHashMap<>());
private final Set<Path> tempDirectories = private final Set<Path> tempDirectories = Collections.newSetFromMap(new ConcurrentHashMap<>());
Collections.newSetFromMap(new ConcurrentHashMap<>());
/** /**
* Register a temporary file with the registry. * Register a temporary file with the registry.

View File

@ -21,6 +21,8 @@ public class EndpointConfiguration {
private final ApplicationProperties applicationProperties; private final ApplicationProperties applicationProperties;
private Map<String, Boolean> endpointStatuses = new ConcurrentHashMap<>(); private Map<String, Boolean> endpointStatuses = new ConcurrentHashMap<>();
private Map<String, Set<String>> endpointGroups = new ConcurrentHashMap<>(); private Map<String, Set<String>> endpointGroups = new ConcurrentHashMap<>();
private Set<String> disabledGroups = new HashSet<>();
private Map<String, Set<String>> endpointAlternatives = new ConcurrentHashMap<>();
private final boolean runningProOrHigher; private final boolean runningProOrHigher;
public EndpointConfiguration( public EndpointConfiguration(
@ -51,16 +53,36 @@ public class EndpointConfiguration {
if (endpoint.startsWith("/")) { if (endpoint.startsWith("/")) {
endpoint = endpoint.substring(1); endpoint = endpoint.substring(1);
} }
// Check if endpoint has alternatives (multiple tools can handle it)
Set<String> alternatives = endpointAlternatives.get(endpoint);
if (alternatives != null && !alternatives.isEmpty()) {
// Endpoint is enabled if ANY of its alternative tools are enabled
for (String toolGroup : alternatives) {
if (isGroupEnabled(toolGroup)) {
return true;
}
}
return false; // All alternative tools are disabled
}
// Fallback to standard endpoint status check
return endpointStatuses.getOrDefault(endpoint, true); return endpointStatuses.getOrDefault(endpoint, true);
} }
public boolean isGroupEnabled(String group) { public boolean isGroupEnabled(String group) {
// Check if group is explicitly disabled first
if (disabledGroups.contains(group)) {
return false;
}
Set<String> endpoints = endpointGroups.get(group); Set<String> endpoints = endpointGroups.get(group);
if (endpoints == null || endpoints.isEmpty()) { if (endpoints == null || endpoints.isEmpty()) {
log.debug("Group '{}' does not exist or has no endpoints", group); log.debug("Group '{}' does not exist or has no endpoints", group);
return false; return false;
} }
// Additional check: if all endpoints in group are disabled, consider group disabled
for (String endpoint : endpoints) { for (String endpoint : endpoints) {
if (!isEndpointEnabled(endpoint)) { if (!isEndpointEnabled(endpoint)) {
return false; return false;
@ -73,8 +95,23 @@ public class EndpointConfiguration {
public void addEndpointToGroup(String group, String endpoint) { public void addEndpointToGroup(String group, String endpoint) {
endpointGroups.computeIfAbsent(group, k -> new HashSet<>()).add(endpoint); endpointGroups.computeIfAbsent(group, k -> new HashSet<>()).add(endpoint);
} }
/**
 * Records {@code toolGroup} as one of the tool groups that can serve {@code endpoint}.
 *
 * @param endpoint endpoint name used as the key in the alternatives map
 * @param toolGroup tool group name added to that endpoint's set of alternatives
 */
public void addEndpointAlternative(String endpoint, String toolGroup) {
    Set<String> toolGroups =
            endpointAlternatives.computeIfAbsent(endpoint, key -> new HashSet<>());
    toolGroups.add(toolGroup);
}
/**
 * Marks {@code group} as explicitly disabled and disables every endpoint currently registered
 * under it.
 *
 * @param group name of the group to disable
 */
public void disableGroup(String group) {
    disabledGroups.add(group);

    Set<String> members = endpointGroups.get(group);
    if (members == null) {
        // Nothing registered under this group; only the explicit flag is recorded.
        return;
    }
    members.forEach(member -> disableEndpoint(member));
}
public void enableGroup(String group) { public void enableGroup(String group) {
disabledGroups.remove(group);
Set<String> endpoints = endpointGroups.get(group); Set<String> endpoints = endpointGroups.get(group);
if (endpoints != null) { if (endpoints != null) {
for (String endpoint : endpoints) { for (String endpoint : endpoints) {
@ -83,13 +120,8 @@ public class EndpointConfiguration {
} }
} }
public void disableGroup(String group) { public Set<String> getDisabledGroups() {
Set<String> endpoints = endpointGroups.get(group); return new HashSet<>(disabledGroups);
if (endpoints != null) {
for (String endpoint : endpoints) {
disableEndpoint(endpoint);
}
}
} }
public void logDisabledEndpointsSummary() { public void logDisabledEndpointsSummary() {
@ -101,6 +133,12 @@ public class EndpointConfiguration {
.sorted() .sorted()
.toList(); .toList();
if (!disabledGroups.isEmpty()) {
log.info(
"Disabled groups: {}",
String.join(", ", disabledGroups.stream().sorted().toList()));
}
if (!disabledList.isEmpty()) { if (!disabledList.isEmpty()) {
log.info( log.info(
"Total disabled endpoints: {}. Disabled endpoints: {}", "Total disabled endpoints: {}. Disabled endpoints: {}",
@ -212,7 +250,6 @@ public class EndpointConfiguration {
// Unoconvert // Unoconvert
addEndpointToGroup("Unoconvert", "file-to-pdf"); addEndpointToGroup("Unoconvert", "file-to-pdf");
addEndpointToGroup("tesseract", "ocr-pdf");
// Java // Java
addEndpointToGroup("Java", "merge-pdfs"); addEndpointToGroup("Java", "merge-pdfs");
@ -261,8 +298,13 @@ public class EndpointConfiguration {
addEndpointToGroup("Javascript", "compare"); addEndpointToGroup("Javascript", "compare");
addEndpointToGroup("Javascript", "adjust-contrast"); addEndpointToGroup("Javascript", "adjust-contrast");
// qpdf dependent endpoints // Multi-tool endpoints - endpoints that can be handled by multiple tools
addEndpointToGroup("qpdf", "repair"); addEndpointAlternative("repair", "qpdf");
addEndpointAlternative("repair", "Ghostscript");
addEndpointAlternative("compress-pdf", "qpdf");
addEndpointAlternative("compress-pdf", "Ghostscript");
addEndpointAlternative("ocr-pdf", "tesseract");
addEndpointAlternative("ocr-pdf", "OCRmyPDF");
// Weasyprint dependent endpoints // Weasyprint dependent endpoints
addEndpointToGroup("Weasyprint", "html-to-pdf"); addEndpointToGroup("Weasyprint", "html-to-pdf");

View File

@ -34,6 +34,8 @@ public class ExternalAppDepConfig {
new HashMap<>() { new HashMap<>() {
{ {
put("gs", List.of("Ghostscript"));
put("ocrmypdf", List.of("OCRmyPDF"));
put("soffice", List.of("LibreOffice")); put("soffice", List.of("LibreOffice"));
put(weasyprintPath, List.of("Weasyprint")); put(weasyprintPath, List.of("Weasyprint"));
put("pdftohtml", List.of("Pdftohtml")); put("pdftohtml", List.of("Pdftohtml"));
@ -109,6 +111,8 @@ public class ExternalAppDepConfig {
@PostConstruct @PostConstruct
public void checkDependencies() { public void checkDependencies() {
// Check core dependencies // Check core dependencies
checkDependencyAndDisableGroup("gs");
checkDependencyAndDisableGroup("ocrmypdf");
checkDependencyAndDisableGroup("tesseract"); checkDependencyAndDisableGroup("tesseract");
checkDependencyAndDisableGroup("soffice"); checkDependencyAndDisableGroup("soffice");
checkDependencyAndDisableGroup("qpdf"); checkDependencyAndDisableGroup("qpdf");

View File

@ -65,12 +65,14 @@ public class CompressController {
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
private final boolean qpdfEnabled; private final boolean qpdfEnabled;
private final boolean ghostscriptEnabled;
public CompressController( public CompressController(
CustomPDFDocumentFactory pdfDocumentFactory, CustomPDFDocumentFactory pdfDocumentFactory,
EndpointConfiguration endpointConfiguration) { EndpointConfiguration endpointConfiguration) {
this.pdfDocumentFactory = pdfDocumentFactory; this.pdfDocumentFactory = pdfDocumentFactory;
this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf"); this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
} }
@Data @Data
@ -697,25 +699,69 @@ public class CompressController {
boolean sizeMet = false; boolean sizeMet = false;
boolean imageCompressionApplied = false; boolean imageCompressionApplied = false;
boolean qpdfCompressionApplied = false; boolean externalCompressionApplied = false;
if (qpdfEnabled && optimizeLevel <= 3) {
optimizeLevel = 4;
}
while (!sizeMet && optimizeLevel <= 9) { while (!sizeMet && optimizeLevel <= 9) {
// Apply image compression for levels 4-9 // Apply external compression first
if ((optimizeLevel >= 3 || Boolean.TRUE.equals(convertToGrayscale)) if (!externalCompressionApplied) {
&& !imageCompressionApplied) { boolean ghostscriptSuccess = false;
double scaleFactor = getScaleFactorForLevel(optimizeLevel);
float jpegQuality = getJpegQualityForLevel(optimizeLevel);
// Compress images // Try Ghostscript first if available - for ANY compression level
if (ghostscriptEnabled) {
try {
applyGhostscriptCompression(
request, optimizeLevel, currentFile, tempFiles);
log.info("Ghostscript compression applied successfully");
ghostscriptSuccess = true;
} catch (IOException e) {
log.warn("Ghostscript compression failed, trying fallback methods");
}
}
// Fallback to QPDF if Ghostscript failed or not available (levels 1-3 only)
if (!ghostscriptSuccess && qpdfEnabled && optimizeLevel <= 3) {
try {
applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
log.info("QPDF compression applied successfully");
} catch (IOException e) {
log.warn("QPDF compression also failed");
}
}
if (!ghostscriptSuccess && !qpdfEnabled) {
log.info(
"No external compression tools available, using image compression only");
}
externalCompressionApplied = true;
// Skip image compression if Ghostscript succeeded
if (ghostscriptSuccess) {
imageCompressionApplied = true;
}
}
// Apply image compression for levels 4+ only if Ghostscript didn't run
if ((optimizeLevel >= 4 || Boolean.TRUE.equals(convertToGrayscale))
&& !imageCompressionApplied) {
// Use different scale factors based on level
double scaleFactor =
switch (optimizeLevel) {
case 4 -> 0.95; // 95% of original size
case 5 -> 0.9; // 90% of original size
case 6 -> 0.8; // 80% of original size
case 7 -> 0.7; // 70% of original size
case 8 -> 0.65; // 65% of original size
case 9 -> 0.5; // 50% of original size
default -> 1.0;
};
log.info("Applying image compression with scale factor: {}", scaleFactor);
Path compressedImageFile = Path compressedImageFile =
compressImagesInPDF( compressImagesInPDF(
currentFile, currentFile,
scaleFactor, scaleFactor,
jpegQuality, 0.7f, // Default JPEG quality
Boolean.TRUE.equals(convertToGrayscale)); Boolean.TRUE.equals(convertToGrayscale));
tempFiles.add(compressedImageFile); tempFiles.add(compressedImageFile);
@ -723,18 +769,6 @@ public class CompressController {
imageCompressionApplied = true; imageCompressionApplied = true;
} }
// Apply QPDF compression for all levels
if (!qpdfCompressionApplied && qpdfEnabled) {
applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
qpdfCompressionApplied = true;
} else if (!qpdfCompressionApplied) {
// If QPDF is disabled, mark as applied and log
if (!qpdfEnabled) {
log.info("Skipping QPDF compression as QPDF group is disabled");
}
qpdfCompressionApplied = true;
}
// Check if target size reached or not in auto mode // Check if target size reached or not in auto mode
long outputFileSize = Files.size(currentFile); long outputFileSize = Files.size(currentFile);
if (outputFileSize <= expectedOutputSize || !autoMode) { if (outputFileSize <= expectedOutputSize || !autoMode) {
@ -754,7 +788,7 @@ public class CompressController {
} else { } else {
// Reset flags for next iteration with higher optimization level // Reset flags for next iteration with higher optimization level
imageCompressionApplied = false; imageCompressionApplied = false;
qpdfCompressionApplied = false; externalCompressionApplied = false;
optimizeLevel = newOptimizeLevel; optimizeLevel = newOptimizeLevel;
} }
} }
@ -788,6 +822,96 @@ public class CompressController {
} }
} }
/**
 * Runs Ghostscript ({@code gs -sDEVICE=pdfwrite}) over {@code currentFile} and, on success,
 * replaces {@code currentFile} with the Ghostscript output.
 *
 * <p>The optimization level selects the {@code -dPDFSETTINGS} preset and, for higher levels,
 * explicit image resolutions. The intermediate output file is appended to {@code tempFiles};
 * this method does not delete it.
 *
 * @param request the optimize request (not read by this method; kept for signature parity with
 *     the other compression steps)
 * @param optimizeLevel compression level used to pick the Ghostscript settings
 * @param currentFile the PDF to compress; overwritten in place when Ghostscript succeeds
 * @param tempFiles list that receives the temporary Ghostscript output path
 * @throws IOException if Ghostscript cannot be run, is interrupted, exits with a non-zero
 *     return code, or the result cannot be copied back
 */
private void applyGhostscriptCompression(
        OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
        throws IOException {

    long preGsSize = Files.size(currentFile);
    log.info("Pre-Ghostscript file size: {}", GeneralUtils.formatBytes(preGsSize));

    // Create output file for Ghostscript
    Path gsOutputFile = Files.createTempFile("gs_output_", ".pdf");
    tempFiles.add(gsOutputFile);

    // Build Ghostscript command based on optimization level
    List<String> command = new ArrayList<>();
    command.add("gs");
    command.add("-sDEVICE=pdfwrite");
    command.add("-dCompatibilityLevel=1.5");
    command.add("-dNOPAUSE");
    command.add("-dQUIET");
    command.add("-dBATCH");

    // Map optimization levels to Ghostscript settings
    switch (optimizeLevel) {
        case 1 -> command.add("-dPDFSETTINGS=/prepress");
        case 2 -> command.add("-dPDFSETTINGS=/printer");
        case 3 -> command.add("-dPDFSETTINGS=/ebook");
        case 4, 5 -> command.add("-dPDFSETTINGS=/screen");
        case 6, 7 ->
                command.addAll(
                        List.of(
                                "-dPDFSETTINGS=/screen",
                                "-dColorImageResolution=150",
                                "-dGrayImageResolution=150",
                                "-dMonoImageResolution=300"));
        case 8, 9 ->
                command.addAll(
                        List.of(
                                "-dPDFSETTINGS=/screen",
                                "-dColorImageResolution=100",
                                "-dGrayImageResolution=100",
                                "-dMonoImageResolution=200"));
        case 10 ->
                command.addAll(
                        List.of(
                                "-dPDFSETTINGS=/screen",
                                "-dColorImageResolution=72",
                                "-dGrayImageResolution=72",
                                "-dMonoImageResolution=150"));
        default -> command.add("-dPDFSETTINGS=/screen");
    }

    command.add("-sOutputFile=" + gsOutputFile.toString());
    command.add(currentFile.toString());

    // Only the process launch is wrapped: any failure here (including unchecked ones) is
    // converted to IOException so the caller's fallback path is taken.
    ProcessExecutorResult result;
    try {
        result =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
                        .runCommandWithOutputHandling(command);
    } catch (Exception e) {
        if (e instanceof InterruptedException) {
            // Preserve the interrupt for callers further up the stack.
            Thread.currentThread().interrupt();
        }
        log.warn("Ghostscript compression failed, will fallback to other methods", e);
        throw new IOException("Ghostscript compression failed", e);
    }

    // Checked outside the try block so this failure is logged and wrapped exactly once.
    if (result.getRc() != 0) {
        log.warn("Ghostscript compression failed with return code: {}", result.getRc());
        throw new IOException("Ghostscript compression failed");
    }

    // Update current file to the Ghostscript output
    Files.copy(gsOutputFile, currentFile, StandardCopyOption.REPLACE_EXISTING);

    long postGsSize = Files.size(currentFile);
    // Guard against a zero-byte input so the log never shows NaN/Infinity.
    double gsReduction = preGsSize > 0 ? 100.0 - ((postGsSize * 100.0) / preGsSize) : 0.0;
    log.info(
            "Post-Ghostscript file size: {} (reduced by {}%)",
            GeneralUtils.formatBytes(postGsSize), String.format("%.1f", gsReduction));
}
// Run QPDF compression // Run QPDF compression
private void applyQpdfCompression( private void applyQpdfCompression(
OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles) OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)

View File

@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.misc;
import java.awt.image.BufferedImage; import java.awt.image.BufferedImage;
import java.io.*; import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.*; import java.util.*;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
@ -26,26 +27,42 @@ import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag; import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.config.EndpointConfiguration;
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest; import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
import stirling.software.common.model.ApplicationProperties; import stirling.software.common.model.ApplicationProperties;
import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.ProcessExecutor; import stirling.software.common.util.ProcessExecutor;
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult; import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
import stirling.software.common.util.TempDirectory;
import stirling.software.common.util.TempFile;
import stirling.software.common.util.TempFileManager; import stirling.software.common.util.TempFileManager;
import stirling.software.common.util.WebResponseUtils;
@RestController @RestController
@RequestMapping("/api/v1/misc") @RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs") @Tag(name = "Misc", description = "Miscellaneous APIs")
@Slf4j @Slf4j
@RequiredArgsConstructor
public class OCRController { public class OCRController {
private final ApplicationProperties applicationProperties; private final ApplicationProperties applicationProperties;
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
private final TempFileManager tempFileManager; private final TempFileManager tempFileManager;
private final boolean ocrMyPdfEnabled;
private final boolean tesseractEnabled;
public OCRController(
ApplicationProperties applicationProperties,
CustomPDFDocumentFactory pdfDocumentFactory,
TempFileManager tempFileManager,
EndpointConfiguration endpointConfiguration) {
this.applicationProperties = applicationProperties;
this.pdfDocumentFactory = pdfDocumentFactory;
this.tempFileManager = tempFileManager;
this.ocrMyPdfEnabled = endpointConfiguration.isGroupEnabled("OCRmyPDF");
this.tesseractEnabled = endpointConfiguration.isGroupEnabled("tesseract");
}
/** Gets the list of available Tesseract languages from the tessdata directory */ /** Gets the list of available Tesseract languages from the tessdata directory */
public List<String> getAvailableTesseractLanguages() { public List<String> getAvailableTesseractLanguages() {
@ -63,39 +80,261 @@ public class OCRController {
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf") @PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
@Operation( @Operation(
summary = "Process PDF files with OCR using Tesseract", summary = "Process a PDF file with OCR",
description = description =
"Takes a PDF file as input, performs OCR using specified languages and OCR type" "This endpoint processes a PDF file using OCR (Optical Character Recognition). "
+ " (skip-text/force-ocr), and returns the processed PDF. Input:PDF" + "Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. "
+ " Output:PDF Type:SISO") + "Uses OCRmyPDF if available, falls back to Tesseract. Input:PDF Output:PDF Type:SI-Conditional")
public ResponseEntity<byte[]> processPdfWithOCR( public ResponseEntity<byte[]> processPdfWithOCR(
@ModelAttribute ProcessPdfWithOcrRequest request) @ModelAttribute ProcessPdfWithOcrRequest request)
throws IOException, InterruptedException { throws IOException, InterruptedException {
MultipartFile inputFile = request.getFileInput(); MultipartFile inputFile = request.getFileInput();
List<String> languages = request.getLanguages(); List<String> selectedLanguages = request.getLanguages();
Boolean sidecar = request.isSidecar();
Boolean deskew = request.isDeskew();
Boolean clean = request.isClean();
Boolean cleanFinal = request.isCleanFinal();
String ocrType = request.getOcrType(); String ocrType = request.getOcrType();
String ocrRenderType = request.getOcrRenderType();
Boolean removeImagesAfter = request.isRemoveImagesAfter();
// Create a temp directory using TempFileManager directly if (selectedLanguages == null || selectedLanguages.isEmpty()) {
Path tempDirPath = tempFileManager.createTempDirectory(); throw new IOException("Please select at least one language.");
File tempDir = tempDirPath.toFile(); }
try { if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) {
File tempInputFile = new File(tempDir, "input.pdf"); throw new IOException("ocrRenderType wrong");
File tempOutputDir = new File(tempDir, "output"); }
File tempImagesDir = new File(tempDir, "images");
File finalOutputFile = new File(tempDir, "final_output.pdf"); // Get available Tesseract languages
List<String> availableLanguages = getAvailableTesseractLanguages();
// Validate selected languages
selectedLanguages =
selectedLanguages.stream().filter(availableLanguages::contains).toList();
if (selectedLanguages.isEmpty()) {
throw new IOException("None of the selected languages are valid.");
}
// Use try-with-resources for proper temp file management
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
inputFile.transferTo(tempInputFile.getFile());
TempFile sidecarTextFile = null;
try {
// Use OCRmyPDF if available (no fallback - error if it fails)
if (ocrMyPdfEnabled) {
if (sidecar != null && sidecar) {
sidecarTextFile = new TempFile(tempFileManager, ".txt");
}
processWithOcrMyPdf(
selectedLanguages,
sidecar,
deskew,
clean,
cleanFinal,
ocrType,
ocrRenderType,
removeImagesAfter,
tempInputFile.getPath(),
tempOutputFile.getPath(),
sidecarTextFile != null ? sidecarTextFile.getPath() : null);
log.info("OCRmyPDF processing completed successfully");
}
// Use Tesseract only if OCRmyPDF is not available
else if (tesseractEnabled) {
processWithTesseract(
selectedLanguages,
ocrType,
tempInputFile.getPath(),
tempOutputFile.getPath());
log.info("Tesseract processing completed successfully");
} else {
throw new IOException("No OCR tools are available");
}
// Read the processed PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath());
// Return the OCR processed PDF as a response
String outputFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "")
+ "_OCR.pdf";
if (sidecar != null && sidecar && sidecarTextFile != null) {
// Create a zip file containing both the PDF and the text file
String outputZipFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "")
+ "_OCR.zip";
try (TempFile tempZipFile = new TempFile(tempFileManager, ".zip");
ZipOutputStream zipOut =
new ZipOutputStream(
Files.newOutputStream(tempZipFile.getPath()))) {
// Add PDF file to the zip
ZipEntry pdfEntry = new ZipEntry(outputFilename);
zipOut.putNextEntry(pdfEntry);
zipOut.write(pdfBytes);
zipOut.closeEntry();
// Add text file to the zip
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
zipOut.putNextEntry(txtEntry);
Files.copy(sidecarTextFile.getPath(), zipOut);
zipOut.closeEntry();
zipOut.finish();
byte[] zipBytes = Files.readAllBytes(tempZipFile.getPath());
// Return the zip file containing both the PDF and the text file
return WebResponseUtils.bytesToWebResponse(
zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
}
} else {
// Return the OCR processed PDF as a response
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
} finally {
// Clean up sidecar temp file if created
if (sidecarTextFile != null) {
try {
sidecarTextFile.close();
} catch (Exception e) {
log.warn("Failed to close sidecar temp file", e);
}
}
}
}
}
/**
 * Runs the {@code ocrmypdf} command line tool on {@code tempInputFile}, writing the result to
 * {@code tempOutputFile}, and optionally strips images from the result with Ghostscript.
 *
 * <p>If the first run fails and its output contains the
 * {@code multiprocessing/synchronize.py} / {@code OSError: [Errno 38]} signature, the command
 * is retried once with {@code --jobs 1}.
 *
 * @param selectedLanguages languages joined with {@code +} and passed to {@code --language}
 * @param sidecar when true (and {@code sidecarTextPath} is non-null) adds {@code --sidecar}
 * @param deskew when true adds {@code --deskew}
 * @param clean when true adds {@code --clean}
 * @param cleanFinal when true adds {@code --clean-final}
 * @param ocrType {@code "skip-text"} or {@code "force-ocr"}; any other value adds no flag
 * @param ocrRenderType value passed to {@code --pdf-renderer}
 * @param removeImagesAfter when true, the OCR output is rewritten by Ghostscript with
 *     {@code -dFILTERIMAGE}
 * @param tempInputFile PDF to process
 * @param tempOutputFile destination of the processed PDF
 * @param sidecarTextPath destination of the sidecar text file, or {@code null}
 * @throws IOException if OCRmyPDF or the Ghostscript image-removal step exits with a non-zero
 *     return code, or on file I/O failure
 * @throws InterruptedException if a command execution is interrupted
 */
private void processWithOcrMyPdf(
        List<String> selectedLanguages,
        Boolean sidecar,
        Boolean deskew,
        Boolean clean,
        Boolean cleanFinal,
        String ocrType,
        String ocrRenderType,
        Boolean removeImagesAfter,
        Path tempInputFile,
        Path tempOutputFile,
        Path sidecarTextPath)
        throws IOException, InterruptedException {

    // Build OCRmyPDF command
    String languageOption = String.join("+", selectedLanguages);

    List<String> command =
            new ArrayList<>(
                    Arrays.asList(
                            "ocrmypdf",
                            "--verbose",
                            "2",
                            "--output-type",
                            "pdf",
                            "--pdf-renderer",
                            ocrRenderType));

    if (Boolean.TRUE.equals(sidecar) && sidecarTextPath != null) {
        command.add("--sidecar");
        command.add(sidecarTextPath.toString());
    }
    if (Boolean.TRUE.equals(deskew)) {
        command.add("--deskew");
    }
    if (Boolean.TRUE.equals(clean)) {
        command.add("--clean");
    }
    if (Boolean.TRUE.equals(cleanFinal)) {
        command.add("--clean-final");
    }

    // Null, empty, or unrecognised ocrType values add no flag.
    if ("skip-text".equals(ocrType)) {
        command.add("--skip-text");
    } else if ("force-ocr".equals(ocrType)) {
        command.add("--force-ocr");
    }

    command.addAll(
            Arrays.asList(
                    "--language",
                    languageOption,
                    tempInputFile.toString(),
                    tempOutputFile.toString()));

    // Run CLI command
    ProcessExecutorResult result =
            ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                    .runCommandWithOutputHandling(command);

    // Retry single-threaded when the failure output matches the multiprocessing
    // "Function not implemented" signature.
    if (result.getRc() != 0
            && result.getMessages().contains("multiprocessing/synchronize.py")
            && result.getMessages().contains("OSError: [Errno 38] Function not implemented")) {
        command.add("--jobs");
        command.add("1");
        result =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                        .runCommandWithOutputHandling(command);
    }

    if (result.getRc() != 0) {
        throw new IOException("OCRmyPDF failed with return code: " + result.getRc());
    }

    // Remove images from the OCR processed PDF if the flag is set to true
    if (Boolean.TRUE.equals(removeImagesAfter)) {
        try (TempFile tempPdfWithoutImages = new TempFile(tempFileManager, "_no_images.pdf")) {
            List<String> gsCommand =
                    Arrays.asList(
                            "gs",
                            "-sDEVICE=pdfwrite",
                            "-dFILTERIMAGE",
                            "-o",
                            tempPdfWithoutImages.getPath().toString(),
                            tempOutputFile.toString());

            ProcessExecutorResult gsResult =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
                            .runCommandWithOutputHandling(gsCommand);

            // Do not overwrite a good OCR result with the output of a failed Ghostscript run.
            if (gsResult.getRc() != 0) {
                throw new IOException(
                        "Ghostscript image removal failed with return code: "
                                + gsResult.getRc());
            }

            // Replace output file with version without images
            Files.copy(
                    tempPdfWithoutImages.getPath(),
                    tempOutputFile,
                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
        }
    }
}
private void processWithTesseract(
List<String> selectedLanguages, String ocrType, Path tempInputFile, Path tempOutputFile)
throws IOException, InterruptedException {
// Create temp directory for Tesseract processing
try (TempDirectory tempDir = new TempDirectory(tempFileManager)) {
File tempOutputDir = new File(tempDir.getPath().toFile(), "output");
File tempImagesDir = new File(tempDir.getPath().toFile(), "images");
File finalOutputFile = new File(tempDir.getPath().toFile(), "final_output.pdf");
// Create directories // Create directories
tempOutputDir.mkdirs(); tempOutputDir.mkdirs();
tempImagesDir.mkdirs(); tempImagesDir.mkdirs();
// Save input file
inputFile.transferTo(tempInputFile);
PDFMergerUtility merger = new PDFMergerUtility(); PDFMergerUtility merger = new PDFMergerUtility();
merger.setDestinationFileName(finalOutputFile.toString()); merger.setDestinationFileName(finalOutputFile.toString());
try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) { try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
PDFRenderer pdfRenderer = new PDFRenderer(document); PDFRenderer pdfRenderer = new PDFRenderer(document);
int pageCount = document.getNumberOfPages(); int pageCount = document.getNumberOfPages();
@ -135,35 +374,20 @@ public class OCRController {
new File(tempOutputDir, String.format("page_%d", pageNum)) new File(tempOutputDir, String.format("page_%d", pageNum))
.toString()); .toString());
command.add("-l"); command.add("-l");
command.add(String.join("+", languages)); command.add(String.join("+", selectedLanguages));
// Always output PDF command.add("pdf"); // Always output PDF
command.add("pdf");
// Use ProcessExecutor to run tesseract command ProcessExecutorResult result =
try { ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
ProcessExecutorResult result = .runCommandWithOutputHandling(command);
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
.runCommandWithOutputHandling(command);
log.debug( if (result.getRc() != 0) {
"Tesseract OCR completed for page {} with exit code {}", throw new RuntimeException(
pageNum, "Tesseract failed with exit code: " + result.getRc());
result.getRc());
// Add OCR'd PDF to merger
merger.addSource(pageOutputPath);
} catch (IOException | InterruptedException e) {
log.error(
"Error processing page {} with tesseract: {}",
pageNum,
e.getMessage());
// If OCR fails, fall back to the original page
try (PDDocument pageDoc = new PDDocument()) {
pageDoc.addPage(page);
pageDoc.save(pageOutputPath);
merger.addSource(pageOutputPath);
}
} }
// Add OCR'd PDF to merger
merger.addSource(pageOutputPath);
} else { } else {
// Save original page without OCR // Save original page without OCR
try (PDDocument pageDoc = new PDDocument()) { try (PDDocument pageDoc = new PDDocument()) {
@ -178,40 +402,11 @@ public class OCRController {
// Merge all pages into final PDF // Merge all pages into final PDF
merger.mergeDocuments(null); merger.mergeDocuments(null);
// Read the final PDF file // Copy final output to the expected location
byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath()); Files.copy(
String outputFilename = finalOutputFile.toPath(),
Filenames.toSimpleFileName(inputFile.getOriginalFilename()) tempOutputFile,
.replaceFirst("[.][^.]+$", "") java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+ "_OCR.pdf";
return ResponseEntity.ok()
.header(
"Content-Disposition",
"attachment; filename=\"" + outputFilename + "\"")
.contentType(MediaType.APPLICATION_PDF)
.body(pdfContent);
} finally {
// Clean up the temp directory and all its contents
tempFileManager.deleteTempDirectory(tempDirPath);
}
}
private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
throws IOException {
if (!file.exists()) {
log.warn("File {} does not exist, skipping", file);
return;
}
try (FileInputStream fis = new FileInputStream(file)) {
ZipEntry zipEntry = new ZipEntry(filename);
zipOut.putNextEntry(zipEntry);
byte[] buffer = new byte[1024];
int length;
while ((length = fis.read(buffer)) >= 0) {
zipOut.write(buffer, 0, length);
}
zipOut.closeEntry();
} }
} }
} }

View File

@ -15,8 +15,7 @@ import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag; import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.RequiredArgsConstructor; import stirling.software.SPDF.config.EndpointConfiguration;
import stirling.software.common.model.api.PDFFile; import stirling.software.common.model.api.PDFFile;
import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.ProcessExecutor; import stirling.software.common.util.ProcessExecutor;
@ -28,17 +27,28 @@ import stirling.software.common.util.WebResponseUtils;
@RestController @RestController
@RequestMapping("/api/v1/misc") @RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs") @Tag(name = "Misc", description = "Miscellaneous APIs")
@RequiredArgsConstructor
public class RepairController { public class RepairController {
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
private final TempFileManager tempFileManager; private final TempFileManager tempFileManager;
private final boolean ghostscriptEnabled;
private final boolean qpdfEnabled;
public RepairController(
CustomPDFDocumentFactory pdfDocumentFactory,
TempFileManager tempFileManager,
EndpointConfiguration endpointConfiguration) {
this.pdfDocumentFactory = pdfDocumentFactory;
this.tempFileManager = tempFileManager;
this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
}
@PostMapping(consumes = "multipart/form-data", value = "/repair") @PostMapping(consumes = "multipart/form-data", value = "/repair")
@Operation( @Operation(
summary = "Repair a PDF file", summary = "Repair a PDF file",
description = description =
"This endpoint repairs a given PDF file by running qpdf command. The PDF is" "This endpoint repairs a given PDF file by running Ghostscript (primary), qpdf (fallback), or PDFBox (if no external tools available). The PDF is"
+ " first saved to a temporary location, repaired, read back, and then" + " first saved to a temporary location, repaired, read back, and then"
+ " returned as a response. Input:PDF Output:PDF Type:SISO") + " returned as a response. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile file) public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile file)
@ -46,25 +56,72 @@ public class RepairController {
MultipartFile inputFile = file.getFileInput(); MultipartFile inputFile = file.getFileInput();
// Use TempFile with try-with-resources for automatic cleanup // Use TempFile with try-with-resources for automatic cleanup
try (TempFile tempFile = new TempFile(tempFileManager, ".pdf")) { try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
// Save the uploaded file to the temporary location // Save the uploaded file to the temporary location
inputFile.transferTo(tempFile.getFile()); inputFile.transferTo(tempInputFile.getFile());
List<String> command = new ArrayList<>(); boolean repairSuccess = false;
command.add("qpdf");
command.add("--replace-input"); // Automatically fixes problems it can
command.add("--qdf"); // Linearizes and normalizes PDF structure
command.add("--object-streams=disable"); // Can help with some corruptions
command.add(tempFile.getFile().getAbsolutePath());
ProcessExecutorResult returnCode = // Try Ghostscript first if available
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF) if (ghostscriptEnabled) {
.runCommandWithOutputHandling(command); try {
List<String> gsCommand = new ArrayList<>();
gsCommand.add("gs");
gsCommand.add("-o");
gsCommand.add(tempOutputFile.getPath().toString());
gsCommand.add("-sDEVICE=pdfwrite");
gsCommand.add(tempInputFile.getPath().toString());
// Read the optimized PDF file ProcessExecutorResult gsResult =
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempFile.getFile()); ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
.runCommandWithOutputHandling(gsCommand);
// Return the optimized PDF as a response if (gsResult.getRc() == 0) {
repairSuccess = true;
}
} catch (Exception e) {
// Log and continue to QPDF fallback
System.out.println(
"Ghostscript repair failed, trying QPDF fallback: " + e.getMessage());
}
}
// Fallback to QPDF if Ghostscript failed or not available
if (!repairSuccess && qpdfEnabled) {
List<String> qpdfCommand = new ArrayList<>();
qpdfCommand.add("qpdf");
qpdfCommand.add("--replace-input"); // Automatically fixes problems it can
qpdfCommand.add("--qdf"); // Linearizes and normalizes PDF structure
qpdfCommand.add("--object-streams=disable"); // Can help with some corruptions
qpdfCommand.add(tempInputFile.getPath().toString());
qpdfCommand.add(tempOutputFile.getPath().toString());
ProcessExecutorResult qpdfResult =
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
.runCommandWithOutputHandling(qpdfCommand);
repairSuccess = true;
}
// Use PDFBox as last resort if no external tools are available
if (!repairSuccess) {
if (!ghostscriptEnabled && !qpdfEnabled) {
// Basic PDFBox repair - load and save to fix structural issues
try (var document = pdfDocumentFactory.load(tempInputFile.getFile())) {
document.save(tempOutputFile.getFile());
repairSuccess = true;
}
} else {
throw new IOException("PDF repair failed with available tools");
}
}
// Read the repaired PDF file
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.getFile());
// Return the repaired PDF as a response
String outputFilename = String outputFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename()) Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "") .replaceFirst("[.][^.]+$", "")

View File

@ -19,6 +19,18 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
defaultValue = "[\"eng\"]") defaultValue = "[\"eng\"]")
private List<String> languages; private List<String> languages;
@Schema(description = "Include OCR text in a sidecar text file if set to true")
private boolean sidecar;
@Schema(description = "Deskew the input file if set to true")
private boolean deskew;
@Schema(description = "Clean the input file if set to true")
private boolean clean;
@Schema(description = "Clean the final output if set to true")
private boolean cleanFinal;
@Schema( @Schema(
description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'", description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'",
requiredMode = Schema.RequiredMode.REQUIRED, requiredMode = Schema.RequiredMode.REQUIRED,
@ -31,4 +43,7 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
allowableValues = {"hocr", "sandwich"}, allowableValues = {"hocr", "sandwich"},
defaultValue = "hocr") defaultValue = "hocr")
private String ocrRenderType = "hocr"; private String ocrRenderType = "hocr";
@Schema(description = "Remove images from the output PDF if set to true")
private boolean removeImagesAfter;
} }

View File

@ -79,6 +79,30 @@
</select> </select>
</div> </div>
<br> <br>
<div class="mb-3" th:if>
<label class="form-label">OCR Options</label>
<div class="form-check">
<input type="checkbox" class="form-check-input" id="sidecar" name="sidecar" value="true">
<label class="form-check-label" for="sidecar">Include OCR text in sidecar text file</label>
</div>
<div class="form-check">
<input type="checkbox" class="form-check-input" id="deskew" name="deskew" value="true">
<label class="form-check-label" for="deskew">Deskew input file</label>
</div>
<div class="form-check">
<input type="checkbox" class="form-check-input" id="clean" name="clean" value="true">
<label class="form-check-label" for="clean">Clean input file</label>
</div>
<div class="form-check">
<input type="checkbox" class="form-check-input" id="cleanFinal" name="cleanFinal" value="true">
<label class="form-check-label" for="cleanFinal">Clean final output</label>
</div>
<div class="form-check">
<input type="checkbox" class="form-check-input" id="removeImagesAfter" name="removeImagesAfter" value="true">
<label class="form-check-label" for="removeImagesAfter">Remove images from output PDF</label>
</div>
</div>
<br>
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{ocr.submit}"></button> <button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{ocr.submit}"></button>
</form> </form>
<script th:inline="javascript"> <script th:inline="javascript">