restore OCRMyPDF and ghostscript compression

This commit is contained in:
Anthony Stirling 2025-06-30 22:27:45 +01:00
parent 11e3ccd19f
commit 782c30f934
13 changed files with 646 additions and 143 deletions

View File

@ -51,7 +51,6 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
tini \
bash \
curl \
qpdf \
shadow \
su-exec \
openssl \
@ -69,9 +68,11 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
tesseract-ocr-data-deu \
tesseract-ocr-data-fra \
tesseract-ocr-data-por \
unpaper \
# CV
py3-opencv \
python3 \
ocrmypdf \
py3-pip \
py3-pillow@testing \
py3-pdf2image@testing && \

View File

@ -76,16 +76,17 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
# pdftohtml
poppler-utils \
# OCR MY PDF (unpaper for deskew and other advanced features)
qpdf \
tesseract-ocr-data-eng \
tesseract-ocr-data-chi_sim \
tesseract-ocr-data-deu \
tesseract-ocr-data-fra \
tesseract-ocr-data-por \
unpaper \
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \
# CV
py3-opencv \
python3 \
ocrmypdf \
py3-pip \
py3-pillow@testing \
py3-pdf2image@testing && \

View File

@ -545,6 +545,8 @@ public class ApplicationProperties {
private int calibreSessionLimit;
private int qpdfSessionLimit;
private int tesseractSessionLimit;
private int ghostscriptSessionLimit;
private int ocrMyPdfSessionLimit;
public int getQpdfSessionLimit() {
return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2;
@ -577,6 +579,14 @@ public class ApplicationProperties {
public int getCalibreSessionLimit() {
return calibreSessionLimit > 0 ? calibreSessionLimit : 1;
}
/**
 * Returns the Ghostscript session limit.
 *
 * @return the configured value when it is positive, otherwise the fallback of 8
 */
public int getGhostscriptSessionLimit() {
    if (ghostscriptSessionLimit > 0) {
        return ghostscriptSessionLimit;
    }
    return 8;
}
/**
 * Returns the OCRmyPDF session limit.
 *
 * @return the configured value when it is positive, otherwise the fallback of 2
 */
public int getOcrMyPdfSessionLimit() {
    if (ocrMyPdfSessionLimit > 0) {
        return ocrMyPdfSessionLimit;
    }
    return 2;
}
}
@Data
@ -589,6 +599,8 @@ public class ApplicationProperties {
private long calibreTimeoutMinutes;
private long tesseractTimeoutMinutes;
private long qpdfTimeoutMinutes;
private long ghostscriptTimeoutMinutes;
private long ocrMyPdfTimeoutMinutes;
public long getTesseractTimeoutMinutes() {
return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30;
@ -621,6 +633,14 @@ public class ApplicationProperties {
public long getCalibreTimeoutMinutes() {
return calibreTimeoutMinutes > 0 ? calibreTimeoutMinutes : 30;
}
/**
 * Returns the Ghostscript timeout in minutes.
 *
 * @return the configured value when it is positive, otherwise the fallback of 30
 */
public long getGhostscriptTimeoutMinutes() {
    if (ghostscriptTimeoutMinutes > 0) {
        return ghostscriptTimeoutMinutes;
    }
    return 30;
}
/**
 * Returns the OCRmyPDF timeout in minutes.
 *
 * @return the configured value when it is positive, otherwise the fallback of 30
 */
public long getOcrMyPdfTimeoutMinutes() {
    if (ocrMyPdfTimeoutMinutes > 0) {
        return ocrMyPdfTimeoutMinutes;
    }
    return 30;
}
}
}
}

View File

@ -84,6 +84,16 @@ public class ProcessExecutor {
.getProcessExecutor()
.getSessionLimit()
.getCalibreSessionLimit();
case GHOSTSCRIPT ->
applicationProperties
.getProcessExecutor()
.getSessionLimit()
.getGhostscriptSessionLimit();
case OCR_MY_PDF ->
applicationProperties
.getProcessExecutor()
.getSessionLimit()
.getOcrMyPdfSessionLimit();
};
long timeoutMinutes =
@ -128,6 +138,16 @@ public class ProcessExecutor {
.getProcessExecutor()
.getTimeoutMinutes()
.getCalibreTimeoutMinutes();
case GHOSTSCRIPT ->
applicationProperties
.getProcessExecutor()
.getTimeoutMinutes()
.getGhostscriptTimeoutMinutes();
case OCR_MY_PDF ->
applicationProperties
.getProcessExecutor()
.getTimeoutMinutes()
.getOcrMyPdfTimeoutMinutes();
};
return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes);
});
@ -278,7 +298,9 @@ public class ProcessExecutor {
INSTALL_APP,
CALIBRE,
TESSERACT,
QPDF
QPDF,
GHOSTSCRIPT,
OCR_MY_PDF
}
public class ProcessExecutorResult {

View File

@ -9,7 +9,6 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.stream.Collectors;
import org.springframework.stereotype.Component;
@ -27,8 +26,7 @@ public class TempFileRegistry {
private final ConcurrentMap<Path, Instant> registeredFiles = new ConcurrentHashMap<>();
private final Set<Path> thirdPartyTempFiles =
Collections.newSetFromMap(new ConcurrentHashMap<>());
private final Set<Path> tempDirectories =
Collections.newSetFromMap(new ConcurrentHashMap<>());
private final Set<Path> tempDirectories = Collections.newSetFromMap(new ConcurrentHashMap<>());
/**
* Register a temporary file with the registry.

View File

@ -21,6 +21,8 @@ public class EndpointConfiguration {
private final ApplicationProperties applicationProperties;
private Map<String, Boolean> endpointStatuses = new ConcurrentHashMap<>();
private Map<String, Set<String>> endpointGroups = new ConcurrentHashMap<>();
private Set<String> disabledGroups = new HashSet<>();
private Map<String, Set<String>> endpointAlternatives = new ConcurrentHashMap<>();
private final boolean runningProOrHigher;
public EndpointConfiguration(
@ -51,16 +53,36 @@ public class EndpointConfiguration {
if (endpoint.startsWith("/")) {
endpoint = endpoint.substring(1);
}
// Check if endpoint has alternatives (multiple tools can handle it)
Set<String> alternatives = endpointAlternatives.get(endpoint);
if (alternatives != null && !alternatives.isEmpty()) {
// Endpoint is enabled if ANY of its alternative tools are enabled
for (String toolGroup : alternatives) {
if (isGroupEnabled(toolGroup)) {
return true;
}
}
return false; // All alternative tools are disabled
}
// Fallback to standard endpoint status check
return endpointStatuses.getOrDefault(endpoint, true);
}
public boolean isGroupEnabled(String group) {
// Check if group is explicitly disabled first
if (disabledGroups.contains(group)) {
return false;
}
Set<String> endpoints = endpointGroups.get(group);
if (endpoints == null || endpoints.isEmpty()) {
log.debug("Group '{}' does not exist or has no endpoints", group);
return false;
}
// Additional check: if all endpoints in group are disabled, consider group disabled
for (String endpoint : endpoints) {
if (!isEndpointEnabled(endpoint)) {
return false;
@ -74,7 +96,22 @@ public class EndpointConfiguration {
endpointGroups.computeIfAbsent(group, k -> new HashSet<>()).add(endpoint);
}
/**
 * Records {@code toolGroup} as one of the tool groups able to serve {@code endpoint}.
 *
 * @param endpoint the endpoint name used as the key in {@code endpointAlternatives}
 * @param toolGroup the tool group name to add to that endpoint's set of alternatives
 */
public void addEndpointAlternative(String endpoint, String toolGroup) {
    // Create the per-endpoint set on first use, then register the group in it.
    Set<String> toolGroups = endpointAlternatives.computeIfAbsent(endpoint, key -> new HashSet<>());
    toolGroups.add(toolGroup);
}
/**
 * Marks {@code group} as disabled and disables every endpoint registered under it.
 *
 * @param group the group name; it is added to {@code disabledGroups} even when no endpoints
 *     are registered for it
 */
public void disableGroup(String group) {
    disabledGroups.add(group);

    Set<String> members = endpointGroups.get(group);
    if (members == null) {
        // Nothing registered for this group; only the explicit flag is recorded.
        return;
    }
    for (String member : members) {
        disableEndpoint(member);
    }
}
public void enableGroup(String group) {
disabledGroups.remove(group);
Set<String> endpoints = endpointGroups.get(group);
if (endpoints != null) {
for (String endpoint : endpoints) {
@ -83,13 +120,8 @@ public class EndpointConfiguration {
}
}
public void disableGroup(String group) {
Set<String> endpoints = endpointGroups.get(group);
if (endpoints != null) {
for (String endpoint : endpoints) {
disableEndpoint(endpoint);
}
}
/**
 * Returns a copy of the explicitly disabled group names.
 *
 * @return a new {@link HashSet} holding the current contents of {@code disabledGroups};
 *     changes to the returned set do not affect this configuration
 */
public Set<String> getDisabledGroups() {
    Set<String> snapshot = new HashSet<>(disabledGroups);
    return snapshot;
}
public void logDisabledEndpointsSummary() {
@ -101,6 +133,12 @@ public class EndpointConfiguration {
.sorted()
.toList();
if (!disabledGroups.isEmpty()) {
log.info(
"Disabled groups: {}",
String.join(", ", disabledGroups.stream().sorted().toList()));
}
if (!disabledList.isEmpty()) {
log.info(
"Total disabled endpoints: {}. Disabled endpoints: {}",
@ -212,7 +250,6 @@ public class EndpointConfiguration {
// Unoconvert
addEndpointToGroup("Unoconvert", "file-to-pdf");
addEndpointToGroup("tesseract", "ocr-pdf");
// Java
addEndpointToGroup("Java", "merge-pdfs");
@ -261,8 +298,13 @@ public class EndpointConfiguration {
addEndpointToGroup("Javascript", "compare");
addEndpointToGroup("Javascript", "adjust-contrast");
// qpdf dependent endpoints
addEndpointToGroup("qpdf", "repair");
// Multi-tool endpoints - endpoints that can be handled by multiple tools
addEndpointAlternative("repair", "qpdf");
addEndpointAlternative("repair", "Ghostscript");
addEndpointAlternative("compress-pdf", "qpdf");
addEndpointAlternative("compress-pdf", "Ghostscript");
addEndpointAlternative("ocr-pdf", "tesseract");
addEndpointAlternative("ocr-pdf", "OCRmyPDF");
// Weasyprint dependent endpoints
addEndpointToGroup("Weasyprint", "html-to-pdf");

View File

@ -34,6 +34,8 @@ public class ExternalAppDepConfig {
new HashMap<>() {
{
put("gs", List.of("Ghostscript"));
put("ocrmypdf", List.of("OCRmyPDF"));
put("soffice", List.of("LibreOffice"));
put(weasyprintPath, List.of("Weasyprint"));
put("pdftohtml", List.of("Pdftohtml"));
@ -109,6 +111,8 @@ public class ExternalAppDepConfig {
@PostConstruct
public void checkDependencies() {
// Check core dependencies
checkDependencyAndDisableGroup("gs");
checkDependencyAndDisableGroup("ocrmypdf");
checkDependencyAndDisableGroup("tesseract");
checkDependencyAndDisableGroup("soffice");
checkDependencyAndDisableGroup("qpdf");

View File

@ -65,12 +65,14 @@ public class CompressController {
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final boolean qpdfEnabled;
private final boolean ghostscriptEnabled;
public CompressController(
CustomPDFDocumentFactory pdfDocumentFactory,
EndpointConfiguration endpointConfiguration) {
this.pdfDocumentFactory = pdfDocumentFactory;
this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
}
@Data
@ -697,25 +699,69 @@ public class CompressController {
boolean sizeMet = false;
boolean imageCompressionApplied = false;
boolean qpdfCompressionApplied = false;
if (qpdfEnabled && optimizeLevel <= 3) {
optimizeLevel = 4;
}
boolean externalCompressionApplied = false;
while (!sizeMet && optimizeLevel <= 9) {
// Apply image compression for levels 4-9
if ((optimizeLevel >= 3 || Boolean.TRUE.equals(convertToGrayscale))
&& !imageCompressionApplied) {
double scaleFactor = getScaleFactorForLevel(optimizeLevel);
float jpegQuality = getJpegQualityForLevel(optimizeLevel);
// Apply external compression first
if (!externalCompressionApplied) {
boolean ghostscriptSuccess = false;
// Compress images
// Try Ghostscript first if available - for ANY compression level
if (ghostscriptEnabled) {
try {
applyGhostscriptCompression(
request, optimizeLevel, currentFile, tempFiles);
log.info("Ghostscript compression applied successfully");
ghostscriptSuccess = true;
} catch (IOException e) {
log.warn("Ghostscript compression failed, trying fallback methods");
}
}
// Fallback to QPDF if Ghostscript failed or not available (levels 1-3 only)
if (!ghostscriptSuccess && qpdfEnabled && optimizeLevel <= 3) {
try {
applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
log.info("QPDF compression applied successfully");
} catch (IOException e) {
log.warn("QPDF compression also failed");
}
}
if (!ghostscriptSuccess && !qpdfEnabled) {
log.info(
"No external compression tools available, using image compression only");
}
externalCompressionApplied = true;
// Skip image compression if Ghostscript succeeded
if (ghostscriptSuccess) {
imageCompressionApplied = true;
}
}
// Apply image compression for levels 4+ only if Ghostscript didn't run
if ((optimizeLevel >= 4 || Boolean.TRUE.equals(convertToGrayscale))
&& !imageCompressionApplied) {
// Use different scale factors based on level
double scaleFactor =
switch (optimizeLevel) {
case 4 -> 0.95; // 95% of original size
case 5 -> 0.9; // 90% of original size
case 6 -> 0.8; // 80% of original size
case 7 -> 0.7; // 70% of original size
case 8 -> 0.65; // 65% of original size
case 9 -> 0.5; // 50% of original size
default -> 1.0;
};
log.info("Applying image compression with scale factor: {}", scaleFactor);
Path compressedImageFile =
compressImagesInPDF(
currentFile,
scaleFactor,
jpegQuality,
0.7f, // Default JPEG quality
Boolean.TRUE.equals(convertToGrayscale));
tempFiles.add(compressedImageFile);
@ -723,18 +769,6 @@ public class CompressController {
imageCompressionApplied = true;
}
// Apply QPDF compression for all levels
if (!qpdfCompressionApplied && qpdfEnabled) {
applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
qpdfCompressionApplied = true;
} else if (!qpdfCompressionApplied) {
// If QPDF is disabled, mark as applied and log
if (!qpdfEnabled) {
log.info("Skipping QPDF compression as QPDF group is disabled");
}
qpdfCompressionApplied = true;
}
// Check if target size reached or not in auto mode
long outputFileSize = Files.size(currentFile);
if (outputFileSize <= expectedOutputSize || !autoMode) {
@ -754,7 +788,7 @@ public class CompressController {
} else {
// Reset flags for next iteration with higher optimization level
imageCompressionApplied = false;
qpdfCompressionApplied = false;
externalCompressionApplied = false;
optimizeLevel = newOptimizeLevel;
}
}
@ -788,6 +822,96 @@ public class CompressController {
}
}
// Run Ghostscript compression
/**
 * Compresses {@code currentFile} in place by running Ghostscript's {@code pdfwrite} device.
 *
 * <p>Ghostscript writes to a fresh temp file (registered in {@code tempFiles}). That file is
 * copied over {@code currentFile} only when the process exits with code 0, so a failed run
 * leaves {@code currentFile} untouched.
 *
 * @param request the optimize request (not read by this method)
 * @param optimizeLevel selects the {@code -dPDFSETTINGS} preset and, for levels 6-10, explicit
 *     image resolutions; unlisted levels use the plain {@code /screen} preset
 * @param currentFile the PDF to compress; overwritten with the Ghostscript output on success
 * @param tempFiles collector for temp files created here
 * @throws IOException if Ghostscript cannot be run, is interrupted, exits with a non-zero code,
 *     or a file operation fails
 */
private void applyGhostscriptCompression(
        OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
        throws IOException {

    long preGsSize = Files.size(currentFile);
    log.info("Pre-Ghostscript file size: {}", GeneralUtils.formatBytes(preGsSize));

    // Create output file for Ghostscript
    Path gsOutputFile = Files.createTempFile("gs_output_", ".pdf");
    tempFiles.add(gsOutputFile);

    // Build Ghostscript command based on optimization level
    List<String> command = new ArrayList<>();
    command.add("gs");
    command.add("-sDEVICE=pdfwrite");
    command.add("-dCompatibilityLevel=1.5");
    command.add("-dNOPAUSE");
    command.add("-dQUIET");
    command.add("-dBATCH");

    // Map optimization levels to Ghostscript settings
    switch (optimizeLevel) {
        case 1 -> command.add("-dPDFSETTINGS=/prepress");
        case 2 -> command.add("-dPDFSETTINGS=/printer");
        case 3 -> command.add("-dPDFSETTINGS=/ebook");
        case 6, 7 ->
                command.addAll(
                        List.of(
                                "-dPDFSETTINGS=/screen",
                                "-dColorImageResolution=150",
                                "-dGrayImageResolution=150",
                                "-dMonoImageResolution=300"));
        case 8, 9 ->
                command.addAll(
                        List.of(
                                "-dPDFSETTINGS=/screen",
                                "-dColorImageResolution=100",
                                "-dGrayImageResolution=100",
                                "-dMonoImageResolution=200"));
        case 10 ->
                command.addAll(
                        List.of(
                                "-dPDFSETTINGS=/screen",
                                "-dColorImageResolution=72",
                                "-dGrayImageResolution=72",
                                "-dMonoImageResolution=150"));
        // Levels 4 and 5, and any unlisted level, use the plain /screen preset.
        default -> command.add("-dPDFSETTINGS=/screen");
    }

    command.add("-sOutputFile=" + gsOutputFile.toString());
    command.add(currentFile.toString());

    ProcessExecutorResult result;
    try {
        result =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
                        .runCommandWithOutputHandling(command);
    } catch (Exception e) {
        // Callers only handle IOException, so every launch failure is translated. Restore
        // the interrupt flag first so the interruption is not lost by the translation.
        if (e instanceof InterruptedException) {
            Thread.currentThread().interrupt();
        }
        log.warn("Ghostscript compression failed, will fallback to other methods", e);
        throw new IOException("Ghostscript compression failed", e);
    }

    // Checked outside the try block so a non-zero exit is logged and wrapped only once.
    if (result.getRc() != 0) {
        log.warn("Ghostscript compression failed with return code: {}", result.getRc());
        throw new IOException(
                "Ghostscript compression failed with return code: " + result.getRc());
    }

    // Update current file to the Ghostscript output
    Files.copy(gsOutputFile, currentFile, StandardCopyOption.REPLACE_EXISTING);

    long postGsSize = Files.size(currentFile);
    // Guard against a zero-byte input so the percentage is never NaN or infinite.
    double gsReduction = preGsSize > 0 ? 100.0 - ((postGsSize * 100.0) / preGsSize) : 0.0;
    log.info(
            "Post-Ghostscript file size: {} (reduced by {}%)",
            GeneralUtils.formatBytes(postGsSize), String.format("%.1f", gsReduction));
}
// Run QPDF compression
private void applyQpdfCompression(
OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)

View File

@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.misc;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.zip.ZipEntry;
@ -26,26 +27,42 @@ import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.config.EndpointConfiguration;
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
import stirling.software.common.model.ApplicationProperties;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.ProcessExecutor;
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
import stirling.software.common.util.TempDirectory;
import stirling.software.common.util.TempFile;
import stirling.software.common.util.TempFileManager;
import stirling.software.common.util.WebResponseUtils;
@RestController
@RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs")
@Slf4j
@RequiredArgsConstructor
public class OCRController {
private final ApplicationProperties applicationProperties;
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final TempFileManager tempFileManager;
private final boolean ocrMyPdfEnabled;
private final boolean tesseractEnabled;
public OCRController(
ApplicationProperties applicationProperties,
CustomPDFDocumentFactory pdfDocumentFactory,
TempFileManager tempFileManager,
EndpointConfiguration endpointConfiguration) {
this.applicationProperties = applicationProperties;
this.pdfDocumentFactory = pdfDocumentFactory;
this.tempFileManager = tempFileManager;
this.ocrMyPdfEnabled = endpointConfiguration.isGroupEnabled("OCRmyPDF");
this.tesseractEnabled = endpointConfiguration.isGroupEnabled("tesseract");
}
/** Gets the list of available Tesseract languages from the tessdata directory */
public List<String> getAvailableTesseractLanguages() {
@ -63,39 +80,261 @@ public class OCRController {
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
@Operation(
summary = "Process PDF files with OCR using Tesseract",
summary = "Process a PDF file with OCR",
description =
"Takes a PDF file as input, performs OCR using specified languages and OCR type"
+ " (skip-text/force-ocr), and returns the processed PDF. Input:PDF"
+ " Output:PDF Type:SISO")
"This endpoint processes a PDF file using OCR (Optical Character Recognition). "
+ "Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. "
+ "Uses OCRmyPDF if available, falls back to Tesseract. Input:PDF Output:PDF Type:SI-Conditional")
public ResponseEntity<byte[]> processPdfWithOCR(
@ModelAttribute ProcessPdfWithOcrRequest request)
throws IOException, InterruptedException {
MultipartFile inputFile = request.getFileInput();
List<String> languages = request.getLanguages();
List<String> selectedLanguages = request.getLanguages();
Boolean sidecar = request.isSidecar();
Boolean deskew = request.isDeskew();
Boolean clean = request.isClean();
Boolean cleanFinal = request.isCleanFinal();
String ocrType = request.getOcrType();
String ocrRenderType = request.getOcrRenderType();
Boolean removeImagesAfter = request.isRemoveImagesAfter();
// Create a temp directory using TempFileManager directly
Path tempDirPath = tempFileManager.createTempDirectory();
File tempDir = tempDirPath.toFile();
if (selectedLanguages == null || selectedLanguages.isEmpty()) {
throw new IOException("Please select at least one language.");
}
if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) {
throw new IOException("ocrRenderType wrong");
}
// Get available Tesseract languages
List<String> availableLanguages = getAvailableTesseractLanguages();
// Validate selected languages
selectedLanguages =
selectedLanguages.stream().filter(availableLanguages::contains).toList();
if (selectedLanguages.isEmpty()) {
throw new IOException("None of the selected languages are valid.");
}
// Use try-with-resources for proper temp file management
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
inputFile.transferTo(tempInputFile.getFile());
TempFile sidecarTextFile = null;
try {
File tempInputFile = new File(tempDir, "input.pdf");
File tempOutputDir = new File(tempDir, "output");
File tempImagesDir = new File(tempDir, "images");
File finalOutputFile = new File(tempDir, "final_output.pdf");
// Use OCRmyPDF if available (no fallback - error if it fails)
if (ocrMyPdfEnabled) {
if (sidecar != null && sidecar) {
sidecarTextFile = new TempFile(tempFileManager, ".txt");
}
processWithOcrMyPdf(
selectedLanguages,
sidecar,
deskew,
clean,
cleanFinal,
ocrType,
ocrRenderType,
removeImagesAfter,
tempInputFile.getPath(),
tempOutputFile.getPath(),
sidecarTextFile != null ? sidecarTextFile.getPath() : null);
log.info("OCRmyPDF processing completed successfully");
}
// Use Tesseract only if OCRmyPDF is not available
else if (tesseractEnabled) {
processWithTesseract(
selectedLanguages,
ocrType,
tempInputFile.getPath(),
tempOutputFile.getPath());
log.info("Tesseract processing completed successfully");
} else {
throw new IOException("No OCR tools are available");
}
// Read the processed PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath());
// Return the OCR processed PDF as a response
String outputFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "")
+ "_OCR.pdf";
if (sidecar != null && sidecar && sidecarTextFile != null) {
// Create a zip file containing both the PDF and the text file
String outputZipFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "")
+ "_OCR.zip";
try (TempFile tempZipFile = new TempFile(tempFileManager, ".zip");
ZipOutputStream zipOut =
new ZipOutputStream(
Files.newOutputStream(tempZipFile.getPath()))) {
// Add PDF file to the zip
ZipEntry pdfEntry = new ZipEntry(outputFilename);
zipOut.putNextEntry(pdfEntry);
zipOut.write(pdfBytes);
zipOut.closeEntry();
// Add text file to the zip
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
zipOut.putNextEntry(txtEntry);
Files.copy(sidecarTextFile.getPath(), zipOut);
zipOut.closeEntry();
zipOut.finish();
byte[] zipBytes = Files.readAllBytes(tempZipFile.getPath());
// Return the zip file containing both the PDF and the text file
return WebResponseUtils.bytesToWebResponse(
zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
}
} else {
// Return the OCR processed PDF as a response
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
} finally {
// Clean up sidecar temp file if created
if (sidecarTextFile != null) {
try {
sidecarTextFile.close();
} catch (Exception e) {
log.warn("Failed to close sidecar temp file", e);
}
}
}
}
}
/**
 * Runs the {@code ocrmypdf} command line tool on {@code tempInputFile} and writes the result to
 * {@code tempOutputFile}.
 *
 * <p>If the first run fails and its output mentions {@code multiprocessing/synchronize.py} and
 * {@code OSError: [Errno 38] Function not implemented}, the command is retried once with
 * {@code --jobs 1}. When {@code removeImagesAfter} is true, the OCR output is then passed
 * through Ghostscript with {@code -dFILTERIMAGE}, and the filtered file replaces
 * {@code tempOutputFile}.
 *
 * @param selectedLanguages language codes, joined with {@code +} for {@code --language}
 * @param sidecar when true and {@code sidecarTextPath} is non-null, adds {@code --sidecar}
 * @param deskew when true, adds {@code --deskew}
 * @param clean when true, adds {@code --clean}
 * @param cleanFinal when true, adds {@code --clean-final}
 * @param ocrType {@code "skip-text"} or {@code "force-ocr"} add the matching flag; any other
 *     value (including null) adds nothing
 * @param ocrRenderType value passed to {@code --pdf-renderer}
 * @param removeImagesAfter when true, strips images from the OCR output with Ghostscript
 * @param tempInputFile path of the PDF to process
 * @param tempOutputFile path the processed PDF is written to
 * @param sidecarTextPath path for the sidecar text file, or null
 * @throws IOException if OCRmyPDF or the Ghostscript image-removal step exits with a non-zero
 *     code, or a file operation fails
 * @throws InterruptedException propagated from the process executor
 */
private void processWithOcrMyPdf(
        List<String> selectedLanguages,
        Boolean sidecar,
        Boolean deskew,
        Boolean clean,
        Boolean cleanFinal,
        String ocrType,
        String ocrRenderType,
        Boolean removeImagesAfter,
        Path tempInputFile,
        Path tempOutputFile,
        Path sidecarTextPath)
        throws IOException, InterruptedException {

    // Build OCRmyPDF command
    String languageOption = String.join("+", selectedLanguages);

    List<String> command =
            new ArrayList<>(
                    Arrays.asList(
                            "ocrmypdf",
                            "--verbose",
                            "2",
                            "--output-type",
                            "pdf",
                            "--pdf-renderer",
                            ocrRenderType));

    if (Boolean.TRUE.equals(sidecar) && sidecarTextPath != null) {
        command.add("--sidecar");
        command.add(sidecarTextPath.toString());
    }
    if (Boolean.TRUE.equals(deskew)) {
        command.add("--deskew");
    }
    if (Boolean.TRUE.equals(clean)) {
        command.add("--clean");
    }
    if (Boolean.TRUE.equals(cleanFinal)) {
        command.add("--clean-final");
    }

    // Any other ocrType value (null, empty, "Normal", ...) adds no flag.
    if ("skip-text".equals(ocrType)) {
        command.add("--skip-text");
    } else if ("force-ocr".equals(ocrType)) {
        command.add("--force-ocr");
    }

    command.addAll(
            Arrays.asList(
                    "--language",
                    languageOption,
                    tempInputFile.toString(),
                    tempOutputFile.toString()));

    // Run CLI command
    ProcessExecutorResult result =
            ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                    .runCommandWithOutputHandling(command);

    // Retry single-threaded when the failure output shows the multiprocessing Errno 38 error.
    if (result.getRc() != 0
            && result.getMessages().contains("multiprocessing/synchronize.py")
            && result.getMessages().contains("OSError: [Errno 38] Function not implemented")) {
        command.add("--jobs");
        command.add("1");
        result =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                        .runCommandWithOutputHandling(command);
    }

    if (result.getRc() != 0) {
        throw new IOException("OCRmyPDF failed with return code: " + result.getRc());
    }

    // Remove images from the OCR processed PDF if the flag is set to true
    if (Boolean.TRUE.equals(removeImagesAfter)) {
        try (TempFile tempPdfWithoutImages = new TempFile(tempFileManager, "_no_images.pdf")) {
            List<String> gsCommand =
                    Arrays.asList(
                            "gs",
                            "-sDEVICE=pdfwrite",
                            "-dFILTERIMAGE",
                            "-o",
                            tempPdfWithoutImages.getPath().toString(),
                            tempOutputFile.toString());

            ProcessExecutorResult gsResult =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
                            .runCommandWithOutputHandling(gsCommand);

            // Without this check a failed Ghostscript run would overwrite the valid OCR
            // output with whatever the temp file holds.
            if (gsResult.getRc() != 0) {
                throw new IOException(
                        "Ghostscript image removal failed with return code: "
                                + gsResult.getRc());
            }

            // Replace output file with version without images
            Files.copy(
                    tempPdfWithoutImages.getPath(),
                    tempOutputFile,
                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
        }
    }
}
private void processWithTesseract(
List<String> selectedLanguages, String ocrType, Path tempInputFile, Path tempOutputFile)
throws IOException, InterruptedException {
// Create temp directory for Tesseract processing
try (TempDirectory tempDir = new TempDirectory(tempFileManager)) {
File tempOutputDir = new File(tempDir.getPath().toFile(), "output");
File tempImagesDir = new File(tempDir.getPath().toFile(), "images");
File finalOutputFile = new File(tempDir.getPath().toFile(), "final_output.pdf");
// Create directories
tempOutputDir.mkdirs();
tempImagesDir.mkdirs();
// Save input file
inputFile.transferTo(tempInputFile);
PDFMergerUtility merger = new PDFMergerUtility();
merger.setDestinationFileName(finalOutputFile.toString());
try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) {
try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
int pageCount = document.getNumberOfPages();
@ -135,35 +374,20 @@ public class OCRController {
new File(tempOutputDir, String.format("page_%d", pageNum))
.toString());
command.add("-l");
command.add(String.join("+", languages));
// Always output PDF
command.add("pdf");
command.add(String.join("+", selectedLanguages));
command.add("pdf"); // Always output PDF
// Use ProcessExecutor to run tesseract command
try {
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
.runCommandWithOutputHandling(command);
log.debug(
"Tesseract OCR completed for page {} with exit code {}",
pageNum,
result.getRc());
if (result.getRc() != 0) {
throw new RuntimeException(
"Tesseract failed with exit code: " + result.getRc());
}
// Add OCR'd PDF to merger
merger.addSource(pageOutputPath);
} catch (IOException | InterruptedException e) {
log.error(
"Error processing page {} with tesseract: {}",
pageNum,
e.getMessage());
// If OCR fails, fall back to the original page
try (PDDocument pageDoc = new PDDocument()) {
pageDoc.addPage(page);
pageDoc.save(pageOutputPath);
merger.addSource(pageOutputPath);
}
}
} else {
// Save original page without OCR
try (PDDocument pageDoc = new PDDocument()) {
@ -178,40 +402,11 @@ public class OCRController {
// Merge all pages into final PDF
merger.mergeDocuments(null);
// Read the final PDF file
byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath());
String outputFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "")
+ "_OCR.pdf";
return ResponseEntity.ok()
.header(
"Content-Disposition",
"attachment; filename=\"" + outputFilename + "\"")
.contentType(MediaType.APPLICATION_PDF)
.body(pdfContent);
} finally {
// Clean up the temp directory and all its contents
tempFileManager.deleteTempDirectory(tempDirPath);
}
}
private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
throws IOException {
if (!file.exists()) {
log.warn("File {} does not exist, skipping", file);
return;
}
try (FileInputStream fis = new FileInputStream(file)) {
ZipEntry zipEntry = new ZipEntry(filename);
zipOut.putNextEntry(zipEntry);
byte[] buffer = new byte[1024];
int length;
while ((length = fis.read(buffer)) >= 0) {
zipOut.write(buffer, 0, length);
}
zipOut.closeEntry();
// Copy final output to the expected location
Files.copy(
finalOutputFile.toPath(),
tempOutputFile,
java.nio.file.StandardCopyOption.REPLACE_EXISTING);
}
}
}

View File

@ -15,8 +15,7 @@ import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.RequiredArgsConstructor;
import stirling.software.SPDF.config.EndpointConfiguration;
import stirling.software.common.model.api.PDFFile;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.ProcessExecutor;
@ -28,17 +27,28 @@ import stirling.software.common.util.WebResponseUtils;
@RestController
@RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs")
@RequiredArgsConstructor
public class RepairController {
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final TempFileManager tempFileManager;
private final boolean ghostscriptEnabled;
private final boolean qpdfEnabled;
public RepairController(
CustomPDFDocumentFactory pdfDocumentFactory,
TempFileManager tempFileManager,
EndpointConfiguration endpointConfiguration) {
this.pdfDocumentFactory = pdfDocumentFactory;
this.tempFileManager = tempFileManager;
this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
}
@PostMapping(consumes = "multipart/form-data", value = "/repair")
@Operation(
summary = "Repair a PDF file",
description =
"This endpoint repairs a given PDF file by running qpdf command. The PDF is"
"This endpoint repairs a given PDF file by running Ghostscript (primary), qpdf (fallback), or PDFBox (if no external tools available). The PDF is"
+ " first saved to a temporary location, repaired, read back, and then"
+ " returned as a response. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile file)
@ -46,25 +56,72 @@ public class RepairController {
MultipartFile inputFile = file.getFileInput();
// Use TempFile with try-with-resources for automatic cleanup
try (TempFile tempFile = new TempFile(tempFileManager, ".pdf")) {
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
// Save the uploaded file to the temporary location
inputFile.transferTo(tempFile.getFile());
inputFile.transferTo(tempInputFile.getFile());
List<String> command = new ArrayList<>();
command.add("qpdf");
command.add("--replace-input"); // Automatically fixes problems it can
command.add("--qdf"); // Linearizes and normalizes PDF structure
command.add("--object-streams=disable"); // Can help with some corruptions
command.add(tempFile.getFile().getAbsolutePath());
boolean repairSuccess = false;
ProcessExecutorResult returnCode =
// Try Ghostscript first if available
if (ghostscriptEnabled) {
try {
List<String> gsCommand = new ArrayList<>();
gsCommand.add("gs");
gsCommand.add("-o");
gsCommand.add(tempOutputFile.getPath().toString());
gsCommand.add("-sDEVICE=pdfwrite");
gsCommand.add(tempInputFile.getPath().toString());
ProcessExecutorResult gsResult =
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
.runCommandWithOutputHandling(gsCommand);
if (gsResult.getRc() == 0) {
repairSuccess = true;
}
} catch (Exception e) {
// Log and continue to QPDF fallback
System.out.println(
"Ghostscript repair failed, trying QPDF fallback: " + e.getMessage());
}
}
// Fallback to QPDF if Ghostscript failed or not available
if (!repairSuccess && qpdfEnabled) {
List<String> qpdfCommand = new ArrayList<>();
qpdfCommand.add("qpdf");
qpdfCommand.add("--replace-input"); // Automatically fixes problems it can
qpdfCommand.add("--qdf"); // Linearizes and normalizes PDF structure
qpdfCommand.add("--object-streams=disable"); // Can help with some corruptions
qpdfCommand.add(tempInputFile.getPath().toString());
qpdfCommand.add(tempOutputFile.getPath().toString());
ProcessExecutorResult qpdfResult =
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
.runCommandWithOutputHandling(command);
.runCommandWithOutputHandling(qpdfCommand);
// Read the optimized PDF file
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempFile.getFile());
repairSuccess = true;
}
// Return the optimized PDF as a response
// Use PDFBox as last resort if no external tools are available
if (!repairSuccess) {
if (!ghostscriptEnabled && !qpdfEnabled) {
// Basic PDFBox repair - load and save to fix structural issues
try (var document = pdfDocumentFactory.load(tempInputFile.getFile())) {
document.save(tempOutputFile.getFile());
repairSuccess = true;
}
} else {
throw new IOException("PDF repair failed with available tools");
}
}
// Read the repaired PDF file
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.getFile());
// Return the repaired PDF as a response
String outputFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "")

View File

@ -19,6 +19,18 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
defaultValue = "[\"eng\"]")
private List<String> languages;
@Schema(description = "Include OCR text in a sidecar text file if set to true")
private boolean sidecar;
@Schema(description = "Deskew the input file if set to true")
private boolean deskew;
@Schema(description = "Clean the input file if set to true")
private boolean clean;
@Schema(description = "Clean the final output if set to true")
private boolean cleanFinal;
@Schema(
description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'",
requiredMode = Schema.RequiredMode.REQUIRED,
@ -31,4 +43,7 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
allowableValues = {"hocr", "sandwich"},
defaultValue = "hocr")
private String ocrRenderType = "hocr";
@Schema(description = "Remove images from the output PDF if set to true")
private boolean removeImagesAfter;
}

View File

@ -79,6 +79,30 @@
</select>
</div>
<br>
<!--
  OCR options: each checkbox submits "true" under the form field names sidecar, deskew, clean,
  cleanFinal and removeImagesAfter.
  NOTE(review): the th:if attribute on the wrapper below has no expression - confirm the intended
  condition (presumably "only when OCRmyPDF is available") or remove the attribute.
  NOTE(review): these labels are hard-coded English, while the submit button uses a message key
  (ocr.submit) - consider moving them to message keys as well.
-->
<div class="mb-3" th:if>
<label class="form-label">OCR Options</label>
<div class="form-check">
<input type="checkbox" class="form-check-input" id="sidecar" name="sidecar" value="true">
<label class="form-check-label" for="sidecar">Include OCR text in sidecar text file</label>
</div>
<div class="form-check">
<input type="checkbox" class="form-check-input" id="deskew" name="deskew" value="true">
<label class="form-check-label" for="deskew">Deskew input file</label>
</div>
<div class="form-check">
<input type="checkbox" class="form-check-input" id="clean" name="clean" value="true">
<label class="form-check-label" for="clean">Clean input file</label>
</div>
<div class="form-check">
<input type="checkbox" class="form-check-input" id="cleanFinal" name="cleanFinal" value="true">
<label class="form-check-label" for="cleanFinal">Clean final output</label>
</div>
<div class="form-check">
<input type="checkbox" class="form-check-input" id="removeImagesAfter" name="removeImagesAfter" value="true">
<label class="form-check-label" for="removeImagesAfter">Remove images from output PDF</label>
</div>
</div>
<br>
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{ocr.submit}"></button>
</form>
<script th:inline="javascript">