From 782c30f934d06128aecb0530e2278324173237dd Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com.> Date: Mon, 30 Jun 2025 22:27:45 +0100 Subject: [PATCH] restore OCRMyPDF and ghostscript compression --- Dockerfile | 3 +- Dockerfile.fat | 3 +- .../common/model/ApplicationProperties.java | 20 + .../service/TempFileCleanupService.java | 4 +- .../software/common/util/ProcessExecutor.java | 24 +- .../common/util/TempFileRegistry.java | 10 +- .../SPDF/config/EndpointConfiguration.java | 62 ++- .../SPDF/config/ExternalAppDepConfig.java | 4 + .../api/misc/CompressController.java | 174 +++++++-- .../controller/api/misc/OCRController.java | 353 ++++++++++++++---- .../controller/api/misc/RepairController.java | 93 ++++- .../api/misc/ProcessPdfWithOcrRequest.java | 15 + .../resources/templates/misc/ocr-pdf.html | 24 ++ 13 files changed, 646 insertions(+), 143 deletions(-) diff --git a/Dockerfile b/Dockerfile index fd02b29f7..1edf05841 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,7 +51,6 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a tini \ bash \ curl \ - qpdf \ shadow \ su-exec \ openssl \ @@ -69,9 +68,11 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a tesseract-ocr-data-deu \ tesseract-ocr-data-fra \ tesseract-ocr-data-por \ + unpaper \ # CV py3-opencv \ python3 \ + ocrmypdf \ py3-pip \ py3-pillow@testing \ py3-pdf2image@testing && \ diff --git a/Dockerfile.fat b/Dockerfile.fat index 666ba98be..976c1ee17 100644 --- a/Dockerfile.fat +++ b/Dockerfile.fat @@ -76,16 +76,17 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a # pdftohtml poppler-utils \ # OCR MY PDF (unpaper for descew and other advanced featues) - qpdf \ tesseract-ocr-data-eng \ tesseract-ocr-data-chi_sim \ tesseract-ocr-data-deu \ tesseract-ocr-data-fra \ tesseract-ocr-data-por \ + unpaper \ font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \ # CV py3-opencv \ python3 \ + ocrmypdf \ py3-pip \ py3-pillow@testing \ py3-pdf2image@testing && \ diff --git a/common/src/main/java/stirling/software/common/model/ApplicationProperties.java b/common/src/main/java/stirling/software/common/model/ApplicationProperties.java index 0017fa34a..e4edf2baa 100644 --- a/common/src/main/java/stirling/software/common/model/ApplicationProperties.java +++ b/common/src/main/java/stirling/software/common/model/ApplicationProperties.java @@ -545,6 +545,8 @@ public class ApplicationProperties { private int calibreSessionLimit; private int qpdfSessionLimit; private int tesseractSessionLimit; + private int ghostscriptSessionLimit; + private int ocrMyPdfSessionLimit; public int getQpdfSessionLimit() { return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2; @@ -577,6 +579,14 @@ public class ApplicationProperties { public int getCalibreSessionLimit() { return calibreSessionLimit > 0 ? calibreSessionLimit : 1; } + + public int getGhostscriptSessionLimit() { + return ghostscriptSessionLimit > 0 ? ghostscriptSessionLimit : 8; + } + + public int getOcrMyPdfSessionLimit() { + return ocrMyPdfSessionLimit > 0 ? ocrMyPdfSessionLimit : 2; + } } @Data @@ -589,6 +599,8 @@ public class ApplicationProperties { private long calibreTimeoutMinutes; private long tesseractTimeoutMinutes; private long qpdfTimeoutMinutes; + private long ghostscriptTimeoutMinutes; + private long ocrMyPdfTimeoutMinutes; public long getTesseractTimeoutMinutes() { return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30; @@ -621,6 +633,14 @@ public class ApplicationProperties { public long getCalibreTimeoutMinutes() { return calibreTimeoutMinutes > 0 ? calibreTimeoutMinutes : 30; } + + public long getGhostscriptTimeoutMinutes() { + return ghostscriptTimeoutMinutes > 0 ? ghostscriptTimeoutMinutes : 30; + } + + public long getOcrMyPdfTimeoutMinutes() { + return ocrMyPdfTimeoutMinutes > 0 ? ocrMyPdfTimeoutMinutes : 30; + } } } } diff --git a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java index d53c4ea84..895aa70de 100644 --- a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java +++ b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java @@ -308,7 +308,7 @@ public class TempFileCleanupService { } java.util.List subdirectories = new java.util.ArrayList<>(); - + try (Stream pathStream = Files.list(directory)) { pathStream.forEach( path -> { @@ -347,7 +347,7 @@ public class TempFileCleanupService { } }); } - + for (Path subdirectory : subdirectories) { try { cleanupDirectoryStreaming( diff --git a/common/src/main/java/stirling/software/common/util/ProcessExecutor.java b/common/src/main/java/stirling/software/common/util/ProcessExecutor.java index 09c5ff675..ee7297153 100644 --- a/common/src/main/java/stirling/software/common/util/ProcessExecutor.java +++ b/common/src/main/java/stirling/software/common/util/ProcessExecutor.java @@ -84,6 +84,16 @@ public class ProcessExecutor { .getProcessExecutor() .getSessionLimit() .getCalibreSessionLimit(); + case GHOSTSCRIPT -> + applicationProperties + .getProcessExecutor() + .getSessionLimit() + .getGhostscriptSessionLimit(); + case OCR_MY_PDF -> + applicationProperties + .getProcessExecutor() + .getSessionLimit() + .getOcrMyPdfSessionLimit(); }; long timeoutMinutes = @@ -128,6 +138,16 @@ public class ProcessExecutor { .getProcessExecutor() .getTimeoutMinutes() .getCalibreTimeoutMinutes(); + case GHOSTSCRIPT -> + applicationProperties + .getProcessExecutor() + .getTimeoutMinutes() + .getGhostscriptTimeoutMinutes(); + case OCR_MY_PDF -> + applicationProperties + .getProcessExecutor() + .getTimeoutMinutes() + .getOcrMyPdfTimeoutMinutes(); }; return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes); }); @@ -278,7 +298,9 @@ public class ProcessExecutor { INSTALL_APP, CALIBRE, TESSERACT, - QPDF + QPDF, + GHOSTSCRIPT, + OCR_MY_PDF } public class ProcessExecutorResult { diff --git a/common/src/main/java/stirling/software/common/util/TempFileRegistry.java b/common/src/main/java/stirling/software/common/util/TempFileRegistry.java index 1e55c6b15..323b3bff3 100644 --- a/common/src/main/java/stirling/software/common/util/TempFileRegistry.java +++ b/common/src/main/java/stirling/software/common/util/TempFileRegistry.java @@ -9,7 +9,6 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.ConcurrentSkipListSet; import java.util.stream.Collectors; import org.springframework.stereotype.Component; @@ -24,11 +23,10 @@ import lombok.extern.slf4j.Slf4j; @Component public class TempFileRegistry { - private final ConcurrentMap registeredFiles = new ConcurrentHashMap<>(); - private final Set thirdPartyTempFiles = - Collections.newSetFromMap(new ConcurrentHashMap<>()); - private final Set tempDirectories = - Collections.newSetFromMap(new ConcurrentHashMap<>()); + private final ConcurrentMap registeredFiles = new ConcurrentHashMap<>(); + private final Set thirdPartyTempFiles = + Collections.newSetFromMap(new ConcurrentHashMap<>()); + private final Set tempDirectories = Collections.newSetFromMap(new ConcurrentHashMap<>()); /** * Register a temporary file with the registry. diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java b/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java index 361eeace3..25115acb1 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java @@ -21,6 +21,8 @@ public class EndpointConfiguration { private final ApplicationProperties applicationProperties; private Map endpointStatuses = new ConcurrentHashMap<>(); private Map> endpointGroups = new ConcurrentHashMap<>(); + private Set disabledGroups = new HashSet<>(); + private Map> endpointAlternatives = new ConcurrentHashMap<>(); private final boolean runningProOrHigher; public EndpointConfiguration( @@ -51,16 +53,36 @@ public class EndpointConfiguration { if (endpoint.startsWith("/")) { endpoint = endpoint.substring(1); } + + // Check if endpoint has alternatives (multiple tools can handle it) + Set alternatives = endpointAlternatives.get(endpoint); + if (alternatives != null && !alternatives.isEmpty()) { + // Endpoint is enabled if ANY of its alternative tools are enabled + for (String toolGroup : alternatives) { + if (isGroupEnabled(toolGroup)) { + return true; + } + } + return false; // All alternative tools are disabled + } + + // Fallback to standard endpoint status check return endpointStatuses.getOrDefault(endpoint, true); } public boolean isGroupEnabled(String group) { + // Check if group is explicitly disabled first + if (disabledGroups.contains(group)) { + return false; + } + Set endpoints = endpointGroups.get(group); if (endpoints == null || endpoints.isEmpty()) { log.debug("Group '{}' does not exist or has no endpoints", group); return false; } + // Additional check: if all endpoints in group are disabled, consider group disabled for (String endpoint : endpoints) { if (!isEndpointEnabled(endpoint)) { return false; @@ -73,8 +95,23 @@ public class EndpointConfiguration { public void addEndpointToGroup(String group, String endpoint) { endpointGroups.computeIfAbsent(group, k -> new HashSet<>()).add(endpoint); } + + public void addEndpointAlternative(String endpoint, String toolGroup) { + endpointAlternatives.computeIfAbsent(endpoint, k -> new HashSet<>()).add(toolGroup); + } + + public void disableGroup(String group) { + disabledGroups.add(group); + Set endpoints = endpointGroups.get(group); + if (endpoints != null) { + for (String endpoint : endpoints) { + disableEndpoint(endpoint); + } + } + } public void enableGroup(String group) { + disabledGroups.remove(group); Set endpoints = endpointGroups.get(group); if (endpoints != null) { for (String endpoint : endpoints) { @@ -83,13 +120,8 @@ public class EndpointConfiguration { } } - public void disableGroup(String group) { - Set endpoints = endpointGroups.get(group); - if (endpoints != null) { - for (String endpoint : endpoints) { - disableEndpoint(endpoint); - } - } + public Set getDisabledGroups() { + return new HashSet<>(disabledGroups); } public void logDisabledEndpointsSummary() { @@ -101,6 +133,12 @@ public class EndpointConfiguration { .sorted() .toList(); + if (!disabledGroups.isEmpty()) { + log.info( + "Disabled groups: {}", + String.join(", ", disabledGroups.stream().sorted().toList())); + } + if (!disabledList.isEmpty()) { log.info( "Total disabled endpoints: {}. Disabled endpoints: {}", @@ -212,7 +250,6 @@ public class EndpointConfiguration { // Unoconvert addEndpointToGroup("Unoconvert", "file-to-pdf"); - addEndpointToGroup("tesseract", "ocr-pdf"); // Java addEndpointToGroup("Java", "merge-pdfs"); @@ -261,8 +298,13 @@ public class EndpointConfiguration { addEndpointToGroup("Javascript", "compare"); addEndpointToGroup("Javascript", "adjust-contrast"); - // qpdf dependent endpoints - addEndpointToGroup("qpdf", "repair"); + // Multi-tool endpoints - endpoints that can be handled by multiple tools + addEndpointAlternative("repair", "qpdf"); + addEndpointAlternative("repair", "Ghostscript"); + addEndpointAlternative("compress-pdf", "qpdf"); + addEndpointAlternative("compress-pdf", "Ghostscript"); + addEndpointAlternative("ocr-pdf", "tesseract"); + addEndpointAlternative("ocr-pdf", "OCRmyPDF"); // Weasyprint dependent endpoints addEndpointToGroup("Weasyprint", "html-to-pdf"); diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java b/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java index 7dd6d2b3b..6d857c679 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java @@ -34,6 +34,8 @@ public class ExternalAppDepConfig { new HashMap<>() { { + put("gs", List.of("Ghostscript")); + put("ocrmypdf", List.of("OCRmyPDF")); put("soffice", List.of("LibreOffice")); put(weasyprintPath, List.of("Weasyprint")); put("pdftohtml", List.of("Pdftohtml")); @@ -109,6 +111,8 @@ public class ExternalAppDepConfig { @PostConstruct public void checkDependencies() { // Check core dependencies + checkDependencyAndDisableGroup("gs"); + checkDependencyAndDisableGroup("ocrmypdf"); checkDependencyAndDisableGroup("tesseract"); checkDependencyAndDisableGroup("soffice"); checkDependencyAndDisableGroup("qpdf"); diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java index 8509f5056..13828d88f 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java @@ -65,12 +65,14 @@ public class CompressController { private final CustomPDFDocumentFactory pdfDocumentFactory; private final boolean qpdfEnabled; + private final boolean ghostscriptEnabled; public CompressController( CustomPDFDocumentFactory pdfDocumentFactory, EndpointConfiguration endpointConfiguration) { this.pdfDocumentFactory = pdfDocumentFactory; this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf"); + this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript"); } @Data @@ -697,25 +699,69 @@ public class CompressController { boolean sizeMet = false; boolean imageCompressionApplied = false; - boolean qpdfCompressionApplied = false; - - if (qpdfEnabled && optimizeLevel <= 3) { - optimizeLevel = 4; - } + boolean externalCompressionApplied = false; while (!sizeMet && optimizeLevel <= 9) { - // Apply image compression for levels 4-9 - if ((optimizeLevel >= 3 || Boolean.TRUE.equals(convertToGrayscale)) - && !imageCompressionApplied) { - double scaleFactor = getScaleFactorForLevel(optimizeLevel); - float jpegQuality = getJpegQualityForLevel(optimizeLevel); + // Apply external compression first + if (!externalCompressionApplied) { + boolean ghostscriptSuccess = false; - // Compress images + // Try Ghostscript first if available - for ANY compression level + if (ghostscriptEnabled) { + try { + applyGhostscriptCompression( + request, optimizeLevel, currentFile, tempFiles); + log.info("Ghostscript compression applied successfully"); + ghostscriptSuccess = true; + } catch (IOException e) { + log.warn("Ghostscript compression failed, trying fallback methods"); + } + } + + // Fallback to QPDF if Ghostscript failed or not available (levels 1-3 only) + if (!ghostscriptSuccess && qpdfEnabled && optimizeLevel <= 3) { + try { + applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles); + log.info("QPDF compression applied successfully"); + } catch (IOException e) { + log.warn("QPDF compression also failed"); + } + } + + if (!ghostscriptSuccess && !qpdfEnabled) { + log.info( + "No external compression tools available, using image compression only"); + } + + externalCompressionApplied = true; + + // Skip image compression if Ghostscript succeeded + if (ghostscriptSuccess) { + imageCompressionApplied = true; + } + } + + // Apply image compression for levels 4+ only if Ghostscript didn't run + if ((optimizeLevel >= 4 || Boolean.TRUE.equals(convertToGrayscale)) + && !imageCompressionApplied) { + // Use different scale factors based on level + double scaleFactor = + switch (optimizeLevel) { + case 4 -> 0.95; // 95% of original size + case 5 -> 0.9; // 90% of original size + case 6 -> 0.8; // 80% of original size + case 7 -> 0.7; // 70% of original size + case 8 -> 0.65; // 65% of original size + case 9 -> 0.5; // 50% of original size + default -> 1.0; + }; + + log.info("Applying image compression with scale factor: {}", scaleFactor); Path compressedImageFile = compressImagesInPDF( currentFile, scaleFactor, - jpegQuality, + 0.7f, // Default JPEG quality Boolean.TRUE.equals(convertToGrayscale)); tempFiles.add(compressedImageFile); @@ -723,18 +769,6 @@ public class CompressController { imageCompressionApplied = true; } - // Apply QPDF compression for all levels - if (!qpdfCompressionApplied && qpdfEnabled) { - applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles); - qpdfCompressionApplied = true; - } else if (!qpdfCompressionApplied) { - // If QPDF is disabled, mark as applied and log - if (!qpdfEnabled) { - log.info("Skipping QPDF compression as QPDF group is disabled"); - } - qpdfCompressionApplied = true; - } - // Check if target size reached or not in auto mode long outputFileSize = Files.size(currentFile); if (outputFileSize <= expectedOutputSize || !autoMode) { @@ -754,7 +788,7 @@ public class CompressController { } else { // Reset flags for next iteration with higher optimization level imageCompressionApplied = false; - qpdfCompressionApplied = false; + externalCompressionApplied = false; optimizeLevel = newOptimizeLevel; } } @@ -788,6 +822,96 @@ public class CompressController { } } + // Run Ghostscript compression + private void applyGhostscriptCompression( + OptimizePdfRequest request, int optimizeLevel, Path currentFile, List tempFiles) + throws IOException { + + long preGsSize = Files.size(currentFile); + log.info("Pre-Ghostscript file size: {}", GeneralUtils.formatBytes(preGsSize)); + + // Create output file for Ghostscript + Path gsOutputFile = Files.createTempFile("gs_output_", ".pdf"); + tempFiles.add(gsOutputFile); + + // Build Ghostscript command based on optimization level + List command = new ArrayList<>(); + command.add("gs"); + command.add("-sDEVICE=pdfwrite"); + command.add("-dCompatibilityLevel=1.5"); + command.add("-dNOPAUSE"); + command.add("-dQUIET"); + command.add("-dBATCH"); + + // Map optimization levels to Ghostscript settings + switch (optimizeLevel) { + case 1: + command.add("-dPDFSETTINGS=/prepress"); + break; + case 2: + command.add("-dPDFSETTINGS=/printer"); + break; + case 3: + command.add("-dPDFSETTINGS=/ebook"); + break; + case 4: + case 5: + command.add("-dPDFSETTINGS=/screen"); + break; + case 6: + case 7: + command.add("-dPDFSETTINGS=/screen"); + command.add("-dColorImageResolution=150"); + command.add("-dGrayImageResolution=150"); + command.add("-dMonoImageResolution=300"); + break; + case 8: + case 9: + command.add("-dPDFSETTINGS=/screen"); + command.add("-dColorImageResolution=100"); + command.add("-dGrayImageResolution=100"); + command.add("-dMonoImageResolution=200"); + break; + case 10: + command.add("-dPDFSETTINGS=/screen"); + command.add("-dColorImageResolution=72"); + command.add("-dGrayImageResolution=72"); + command.add("-dMonoImageResolution=150"); + break; + default: + command.add("-dPDFSETTINGS=/screen"); + break; + } + + command.add("-sOutputFile=" + gsOutputFile.toString()); + command.add(currentFile.toString()); + + ProcessExecutorResult returnCode = null; + try { + returnCode = + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(command); + + if (returnCode.getRc() == 0) { + // Update current file to the Ghostscript output + Files.copy(gsOutputFile, currentFile, StandardCopyOption.REPLACE_EXISTING); + + long postGsSize = Files.size(currentFile); + double gsReduction = 100.0 - ((postGsSize * 100.0) / preGsSize); + log.info( + "Post-Ghostscript file size: {} (reduced by {}%)", + GeneralUtils.formatBytes(postGsSize), String.format("%.1f", gsReduction)); + } else { + log.warn("Ghostscript compression failed with return code: {}", returnCode.getRc()); + throw new IOException("Ghostscript compression failed"); + } + + } catch (Exception e) { + log.warn("Ghostscript compression failed, will fallback to other methods", e); + throw new IOException("Ghostscript compression failed", e); + } + } + // Run QPDF compression private void applyQpdfCompression( OptimizePdfRequest request, int optimizeLevel, Path currentFile, List tempFiles) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java index 93061b570..4721d86b9 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java @@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.misc; import java.awt.image.BufferedImage; import java.io.*; +import java.nio.file.Files; import java.nio.file.Path; import java.util.*; import java.util.zip.ZipEntry; @@ -26,26 +27,42 @@ import io.github.pixee.security.Filenames; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; -import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import stirling.software.SPDF.config.EndpointConfiguration; import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest; import stirling.software.common.model.ApplicationProperties; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.ProcessExecutor; import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult; +import stirling.software.common.util.TempDirectory; +import stirling.software.common.util.TempFile; import stirling.software.common.util.TempFileManager; +import stirling.software.common.util.WebResponseUtils; @RestController @RequestMapping("/api/v1/misc") @Tag(name = "Misc", description = "Miscellaneous APIs") @Slf4j -@RequiredArgsConstructor public class OCRController { private final ApplicationProperties applicationProperties; private final CustomPDFDocumentFactory pdfDocumentFactory; private final TempFileManager tempFileManager; + private final boolean ocrMyPdfEnabled; + private final boolean tesseractEnabled; + + public OCRController( + ApplicationProperties applicationProperties, + CustomPDFDocumentFactory pdfDocumentFactory, + TempFileManager tempFileManager, + EndpointConfiguration endpointConfiguration) { + this.applicationProperties = applicationProperties; + this.pdfDocumentFactory = pdfDocumentFactory; + this.tempFileManager = tempFileManager; + this.ocrMyPdfEnabled = endpointConfiguration.isGroupEnabled("OCRmyPDF"); + this.tesseractEnabled = endpointConfiguration.isGroupEnabled("tesseract"); + } /** Gets the list of available Tesseract languages from the tessdata directory */ public List getAvailableTesseractLanguages() { @@ -63,39 +80,261 @@ public class OCRController { @PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf") @Operation( - summary = "Process PDF files with OCR using Tesseract", + summary = "Process a PDF file with OCR", description = - "Takes a PDF file as input, performs OCR using specified languages and OCR type" - + " (skip-text/force-ocr), and returns the processed PDF. Input:PDF" - + " Output:PDF Type:SISO") + "This endpoint processes a PDF file using OCR (Optical Character Recognition). " + + "Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. " + + "Uses OCRmyPDF if available, falls back to Tesseract. Input:PDF Output:PDF Type:SI-Conditional") public ResponseEntity processPdfWithOCR( @ModelAttribute ProcessPdfWithOcrRequest request) throws IOException, InterruptedException { MultipartFile inputFile = request.getFileInput(); - List languages = request.getLanguages(); + List selectedLanguages = request.getLanguages(); + Boolean sidecar = request.isSidecar(); + Boolean deskew = request.isDeskew(); + Boolean clean = request.isClean(); + Boolean cleanFinal = request.isCleanFinal(); String ocrType = request.getOcrType(); + String ocrRenderType = request.getOcrRenderType(); + Boolean removeImagesAfter = request.isRemoveImagesAfter(); - // Create a temp directory using TempFileManager directly - Path tempDirPath = tempFileManager.createTempDirectory(); - File tempDir = tempDirPath.toFile(); + if (selectedLanguages == null || selectedLanguages.isEmpty()) { + throw new IOException("Please select at least one language."); + } - try { - File tempInputFile = new File(tempDir, "input.pdf"); - File tempOutputDir = new File(tempDir, "output"); - File tempImagesDir = new File(tempDir, "images"); - File finalOutputFile = new File(tempDir, "final_output.pdf"); + if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) { + throw new IOException("ocrRenderType wrong"); + } + + // Get available Tesseract languages + List availableLanguages = getAvailableTesseractLanguages(); + + // Validate selected languages + selectedLanguages = + selectedLanguages.stream().filter(availableLanguages::contains).toList(); + + if (selectedLanguages.isEmpty()) { + throw new IOException("None of the selected languages are valid."); + } + + // Use try-with-resources for proper temp file management + try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf"); + TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) { + + inputFile.transferTo(tempInputFile.getFile()); + + TempFile sidecarTextFile = null; + + try { + // Use OCRmyPDF if available (no fallback - error if it fails) + if (ocrMyPdfEnabled) { + if (sidecar != null && sidecar) { + sidecarTextFile = new TempFile(tempFileManager, ".txt"); + } + + processWithOcrMyPdf( + selectedLanguages, + sidecar, + deskew, + clean, + cleanFinal, + ocrType, + ocrRenderType, + removeImagesAfter, + tempInputFile.getPath(), + tempOutputFile.getPath(), + sidecarTextFile != null ? sidecarTextFile.getPath() : null); + log.info("OCRmyPDF processing completed successfully"); + } + // Use Tesseract only if OCRmyPDF is not available + else if (tesseractEnabled) { + processWithTesseract( + selectedLanguages, + ocrType, + tempInputFile.getPath(), + tempOutputFile.getPath()); + log.info("Tesseract processing completed successfully"); + } else { + throw new IOException("No OCR tools are available"); + } + + // Read the processed PDF file + byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath()); + + // Return the OCR processed PDF as a response + String outputFilename = + Filenames.toSimpleFileName(inputFile.getOriginalFilename()) + .replaceFirst("[.][^.]+$", "") + + "_OCR.pdf"; + + if (sidecar != null && sidecar && sidecarTextFile != null) { + // Create a zip file containing both the PDF and the text file + String outputZipFilename = + Filenames.toSimpleFileName(inputFile.getOriginalFilename()) + .replaceFirst("[.][^.]+$", "") + + "_OCR.zip"; + + try (TempFile tempZipFile = new TempFile(tempFileManager, ".zip"); + ZipOutputStream zipOut = + new ZipOutputStream( + Files.newOutputStream(tempZipFile.getPath()))) { + + // Add PDF file to the zip + ZipEntry pdfEntry = new ZipEntry(outputFilename); + zipOut.putNextEntry(pdfEntry); + zipOut.write(pdfBytes); + zipOut.closeEntry(); + + // Add text file to the zip + ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt")); + zipOut.putNextEntry(txtEntry); + Files.copy(sidecarTextFile.getPath(), zipOut); + zipOut.closeEntry(); + + zipOut.finish(); + + byte[] zipBytes = Files.readAllBytes(tempZipFile.getPath()); + + // Return the zip file containing both the PDF and the text file + return WebResponseUtils.bytesToWebResponse( + zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM); + } + } else { + // Return the OCR processed PDF as a response + return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename); + } + + } finally { + // Clean up sidecar temp file if created + if (sidecarTextFile != null) { + try { + sidecarTextFile.close(); + } catch (Exception e) { + log.warn("Failed to close sidecar temp file", e); + } + } + } + } + } + + private void processWithOcrMyPdf( + List selectedLanguages, + Boolean sidecar, + Boolean deskew, + Boolean clean, + Boolean cleanFinal, + String ocrType, + String ocrRenderType, + Boolean removeImagesAfter, + Path tempInputFile, + Path tempOutputFile, + Path sidecarTextPath) + throws IOException, InterruptedException { + + // Build OCRmyPDF command + String languageOption = String.join("+", selectedLanguages); + + List command = + new ArrayList<>( + Arrays.asList( + "ocrmypdf", + "--verbose", + "2", + "--output-type", + "pdf", + "--pdf-renderer", + ocrRenderType)); + + if (sidecar != null && sidecar && sidecarTextPath != null) { + command.add("--sidecar"); + command.add(sidecarTextPath.toString()); + } + + if (deskew != null && deskew) { + command.add("--deskew"); + } + if (clean != null && clean) { + command.add("--clean"); + } + if (cleanFinal != null && cleanFinal) { + command.add("--clean-final"); + } + if (ocrType != null && !"".equals(ocrType)) { + if ("skip-text".equals(ocrType)) { + command.add("--skip-text"); + } else if ("force-ocr".equals(ocrType)) { + command.add("--force-ocr"); + } + } + + command.addAll( + Arrays.asList( + "--language", + languageOption, + tempInputFile.toString(), + tempOutputFile.toString())); + + // Run CLI command + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF) + .runCommandWithOutputHandling(command); + + if (result.getRc() != 0 + && result.getMessages().contains("multiprocessing/synchronize.py") + && result.getMessages().contains("OSError: [Errno 38] Function not implemented")) { + command.add("--jobs"); + command.add("1"); + result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF) + .runCommandWithOutputHandling(command); + } + + if (result.getRc() != 0) { + throw new IOException("OCRmyPDF failed with return code: " + result.getRc()); + } + + // Remove images from the OCR processed PDF if the flag is set to true + if (removeImagesAfter != null && removeImagesAfter) { + try (TempFile tempPdfWithoutImages = new TempFile(tempFileManager, "_no_images.pdf")) { + List gsCommand = + Arrays.asList( + "gs", + "-sDEVICE=pdfwrite", + "-dFILTERIMAGE", + "-o", + tempPdfWithoutImages.getPath().toString(), + tempOutputFile.toString()); + + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(gsCommand); + + // Replace output file with version without images + Files.copy( + tempPdfWithoutImages.getPath(), + tempOutputFile, + java.nio.file.StandardCopyOption.REPLACE_EXISTING); + } + } + } + + private void processWithTesseract( + List selectedLanguages, String ocrType, Path tempInputFile, Path tempOutputFile) + throws IOException, InterruptedException { + + // Create temp directory for Tesseract processing + try (TempDirectory tempDir = new TempDirectory(tempFileManager)) { + File tempOutputDir = new File(tempDir.getPath().toFile(), "output"); + File tempImagesDir = new File(tempDir.getPath().toFile(), "images"); + File finalOutputFile = new File(tempDir.getPath().toFile(), "final_output.pdf"); // Create directories tempOutputDir.mkdirs(); tempImagesDir.mkdirs(); - // Save input file - inputFile.transferTo(tempInputFile); - PDFMergerUtility merger = new PDFMergerUtility(); merger.setDestinationFileName(finalOutputFile.toString()); - try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) { + try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) { PDFRenderer pdfRenderer = new PDFRenderer(document); int pageCount = document.getNumberOfPages(); @@ -135,35 +374,20 @@ public class OCRController { new File(tempOutputDir, String.format("page_%d", pageNum)) .toString()); command.add("-l"); - command.add(String.join("+", languages)); - // Always output PDF - command.add("pdf"); + command.add(String.join("+", selectedLanguages)); + command.add("pdf"); // Always output PDF - // Use ProcessExecutor to run tesseract command - try { - ProcessExecutorResult result = - ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT) - .runCommandWithOutputHandling(command); + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT) + .runCommandWithOutputHandling(command); - log.debug( - "Tesseract OCR completed for page {} with exit code {}", - pageNum, - result.getRc()); - - // Add OCR'd PDF to merger - merger.addSource(pageOutputPath); - } catch (IOException | InterruptedException e) { - log.error( - "Error processing page {} with tesseract: {}", - pageNum, - e.getMessage()); - // If OCR fails, fall back to the original page - try (PDDocument pageDoc = new PDDocument()) { - pageDoc.addPage(page); - pageDoc.save(pageOutputPath); - merger.addSource(pageOutputPath); - } + if (result.getRc() != 0) { + throw new RuntimeException( + "Tesseract failed with exit code: " + result.getRc()); } + + // Add OCR'd PDF to merger + merger.addSource(pageOutputPath); } else { // Save original page without OCR try (PDDocument pageDoc = new PDDocument()) { @@ -178,40 +402,11 @@ public class OCRController { // Merge all pages into final PDF merger.mergeDocuments(null); - // Read the final PDF file - byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath()); - String outputFilename = - Filenames.toSimpleFileName(inputFile.getOriginalFilename()) - .replaceFirst("[.][^.]+$", "") - + "_OCR.pdf"; - - return ResponseEntity.ok() - .header( - "Content-Disposition", - "attachment; filename=\"" + outputFilename + "\"") - .contentType(MediaType.APPLICATION_PDF) - .body(pdfContent); - } finally { - // Clean up the temp directory and all its contents - tempFileManager.deleteTempDirectory(tempDirPath); - } - } - - private void addFileToZip(File file, String filename, ZipOutputStream zipOut) - throws IOException { - if (!file.exists()) { - log.warn("File {} does not exist, skipping", file); - return; - } - try (FileInputStream fis = new FileInputStream(file)) { - ZipEntry zipEntry = new ZipEntry(filename); - zipOut.putNextEntry(zipEntry); - byte[] buffer = new byte[1024]; - int length; - while ((length = fis.read(buffer)) >= 0) { - zipOut.write(buffer, 0, length); - } - zipOut.closeEntry(); + // Copy final output to the expected location + Files.copy( + finalOutputFile.toPath(), + tempOutputFile, + java.nio.file.StandardCopyOption.REPLACE_EXISTING); } } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java index b8c347ef1..1a72c22de 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java @@ -15,8 +15,7 @@ import io.github.pixee.security.Filenames; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; -import lombok.RequiredArgsConstructor; - +import stirling.software.SPDF.config.EndpointConfiguration; import stirling.software.common.model.api.PDFFile; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.ProcessExecutor; @@ -28,17 +27,28 @@ import stirling.software.common.util.WebResponseUtils; @RestController @RequestMapping("/api/v1/misc") @Tag(name = "Misc", description = "Miscellaneous APIs") -@RequiredArgsConstructor public class RepairController { private final CustomPDFDocumentFactory pdfDocumentFactory; private final TempFileManager tempFileManager; + private final boolean ghostscriptEnabled; + private final boolean qpdfEnabled; + + public RepairController( + CustomPDFDocumentFactory pdfDocumentFactory, + TempFileManager tempFileManager, + EndpointConfiguration endpointConfiguration) { + this.pdfDocumentFactory = pdfDocumentFactory; + this.tempFileManager = tempFileManager; + this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript"); + this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf"); + } @PostMapping(consumes = "multipart/form-data", value = "/repair") @Operation( summary = "Repair a PDF file", description = - "This endpoint repairs a given PDF file by running qpdf command. The PDF is" + "This endpoint repairs a given PDF file by running Ghostscript (primary), qpdf (fallback), or PDFBox (if no external tools available). The PDF is" + " first saved to a temporary location, repaired, read back, and then" + " returned as a response. Input:PDF Output:PDF Type:SISO") public ResponseEntity repairPdf(@ModelAttribute PDFFile file) @@ -46,25 +56,72 @@ public class RepairController { MultipartFile inputFile = file.getFileInput(); // Use TempFile with try-with-resources for automatic cleanup - try (TempFile tempFile = new TempFile(tempFileManager, ".pdf")) { + try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf"); + TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) { + // Save the uploaded file to the temporary location - inputFile.transferTo(tempFile.getFile()); + inputFile.transferTo(tempInputFile.getFile()); - List command = new ArrayList<>(); - command.add("qpdf"); - command.add("--replace-input"); // Automatically fixes problems it can - command.add("--qdf"); // Linearizes and normalizes PDF structure - command.add("--object-streams=disable"); // Can help with some corruptions - command.add(tempFile.getFile().getAbsolutePath()); + boolean repairSuccess = false; - ProcessExecutorResult returnCode = - ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF) - .runCommandWithOutputHandling(command); + // Try Ghostscript first if available + if (ghostscriptEnabled) { + try { + List gsCommand = new ArrayList<>(); + gsCommand.add("gs"); + gsCommand.add("-o"); + gsCommand.add(tempOutputFile.getPath().toString()); + gsCommand.add("-sDEVICE=pdfwrite"); + gsCommand.add(tempInputFile.getPath().toString()); - // Read the optimized PDF file - byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempFile.getFile()); + ProcessExecutorResult gsResult = + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(gsCommand); - // Return the optimized PDF as a response + if (gsResult.getRc() == 0) { + repairSuccess = true; + } + } catch (Exception e) { + // Log and continue to QPDF fallback + System.out.println( + "Ghostscript repair failed, trying QPDF fallback: " + e.getMessage()); + } + } + + // Fallback to QPDF if Ghostscript failed or not available + if (!repairSuccess && qpdfEnabled) { + List qpdfCommand = new ArrayList<>(); + qpdfCommand.add("qpdf"); + qpdfCommand.add("--replace-input"); // Automatically fixes problems it can + qpdfCommand.add("--qdf"); // Linearizes and normalizes PDF structure + qpdfCommand.add("--object-streams=disable"); // Can help with some corruptions + qpdfCommand.add(tempInputFile.getPath().toString()); + qpdfCommand.add(tempOutputFile.getPath().toString()); + + ProcessExecutorResult qpdfResult = + ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF) + .runCommandWithOutputHandling(qpdfCommand); + + repairSuccess = true; + } + + // Use PDFBox as last resort if no external tools are available + if (!repairSuccess) { + if (!ghostscriptEnabled && !qpdfEnabled) { + // Basic PDFBox repair - load and save to fix structural issues + try (var document = pdfDocumentFactory.load(tempInputFile.getFile())) { + document.save(tempOutputFile.getFile()); + repairSuccess = true; + } + } else { + throw new IOException("PDF repair failed with available tools"); + } + } + + // Read the repaired PDF file + byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.getFile()); + + // Return the repaired PDF as a response String outputFilename = Filenames.toSimpleFileName(inputFile.getOriginalFilename()) .replaceFirst("[.][^.]+$", "") diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java b/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java index 00279eb96..2955d7160 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java @@ -19,6 +19,18 @@ public class ProcessPdfWithOcrRequest extends PDFFile { defaultValue = "[\"eng\"]") private List languages; + @Schema(description = "Include OCR text in a sidecar text file if set to true") + private boolean sidecar; + + @Schema(description = "Deskew the input file if set to true") + private boolean deskew; + + @Schema(description = "Clean the input file if set to true") + private boolean clean; + + @Schema(description = "Clean the final output if set to true") + private boolean cleanFinal; + @Schema( description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'", requiredMode = Schema.RequiredMode.REQUIRED, @@ -31,4 +43,7 @@ public class ProcessPdfWithOcrRequest extends PDFFile { allowableValues = {"hocr", "sandwich"}, defaultValue = "hocr") private String ocrRenderType = "hocr"; + + @Schema(description = "Remove images from the output PDF if set to true") + private boolean removeImagesAfter; } diff --git a/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html b/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html index f98c20d29..441542d57 100644 --- a/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html +++ b/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html @@ -79,6 +79,30 @@
+
+ +
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+