restore OCRMyPDF and ghostscript compression

2025-08-27 14:49:23 +00:00 · 2025-06-30 22:27:45 +01:00 · 2025-06-30 22:27:45 +01:00 · 782c30f934
commit 782c30f934
parent 11e3ccd19f
13 changed files with 646 additions and 143 deletions
--- a/3
+++ b/3
@ -51,7 +51,6 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
    tini \
    bash \
    curl \
    qpdf \
    shadow \
    su-exec \
    openssl \
@ -69,9 +68,11 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
 	tesseract-ocr-data-deu \
 	tesseract-ocr-data-fra \
 	tesseract-ocr-data-por \
    unpaper \
    # CV
    py3-opencv \
    python3 \
    ocrmypdf \
    py3-pip \
    py3-pillow@testing \
    py3-pdf2image@testing && \
--- a/Dockerfile.fat
+++ b/Dockerfile.fat
@ -76,16 +76,17 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
    # pdftohtml
    poppler-utils \
    # OCR MY PDF (unpaper for deskew and other advanced features)
    qpdf \
    tesseract-ocr-data-eng \
    tesseract-ocr-data-chi_sim \
 	tesseract-ocr-data-deu \
 	tesseract-ocr-data-fra \
 	tesseract-ocr-data-por \
    unpaper \
    font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \
    # CV
    py3-opencv \
    python3 \
    ocrmypdf \
    py3-pip \
    py3-pillow@testing \
    py3-pdf2image@testing && \
--- a/common/src/main/java/stirling/software/common/model/ApplicationProperties.java
+++ b/common/src/main/java/stirling/software/common/model/ApplicationProperties.java
@ -545,6 +545,8 @@ public class ApplicationProperties {
            private int calibreSessionLimit;
            private int qpdfSessionLimit;
            private int tesseractSessionLimit;
            private int ghostscriptSessionLimit;
            private int ocrMyPdfSessionLimit;
            public int getQpdfSessionLimit() {
                return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2;
@ -577,6 +579,14 @@ public class ApplicationProperties {
            /**
             * Returns the configured Calibre session limit, falling back to 1 when the configured
             * value is zero or negative.
             */
            public int getCalibreSessionLimit() {
                if (calibreSessionLimit > 0) {
                    return calibreSessionLimit;
                }
                return 1;
            }
            /**
             * Returns the configured Ghostscript session limit, falling back to 8 when the
             * configured value is zero or negative.
             */
            public int getGhostscriptSessionLimit() {
                if (ghostscriptSessionLimit > 0) {
                    return ghostscriptSessionLimit;
                }
                return 8;
            }
            /**
             * Returns the configured OCRmyPDF session limit, falling back to 2 when the configured
             * value is zero or negative.
             */
            public int getOcrMyPdfSessionLimit() {
                if (ocrMyPdfSessionLimit > 0) {
                    return ocrMyPdfSessionLimit;
                }
                return 2;
            }
        }
        @Data
@ -589,6 +599,8 @@ public class ApplicationProperties {
            private long calibreTimeoutMinutes;
            private long tesseractTimeoutMinutes;
            private long qpdfTimeoutMinutes;
            private long ghostscriptTimeoutMinutes;
            private long ocrMyPdfTimeoutMinutes;
            public long getTesseractTimeoutMinutes() {
                return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30;
@ -621,6 +633,14 @@ public class ApplicationProperties {
            /**
             * Returns the configured Calibre timeout in minutes, falling back to 30 when the
             * configured value is zero or negative.
             */
            public long getCalibreTimeoutMinutes() {
                if (calibreTimeoutMinutes > 0) {
                    return calibreTimeoutMinutes;
                }
                return 30;
            }
            /**
             * Returns the configured Ghostscript timeout in minutes, falling back to 30 when the
             * configured value is zero or negative.
             */
            public long getGhostscriptTimeoutMinutes() {
                if (ghostscriptTimeoutMinutes > 0) {
                    return ghostscriptTimeoutMinutes;
                }
                return 30;
            }
            /**
             * Returns the configured OCRmyPDF timeout in minutes, falling back to 30 when the
             * configured value is zero or negative.
             */
            public long getOcrMyPdfTimeoutMinutes() {
                if (ocrMyPdfTimeoutMinutes > 0) {
                    return ocrMyPdfTimeoutMinutes;
                }
                return 30;
            }
        }
    }
 }
--- a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java
+++ b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java
@ -308,7 +308,7 @@ public class TempFileCleanupService {
        }
        java.util.List<Path> subdirectories = new java.util.ArrayList<>();
-        
+
        try (Stream<Path> pathStream = Files.list(directory)) {
            pathStream.forEach(
                    path -> {
@ -347,7 +347,7 @@ public class TempFileCleanupService {
                        }
                    });
        }
-        
+
        for (Path subdirectory : subdirectories) {
            try {
                cleanupDirectoryStreaming(
--- a/common/src/main/java/stirling/software/common/util/ProcessExecutor.java
+++ b/common/src/main/java/stirling/software/common/util/ProcessExecutor.java
@ -84,6 +84,16 @@ public class ProcessExecutor {
                                                .getProcessExecutor()
                                                .getSessionLimit()
                                                .getCalibreSessionLimit();
                                case GHOSTSCRIPT ->
                                        applicationProperties
                                                .getProcessExecutor()
                                                .getSessionLimit()
                                                .getGhostscriptSessionLimit();
                                case OCR_MY_PDF ->
                                        applicationProperties
                                                .getProcessExecutor()
                                                .getSessionLimit()
                                                .getOcrMyPdfSessionLimit();
                            };
                    long timeoutMinutes =
@ -128,6 +138,16 @@ public class ProcessExecutor {
                                                .getProcessExecutor()
                                                .getTimeoutMinutes()
                                                .getCalibreTimeoutMinutes();
                                case GHOSTSCRIPT ->
                                        applicationProperties
                                                .getProcessExecutor()
                                                .getTimeoutMinutes()
                                                .getGhostscriptTimeoutMinutes();
                                case OCR_MY_PDF ->
                                        applicationProperties
                                                .getProcessExecutor()
                                                .getTimeoutMinutes()
                                                .getOcrMyPdfTimeoutMinutes();
                            };
                    return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes);
                });
@ -278,7 +298,9 @@ public class ProcessExecutor {
        INSTALL_APP,
        CALIBRE,
        TESSERACT,
-        QPDF
+        QPDF,
        GHOSTSCRIPT,
        OCR_MY_PDF
    }
    public class ProcessExecutorResult {
--- a/common/src/main/java/stirling/software/common/util/TempFileRegistry.java
+++ b/common/src/main/java/stirling/software/common/util/TempFileRegistry.java
@ -9,7 +9,6 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.ConcurrentSkipListSet;
 import java.util.stream.Collectors;
 import org.springframework.stereotype.Component;
@ -24,11 +23,10 @@ import lombok.extern.slf4j.Slf4j;
@Component
 public class TempFileRegistry {
-	 private final ConcurrentMap<Path, Instant> registeredFiles = new ConcurrentHashMap<>();
+    private final ConcurrentMap<Path, Instant> registeredFiles = new ConcurrentHashMap<>();
-	     private final Set<Path> thirdPartyTempFiles =
+    private final Set<Path> thirdPartyTempFiles =
-	             Collections.newSetFromMap(new ConcurrentHashMap<>());
+            Collections.newSetFromMap(new ConcurrentHashMap<>());
-	     private final Set<Path> tempDirectories =
+    private final Set<Path> tempDirectories = Collections.newSetFromMap(new ConcurrentHashMap<>());
 	            Collections.newSetFromMap(new ConcurrentHashMap<>());
    /**
     * Register a temporary file with the registry.
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java
@ -21,6 +21,8 @@ public class EndpointConfiguration {
    private final ApplicationProperties applicationProperties;
    private Map<String, Boolean> endpointStatuses = new ConcurrentHashMap<>();
    private Map<String, Set<String>> endpointGroups = new ConcurrentHashMap<>();
    private Set<String> disabledGroups = new HashSet<>();
    private Map<String, Set<String>> endpointAlternatives = new ConcurrentHashMap<>();
    private final boolean runningProOrHigher;
    public EndpointConfiguration(
@ -51,16 +53,36 @@ public class EndpointConfiguration {
        if (endpoint.startsWith("/")) {
            endpoint = endpoint.substring(1);
        }
        // Check if endpoint has alternatives (multiple tools can handle it)
        Set<String> alternatives = endpointAlternatives.get(endpoint);
        if (alternatives != null && !alternatives.isEmpty()) {
            // Endpoint is enabled if ANY of its alternative tools are enabled
            for (String toolGroup : alternatives) {
                if (isGroupEnabled(toolGroup)) {
                    return true;
                }
            }
            return false; // All alternative tools are disabled
        }
        // Fallback to standard endpoint status check
        return endpointStatuses.getOrDefault(endpoint, true);
    }
    public boolean isGroupEnabled(String group) {
        // Check if group is explicitly disabled first
        if (disabledGroups.contains(group)) {
            return false;
        }
        Set<String> endpoints = endpointGroups.get(group);
        if (endpoints == null || endpoints.isEmpty()) {
            log.debug("Group '{}' does not exist or has no endpoints", group);
            return false;
        }
        // Additional check: if all endpoints in group are disabled, consider group disabled
        for (String endpoint : endpoints) {
            if (!isEndpointEnabled(endpoint)) {
                return false;
@ -73,8 +95,23 @@ public class EndpointConfiguration {
    /**
     * Adds {@code endpoint} to the member set of {@code group}, creating the set on first use.
     *
     * @param group name of the endpoint group
     * @param endpoint endpoint name to register under the group
     */
    public void addEndpointToGroup(String group, String endpoint) {
        Set<String> members = endpointGroups.computeIfAbsent(group, key -> new HashSet<>());
        members.add(endpoint);
    }
    /**
     * Records {@code toolGroup} as one of the tool groups able to serve {@code endpoint}, creating
     * the endpoint's alternative set on first use.
     *
     * @param endpoint endpoint name
     * @param toolGroup name of a tool group that can handle the endpoint
     */
    public void addEndpointAlternative(String endpoint, String toolGroup) {
        Set<String> toolGroups =
                endpointAlternatives.computeIfAbsent(endpoint, key -> new HashSet<>());
        toolGroups.add(toolGroup);
    }
    /**
     * Marks {@code group} as explicitly disabled and disables every endpoint currently registered
     * under it. A group with no registered endpoints is still recorded as disabled.
     *
     * @param group name of the endpoint group to disable
     */
    public void disableGroup(String group) {
        disabledGroups.add(group);

        Set<String> members = endpointGroups.get(group);
        if (members == null) {
            return;
        }
        for (String member : members) {
            disableEndpoint(member);
        }
    }
    public void enableGroup(String group) {
        disabledGroups.remove(group);
        Set<String> endpoints = endpointGroups.get(group);
        if (endpoints != null) {
            for (String endpoint : endpoints) {
@ -83,13 +120,8 @@ public class EndpointConfiguration {
        }
    }
-    public void disableGroup(String group) {
+    public Set<String> getDisabledGroups() {
-        Set<String> endpoints = endpointGroups.get(group);
+        return new HashSet<>(disabledGroups);
        if (endpoints != null) {
            for (String endpoint : endpoints) {
                disableEndpoint(endpoint);
            }
        }
    }
    public void logDisabledEndpointsSummary() {
@ -101,6 +133,12 @@ public class EndpointConfiguration {
                        .sorted()
                        .toList();
        if (!disabledGroups.isEmpty()) {
            log.info(
                    "Disabled groups: {}",
                    String.join(", ", disabledGroups.stream().sorted().toList()));
        }
        if (!disabledList.isEmpty()) {
            log.info(
                    "Total disabled endpoints: {}. Disabled endpoints: {}",
@ -212,7 +250,6 @@ public class EndpointConfiguration {
        // Unoconvert
        addEndpointToGroup("Unoconvert", "file-to-pdf");
        addEndpointToGroup("tesseract", "ocr-pdf");
        // Java
        addEndpointToGroup("Java", "merge-pdfs");
@ -261,8 +298,13 @@ public class EndpointConfiguration {
        addEndpointToGroup("Javascript", "compare");
        addEndpointToGroup("Javascript", "adjust-contrast");
-        // qpdf dependent endpoints
+        // Multi-tool endpoints - endpoints that can be handled by multiple tools
-        addEndpointToGroup("qpdf", "repair");
+        addEndpointAlternative("repair", "qpdf");
        addEndpointAlternative("repair", "Ghostscript");
        addEndpointAlternative("compress-pdf", "qpdf");
        addEndpointAlternative("compress-pdf", "Ghostscript");
        addEndpointAlternative("ocr-pdf", "tesseract");
        addEndpointAlternative("ocr-pdf", "OCRmyPDF");
        // Weasyprint dependent endpoints
        addEndpointToGroup("Weasyprint", "html-to-pdf");
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java
@ -34,6 +34,8 @@ public class ExternalAppDepConfig {
                new HashMap<>() {
                    {
                        put("gs", List.of("Ghostscript"));
                        put("ocrmypdf", List.of("OCRmyPDF"));
                        put("soffice", List.of("LibreOffice"));
                        put(weasyprintPath, List.of("Weasyprint"));
                        put("pdftohtml", List.of("Pdftohtml"));
@ -109,6 +111,8 @@ public class ExternalAppDepConfig {
    @PostConstruct
    public void checkDependencies() {
        // Check core dependencies
        checkDependencyAndDisableGroup("gs");
        checkDependencyAndDisableGroup("ocrmypdf");
        checkDependencyAndDisableGroup("tesseract");
        checkDependencyAndDisableGroup("soffice");
        checkDependencyAndDisableGroup("qpdf");
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java
@ -65,12 +65,14 @@ public class CompressController {
    private final CustomPDFDocumentFactory pdfDocumentFactory;
    private final boolean qpdfEnabled;
    private final boolean ghostscriptEnabled;
    public CompressController(
            CustomPDFDocumentFactory pdfDocumentFactory,
            EndpointConfiguration endpointConfiguration) {
        this.pdfDocumentFactory = pdfDocumentFactory;
        this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
        this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
    }
    @Data
@ -697,25 +699,69 @@ public class CompressController {
            boolean sizeMet = false;
            boolean imageCompressionApplied = false;
-            boolean qpdfCompressionApplied = false;
+            boolean externalCompressionApplied = false;
            if (qpdfEnabled && optimizeLevel <= 3) {
                optimizeLevel = 4;
            }
            while (!sizeMet && optimizeLevel <= 9) {
-                // Apply image compression for levels 4-9
+                // Apply external compression first
-                if ((optimizeLevel >= 3 || Boolean.TRUE.equals(convertToGrayscale))
+                if (!externalCompressionApplied) {
-                        && !imageCompressionApplied) {
+                    boolean ghostscriptSuccess = false;
                    double scaleFactor = getScaleFactorForLevel(optimizeLevel);
                    float jpegQuality = getJpegQualityForLevel(optimizeLevel);
-                    // Compress images
+                    // Try Ghostscript first if available - for ANY compression level
                    if (ghostscriptEnabled) {
                        try {
                            applyGhostscriptCompression(
                                    request, optimizeLevel, currentFile, tempFiles);
                            log.info("Ghostscript compression applied successfully");
                            ghostscriptSuccess = true;
                        } catch (IOException e) {
                            log.warn("Ghostscript compression failed, trying fallback methods");
                        }
                    }
                    // Fallback to QPDF if Ghostscript failed or not available (levels 1-3 only)
                    if (!ghostscriptSuccess && qpdfEnabled && optimizeLevel <= 3) {
                        try {
                            applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
                            log.info("QPDF compression applied successfully");
                        } catch (IOException e) {
                            log.warn("QPDF compression also failed");
                        }
                    }
                    if (!ghostscriptSuccess && !qpdfEnabled) {
                        log.info(
                                "No external compression tools available, using image compression only");
                    }
                    externalCompressionApplied = true;
                    // Skip image compression if Ghostscript succeeded
                    if (ghostscriptSuccess) {
                        imageCompressionApplied = true;
                    }
                }
                // Apply image compression for levels 4+ only if Ghostscript didn't run
                if ((optimizeLevel >= 4 || Boolean.TRUE.equals(convertToGrayscale))
                        && !imageCompressionApplied) {
                    // Use different scale factors based on level
                    double scaleFactor =
                            switch (optimizeLevel) {
                                case 4 -> 0.95; // 95% of original size
                                case 5 -> 0.9; // 90% of original size
                                case 6 -> 0.8; // 80% of original size
                                case 7 -> 0.7; // 70% of original size
                                case 8 -> 0.65; // 65% of original size
                                case 9 -> 0.5; // 50% of original size
                                default -> 1.0;
                            };
                    log.info("Applying image compression with scale factor: {}", scaleFactor);
                    Path compressedImageFile =
                            compressImagesInPDF(
                                    currentFile,
                                    scaleFactor,
-                                    jpegQuality,
+                                    0.7f, // Default JPEG quality
                                    Boolean.TRUE.equals(convertToGrayscale));
                    tempFiles.add(compressedImageFile);
@ -723,18 +769,6 @@ public class CompressController {
                    imageCompressionApplied = true;
                }
                // Apply QPDF compression for all levels
                if (!qpdfCompressionApplied && qpdfEnabled) {
                    applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
                    qpdfCompressionApplied = true;
                } else if (!qpdfCompressionApplied) {
                    // If QPDF is disabled, mark as applied and log
                    if (!qpdfEnabled) {
                        log.info("Skipping QPDF compression as QPDF group is disabled");
                    }
                    qpdfCompressionApplied = true;
                }
                // Check if target size reached or not in auto mode
                long outputFileSize = Files.size(currentFile);
                if (outputFileSize <= expectedOutputSize || !autoMode) {
@ -754,7 +788,7 @@ public class CompressController {
                    } else {
                        // Reset flags for next iteration with higher optimization level
                        imageCompressionApplied = false;
-                        qpdfCompressionApplied = false;
+                        externalCompressionApplied = false;
                        optimizeLevel = newOptimizeLevel;
                    }
                }
@ -788,6 +822,96 @@ public class CompressController {
        }
    }
    // Run Ghostscript compression
    /**
     * Compresses {@code currentFile} in place by running Ghostscript's {@code pdfwrite} device.
     *
     * <p>Ghostscript writes to a fresh temporary file (registered in {@code tempFiles}). Only when
     * the process exits with return code 0 is that output copied over {@code currentFile}; on any
     * failure {@code currentFile} is left as it was before the call, unless the copy itself fails.
     *
     * @param request the optimize request (not read by this method; kept for signature parity with
     *     the other compression helpers)
     * @param optimizeLevel compression level used to pick the Ghostscript quality arguments
     * @param currentFile PDF to compress; overwritten with the Ghostscript output on success
     * @param tempFiles collector for temporary files created here
     * @throws IOException if Ghostscript cannot be run, is interrupted, exits with a non-zero
     *     return code, or its output cannot be copied back
     */
    private void applyGhostscriptCompression(
            OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
            throws IOException {

        long preGsSize = Files.size(currentFile);
        log.info("Pre-Ghostscript file size: {}", GeneralUtils.formatBytes(preGsSize));

        // Create output file for Ghostscript
        Path gsOutputFile = Files.createTempFile("gs_output_", ".pdf");
        tempFiles.add(gsOutputFile);

        // Build Ghostscript command based on optimization level
        List<String> command = new ArrayList<>();
        command.add("gs");
        command.add("-sDEVICE=pdfwrite");
        command.add("-dCompatibilityLevel=1.5");
        command.add("-dNOPAUSE");
        command.add("-dQUIET");
        command.add("-dBATCH");
        command.addAll(ghostscriptArgsForLevel(optimizeLevel));
        command.add("-sOutputFile=" + gsOutputFile);
        command.add(currentFile.toString());

        try {
            ProcessExecutorResult result =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
                            .runCommandWithOutputHandling(command);

            if (result.getRc() != 0) {
                throw new IOException(
                        "Ghostscript compression failed with return code: " + result.getRc());
            }

            // Update current file to the Ghostscript output
            Files.copy(gsOutputFile, currentFile, StandardCopyOption.REPLACE_EXISTING);

            long postGsSize = Files.size(currentFile);
            // Guard against a zero-byte input so the percentage never becomes NaN/Infinity.
            double gsReduction =
                    preGsSize > 0 ? 100.0 - ((postGsSize * 100.0) / preGsSize) : 0.0;
            log.info(
                    "Post-Ghostscript file size: {} (reduced by {}%)",
                    GeneralUtils.formatBytes(postGsSize), String.format("%.1f", gsReduction));
        } catch (IOException e) {
            // Already the exception type callers expect: log once and rethrow without re-wrapping.
            log.warn("Ghostscript compression failed, will fallback to other methods", e);
            throw e;
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Preserve the interrupt so callers further up can still observe it.
                Thread.currentThread().interrupt();
            }
            log.warn("Ghostscript compression failed, will fallback to other methods", e);
            throw new IOException("Ghostscript compression failed", e);
        }
    }

    /** Maps an optimization level to the Ghostscript quality/downsampling arguments. */
    private static List<String> ghostscriptArgsForLevel(int optimizeLevel) {
        return switch (optimizeLevel) {
            case 1 -> List.of("-dPDFSETTINGS=/prepress");
            case 2 -> List.of("-dPDFSETTINGS=/printer");
            case 3 -> List.of("-dPDFSETTINGS=/ebook");
            case 6, 7 ->
                    List.of(
                            "-dPDFSETTINGS=/screen",
                            "-dColorImageResolution=150",
                            "-dGrayImageResolution=150",
                            "-dMonoImageResolution=300");
            case 8, 9 ->
                    List.of(
                            "-dPDFSETTINGS=/screen",
                            "-dColorImageResolution=100",
                            "-dGrayImageResolution=100",
                            "-dMonoImageResolution=200");
            case 10 ->
                    List.of(
                            "-dPDFSETTINGS=/screen",
                            "-dColorImageResolution=72",
                            "-dGrayImageResolution=72",
                            "-dMonoImageResolution=150");
            // Levels 4 and 5, and any unrecognized level, use the plain /screen preset.
            default -> List.of("-dPDFSETTINGS=/screen");
        };
    }
    // Run QPDF compression
    private void applyQpdfCompression(
            OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java
@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.misc;
 import java.awt.image.BufferedImage;
 import java.io.*;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
 import java.util.zip.ZipEntry;
@ -26,26 +27,42 @@ import io.github.pixee.security.Filenames;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import stirling.software.SPDF.config.EndpointConfiguration;
 import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
 import stirling.software.common.model.ApplicationProperties;
 import stirling.software.common.service.CustomPDFDocumentFactory;
 import stirling.software.common.util.ProcessExecutor;
 import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
 import stirling.software.common.util.TempDirectory;
 import stirling.software.common.util.TempFile;
 import stirling.software.common.util.TempFileManager;
 import stirling.software.common.util.WebResponseUtils;
@RestController
@RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs")
@Slf4j
@RequiredArgsConstructor
 public class OCRController {
    private final ApplicationProperties applicationProperties;
    private final CustomPDFDocumentFactory pdfDocumentFactory;
    private final TempFileManager tempFileManager;
    private final boolean ocrMyPdfEnabled;
    private final boolean tesseractEnabled;
    public OCRController(
            ApplicationProperties applicationProperties,
            CustomPDFDocumentFactory pdfDocumentFactory,
            TempFileManager tempFileManager,
            EndpointConfiguration endpointConfiguration) {
        this.applicationProperties = applicationProperties;
        this.pdfDocumentFactory = pdfDocumentFactory;
        this.tempFileManager = tempFileManager;
        this.ocrMyPdfEnabled = endpointConfiguration.isGroupEnabled("OCRmyPDF");
        this.tesseractEnabled = endpointConfiguration.isGroupEnabled("tesseract");
    }
    /** Gets the list of available Tesseract languages from the tessdata directory */
    public List<String> getAvailableTesseractLanguages() {
@ -63,39 +80,261 @@ public class OCRController {
    /**
     * Handles {@code POST /ocr-pdf}: runs OCR over the uploaded PDF and returns the result.
     *
     * <p>The request must name at least one language and an {@code ocrRenderType} of "hocr" or
     * "sandwich". Requested languages are filtered against {@link
     * #getAvailableTesseractLanguages()}. OCRmyPDF is used when its group is enabled; otherwise
     * Tesseract is used when its group is enabled. There is no fallback from a failed OCRmyPDF run
     * to Tesseract.
     *
     * @param request form data carrying the input file and the OCR options
     * @return the processed PDF, or a ZIP holding the PDF and the sidecar text file when a
     *     sidecar file was created
     * @throws IOException if no language is selected, the render type is not "hocr" or
     *     "sandwich", none of the selected languages is available, no OCR tool is enabled, or
     *     processing fails
     * @throws InterruptedException declared by the OCR helper methods this method calls
     */
    @PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
    @Operation(
-            summary = "Process PDF files with OCR using Tesseract",
+            summary = "Process a PDF file with OCR",
            description =
-                    "Takes a PDF file as input, performs OCR using specified languages and OCR type"
+                    "This endpoint processes a PDF file using OCR (Optical Character Recognition). "
-                            + " (skip-text/force-ocr), and returns the processed PDF. Input:PDF"
+                            + "Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. "
-                            + " Output:PDF Type:SISO")
+                            + "Uses OCRmyPDF if available, falls back to Tesseract. Input:PDF Output:PDF Type:SI-Conditional")
    public ResponseEntity<byte[]> processPdfWithOCR(
            @ModelAttribute ProcessPdfWithOcrRequest request)
            throws IOException, InterruptedException {
        MultipartFile inputFile = request.getFileInput();
-        List<String> languages = request.getLanguages();
+        List<String> selectedLanguages = request.getLanguages();
        Boolean sidecar = request.isSidecar();
        Boolean deskew = request.isDeskew();
        Boolean clean = request.isClean();
        Boolean cleanFinal = request.isCleanFinal();
        String ocrType = request.getOcrType();
        String ocrRenderType = request.getOcrRenderType();
        Boolean removeImagesAfter = request.isRemoveImagesAfter();
-        // Create a temp directory using TempFileManager directly
+        if (selectedLanguages == null || selectedLanguages.isEmpty()) {
-        Path tempDirPath = tempFileManager.createTempDirectory();
+            throw new IOException("Please select at least one language.");
-        File tempDir = tempDirPath.toFile();
+        }
-        try {
+        if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) {
-            File tempInputFile = new File(tempDir, "input.pdf");
+            throw new IOException("ocrRenderType wrong");
-            File tempOutputDir = new File(tempDir, "output");
+        }
-            File tempImagesDir = new File(tempDir, "images");
+
-            File finalOutputFile = new File(tempDir, "final_output.pdf");
+        // Get available Tesseract languages
        List<String> availableLanguages = getAvailableTesseractLanguages();
        // Validate selected languages
        selectedLanguages =
                selectedLanguages.stream().filter(availableLanguages::contains).toList();
        if (selectedLanguages.isEmpty()) {
            throw new IOException("None of the selected languages are valid.");
        }
        // Use try-with-resources for proper temp file management
        try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
                TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
            inputFile.transferTo(tempInputFile.getFile());
            // Created only on the OCRmyPDF path when a sidecar is requested, so it cannot be part
            // of the try-with-resources header above; it is closed in the finally block instead
            TempFile sidecarTextFile = null;
            try {
                // Use OCRmyPDF if available (no fallback - error if it fails)
                if (ocrMyPdfEnabled) {
                    if (sidecar != null && sidecar) {
                        sidecarTextFile = new TempFile(tempFileManager, ".txt");
                    }
                    processWithOcrMyPdf(
                            selectedLanguages,
                            sidecar,
                            deskew,
                            clean,
                            cleanFinal,
                            ocrType,
                            ocrRenderType,
                            removeImagesAfter,
                            tempInputFile.getPath(),
                            tempOutputFile.getPath(),
                            sidecarTextFile != null ? sidecarTextFile.getPath() : null);
                    log.info("OCRmyPDF processing completed successfully");
                }
                // Use Tesseract only if OCRmyPDF is not available
                else if (tesseractEnabled) {
                    // Only the languages and ocrType are forwarded here; sidecar, deskew, clean,
                    // cleanFinal, ocrRenderType and removeImagesAfter are not passed to this path
                    processWithTesseract(
                            selectedLanguages,
                            ocrType,
                            tempInputFile.getPath(),
                            tempOutputFile.getPath());
                    log.info("Tesseract processing completed successfully");
                } else {
                    throw new IOException("No OCR tools are available");
                }
                // Read the processed PDF file
                byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath());
                // Return the OCR processed PDF as a response
                String outputFilename =
                        Filenames.toSimpleFileName(inputFile.getOriginalFilename())
                                        .replaceFirst("[.][^.]+$", "")
                                + "_OCR.pdf";
                // sidecarTextFile is non-null only when the OCRmyPDF branch above created it
                if (sidecar != null && sidecar && sidecarTextFile != null) {
                    // Create a zip file containing both the PDF and the text file
                    String outputZipFilename =
                            Filenames.toSimpleFileName(inputFile.getOriginalFilename())
                                            .replaceFirst("[.][^.]+$", "")
                                    + "_OCR.zip";
                    try (TempFile tempZipFile = new TempFile(tempFileManager, ".zip");
                            ZipOutputStream zipOut =
                                    new ZipOutputStream(
                                            Files.newOutputStream(tempZipFile.getPath()))) {
                        // Add PDF file to the zip
                        ZipEntry pdfEntry = new ZipEntry(outputFilename);
                        zipOut.putNextEntry(pdfEntry);
                        zipOut.write(pdfBytes);
                        zipOut.closeEntry();
                        // Add text file to the zip
                        // NOTE(review): String.replace substitutes every ".pdf" occurrence in the
                        // name, not just the trailing extension
                        ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
                        zipOut.putNextEntry(txtEntry);
                        Files.copy(sidecarTextFile.getPath(), zipOut);
                        zipOut.closeEntry();
                        // finish() is called before the file is read back because the stream is
                        // still open at that point
                        zipOut.finish();
                        byte[] zipBytes = Files.readAllBytes(tempZipFile.getPath());
                        // Return the zip file containing both the PDF and the text file
                        return WebResponseUtils.bytesToWebResponse(
                                zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
                    }
                } else {
                    // Return the OCR processed PDF as a response
                    return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
                }
            } finally {
                // Clean up sidecar temp file if created
                if (sidecarTextFile != null) {
                    try {
                        sidecarTextFile.close();
                    } catch (Exception e) {
                        // Best-effort cleanup: a close failure is logged rather than rethrown
                        log.warn("Failed to close sidecar temp file", e);
                    }
                }
            }
        }
    }
    /**
     * Runs OCRmyPDF on {@code tempInputFile}, writing the result to {@code tempOutputFile}, and
     * optionally strips images from the result with Ghostscript.
     *
     * <p>If the first run fails and its output contains the multiprocessing "Function not
     * implemented" error, the same command is retried once with {@code --jobs 1} appended.
     *
     * @param selectedLanguages languages joined with '+' and passed to {@code --language}
     * @param sidecar when true and {@code sidecarTextPath} is non-null, adds {@code --sidecar}
     * @param deskew when true, adds {@code --deskew}
     * @param clean when true, adds {@code --clean}
     * @param cleanFinal when true, adds {@code --clean-final}
     * @param ocrType "skip-text" or "force-ocr" adds the matching flag; any other value adds none
     * @param ocrRenderType value passed to {@code --pdf-renderer}
     * @param removeImagesAfter when true, the OCR output is rewritten without images
     * @param tempInputFile path of the PDF to process
     * @param tempOutputFile path the processed PDF is written to
     * @param sidecarTextPath path for the sidecar text file, or null when none is wanted
     * @throws IOException if OCRmyPDF or Ghostscript exits with a non-zero return code
     * @throws InterruptedException declared for the process executor calls
     */
    private void processWithOcrMyPdf(
            List<String> selectedLanguages,
            Boolean sidecar,
            Boolean deskew,
            Boolean clean,
            Boolean cleanFinal,
            String ocrType,
            String ocrRenderType,
            Boolean removeImagesAfter,
            Path tempInputFile,
            Path tempOutputFile,
            Path sidecarTextPath)
            throws IOException, InterruptedException {
        List<String> command =
                buildOcrMyPdfCommand(
                        selectedLanguages,
                        sidecar,
                        deskew,
                        clean,
                        cleanFinal,
                        ocrType,
                        ocrRenderType,
                        tempInputFile,
                        tempOutputFile,
                        sidecarTextPath);
        // Run CLI command
        ProcessExecutorResult result =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                        .runCommandWithOutputHandling(command);
        // Retry single-threaded when the multiprocessing primitives are reported as unsupported
        if (result.getRc() != 0
                && result.getMessages().contains("multiprocessing/synchronize.py")
                && result.getMessages().contains("OSError: [Errno 38] Function not implemented")) {
            command.add("--jobs");
            command.add("1");
            result =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                            .runCommandWithOutputHandling(command);
        }
        if (result.getRc() != 0) {
            throw new IOException("OCRmyPDF failed with return code: " + result.getRc());
        }
        // Remove images from the OCR processed PDF if the flag is set to true
        if (Boolean.TRUE.equals(removeImagesAfter)) {
            removeImagesWithGhostscript(tempOutputFile);
        }
    }

    /** Assembles the OCRmyPDF argument list; input and output paths come last. */
    private static List<String> buildOcrMyPdfCommand(
            List<String> selectedLanguages,
            Boolean sidecar,
            Boolean deskew,
            Boolean clean,
            Boolean cleanFinal,
            String ocrType,
            String ocrRenderType,
            Path tempInputFile,
            Path tempOutputFile,
            Path sidecarTextPath) {
        List<String> command =
                new ArrayList<>(
                        Arrays.asList(
                                "ocrmypdf",
                                "--verbose",
                                "2",
                                "--output-type",
                                "pdf",
                                "--pdf-renderer",
                                ocrRenderType));
        if (Boolean.TRUE.equals(sidecar) && sidecarTextPath != null) {
            command.add("--sidecar");
            command.add(sidecarTextPath.toString());
        }
        if (Boolean.TRUE.equals(deskew)) {
            command.add("--deskew");
        }
        if (Boolean.TRUE.equals(clean)) {
            command.add("--clean");
        }
        if (Boolean.TRUE.equals(cleanFinal)) {
            command.add("--clean-final");
        }
        // Any other ocrType value (including null or empty) adds no flag
        if ("skip-text".equals(ocrType)) {
            command.add("--skip-text");
        } else if ("force-ocr".equals(ocrType)) {
            command.add("--force-ocr");
        }
        command.add("--language");
        command.add(String.join("+", selectedLanguages));
        command.add(tempInputFile.toString());
        command.add(tempOutputFile.toString());
        return command;
    }

    /** Rewrites {@code pdfPath} in place without images; fails if Ghostscript reports an error. */
    private void removeImagesWithGhostscript(Path pdfPath)
            throws IOException, InterruptedException {
        try (TempFile tempPdfWithoutImages = new TempFile(tempFileManager, "_no_images.pdf")) {
            List<String> gsCommand =
                    Arrays.asList(
                            "gs",
                            "-sDEVICE=pdfwrite",
                            "-dFILTERIMAGE",
                            "-o",
                            tempPdfWithoutImages.getPath().toString(),
                            pdfPath.toString());
            ProcessExecutorResult gsResult =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
                            .runCommandWithOutputHandling(gsCommand);
            // Do not overwrite the valid OCR output with an empty or partial file
            if (gsResult.getRc() != 0) {
                throw new IOException(
                        "Ghostscript image removal failed with return code: " + gsResult.getRc());
            }
            // Replace output file with version without images
            Files.copy(
                    tempPdfWithoutImages.getPath(),
                    pdfPath,
                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
        }
    }
    private void processWithTesseract(
            List<String> selectedLanguages, String ocrType, Path tempInputFile, Path tempOutputFile)
            throws IOException, InterruptedException {
        // Create temp directory for Tesseract processing
        try (TempDirectory tempDir = new TempDirectory(tempFileManager)) {
            File tempOutputDir = new File(tempDir.getPath().toFile(), "output");
            File tempImagesDir = new File(tempDir.getPath().toFile(), "images");
            File finalOutputFile = new File(tempDir.getPath().toFile(), "final_output.pdf");
            // Create directories
            tempOutputDir.mkdirs();
            tempImagesDir.mkdirs();
            // Save input file
            inputFile.transferTo(tempInputFile);
            PDFMergerUtility merger = new PDFMergerUtility();
            merger.setDestinationFileName(finalOutputFile.toString());
-            try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) {
+            try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
                PDFRenderer pdfRenderer = new PDFRenderer(document);
                int pageCount = document.getNumberOfPages();
@ -135,35 +374,20 @@ public class OCRController {
                                new File(tempOutputDir, String.format("page_%d", pageNum))
                                        .toString());
                        command.add("-l");
-                        command.add(String.join("+", languages));
+                        command.add(String.join("+", selectedLanguages));
-                        // Always output PDF
+                        command.add("pdf"); // Always output PDF
                        command.add("pdf");
-                        // Use ProcessExecutor to run tesseract command
+                        ProcessExecutorResult result =
-                        try {
+                                ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
-                            ProcessExecutorResult result =
+                                        .runCommandWithOutputHandling(command);
                                    ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
                                            .runCommandWithOutputHandling(command);
-                            log.debug(
+                        if (result.getRc() != 0) {
-                                    "Tesseract OCR completed for page {} with exit code {}",
+                            throw new RuntimeException(
-                                    pageNum,
+                                    "Tesseract failed with exit code: " + result.getRc());
                                    result.getRc());
                            // Add OCR'd PDF to merger
                            merger.addSource(pageOutputPath);
                        } catch (IOException | InterruptedException e) {
                            log.error(
                                    "Error processing page {} with tesseract: {}",
                                    pageNum,
                                    e.getMessage());
                            // If OCR fails, fall back to the original page
                            try (PDDocument pageDoc = new PDDocument()) {
                                pageDoc.addPage(page);
                                pageDoc.save(pageOutputPath);
                                merger.addSource(pageOutputPath);
                            }
                        }
                        // Add OCR'd PDF to merger
                        merger.addSource(pageOutputPath);
                    } else {
                        // Save original page without OCR
                        try (PDDocument pageDoc = new PDDocument()) {
@ -178,40 +402,11 @@ public class OCRController {
            // Merge all pages into final PDF
            merger.mergeDocuments(null);
-            // Read the final PDF file
+            // Copy final output to the expected location
-            byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath());
+            Files.copy(
-            String outputFilename =
+                    finalOutputFile.toPath(),
-                    Filenames.toSimpleFileName(inputFile.getOriginalFilename())
+                    tempOutputFile,
-                                    .replaceFirst("[.][^.]+$", "")
+                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
                            + "_OCR.pdf";
            return ResponseEntity.ok()
                    .header(
                            "Content-Disposition",
                            "attachment; filename=\"" + outputFilename + "\"")
                    .contentType(MediaType.APPLICATION_PDF)
                    .body(pdfContent);
        } finally {
            // Clean up the temp directory and all its contents
            tempFileManager.deleteTempDirectory(tempDirPath);
        }
    }
    /**
     * Writes {@code file} into {@code zipOut} as an entry named {@code filename}.
     *
     * <p>A file that does not exist is skipped with a warning; no entry is created for it.
     *
     * @param file source file to add
     * @param filename name of the entry inside the archive
     * @param zipOut open zip stream that receives the entry
     * @throws IOException if the file cannot be read or the entry cannot be written
     */
    private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
            throws IOException {
        if (file.exists()) {
            try (FileInputStream source = new FileInputStream(file)) {
                zipOut.putNextEntry(new ZipEntry(filename));
                // Stream the whole file into the current entry
                source.transferTo(zipOut);
                zipOut.closeEntry();
            }
        } else {
            log.warn("File {} does not exist, skipping", file);
        }
    }
 }
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java
@ -15,8 +15,7 @@ import io.github.pixee.security.Filenames;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;
-import lombok.RequiredArgsConstructor;
+import stirling.software.SPDF.config.EndpointConfiguration;
 import stirling.software.common.model.api.PDFFile;
 import stirling.software.common.service.CustomPDFDocumentFactory;
 import stirling.software.common.util.ProcessExecutor;
@ -28,17 +27,28 @@ import stirling.software.common.util.WebResponseUtils;
@RestController
@RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs")
@RequiredArgsConstructor
 public class RepairController {
    private final CustomPDFDocumentFactory pdfDocumentFactory;
    private final TempFileManager tempFileManager;
    private final boolean ghostscriptEnabled;
    private final boolean qpdfEnabled;
    public RepairController(
            CustomPDFDocumentFactory pdfDocumentFactory,
            TempFileManager tempFileManager,
            EndpointConfiguration endpointConfiguration) {
        this.pdfDocumentFactory = pdfDocumentFactory;
        this.tempFileManager = tempFileManager;
        this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
        this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
    }
    @PostMapping(consumes = "multipart/form-data", value = "/repair")
    @Operation(
            summary = "Repair a PDF file",
            description =
-                    "This endpoint repairs a given PDF file by running qpdf command. The PDF is"
+                    "This endpoint repairs a given PDF file by running Ghostscript (primary), qpdf (fallback), or PDFBox (if no external tools available). The PDF is"
                            + " first saved to a temporary location, repaired, read back, and then"
                            + " returned as a response. Input:PDF Output:PDF Type:SISO")
    public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile file)
@ -46,25 +56,72 @@ public class RepairController {
        MultipartFile inputFile = file.getFileInput();
        // Use TempFile with try-with-resources for automatic cleanup
-        try (TempFile tempFile = new TempFile(tempFileManager, ".pdf")) {
+        try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
                TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
            // Save the uploaded file to the temporary location
-            inputFile.transferTo(tempFile.getFile());
+            inputFile.transferTo(tempInputFile.getFile());
-            List<String> command = new ArrayList<>();
+            boolean repairSuccess = false;
            command.add("qpdf");
            command.add("--replace-input"); // Automatically fixes problems it can
            command.add("--qdf"); // Linearizes and normalizes PDF structure
            command.add("--object-streams=disable"); // Can help with some corruptions
            command.add(tempFile.getFile().getAbsolutePath());
-            ProcessExecutorResult returnCode =
+            // Try Ghostscript first if available
-                    ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
+            if (ghostscriptEnabled) {
-                            .runCommandWithOutputHandling(command);
+                try {
                    List<String> gsCommand = new ArrayList<>();
                    gsCommand.add("gs");
                    gsCommand.add("-o");
                    gsCommand.add(tempOutputFile.getPath().toString());
                    gsCommand.add("-sDEVICE=pdfwrite");
                    gsCommand.add(tempInputFile.getPath().toString());
-            // Read the optimized PDF file
+                    ProcessExecutorResult gsResult =
-            byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempFile.getFile());
+                            ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
                                    .runCommandWithOutputHandling(gsCommand);
-            // Return the optimized PDF as a response
+                    if (gsResult.getRc() == 0) {
                        repairSuccess = true;
                    }
                } catch (Exception e) {
                    // Log and continue to QPDF fallback
                    System.out.println(
                            "Ghostscript repair failed, trying QPDF fallback: " + e.getMessage());
                }
            }
            // Fallback to QPDF if Ghostscript failed or not available
            if (!repairSuccess && qpdfEnabled) {
                List<String> qpdfCommand = new ArrayList<>();
                qpdfCommand.add("qpdf");
                qpdfCommand.add("--replace-input"); // Automatically fixes problems it can
                qpdfCommand.add("--qdf"); // Linearizes and normalizes PDF structure
                qpdfCommand.add("--object-streams=disable"); // Can help with some corruptions
                qpdfCommand.add(tempInputFile.getPath().toString());
                qpdfCommand.add(tempOutputFile.getPath().toString());
                ProcessExecutorResult qpdfResult =
                        ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
                                .runCommandWithOutputHandling(qpdfCommand);
                repairSuccess = true;
            }
            // Use PDFBox as last resort if no external tools are available
            if (!repairSuccess) {
                if (!ghostscriptEnabled && !qpdfEnabled) {
                    // Basic PDFBox repair - load and save to fix structural issues
                    try (var document = pdfDocumentFactory.load(tempInputFile.getFile())) {
                        document.save(tempOutputFile.getFile());
                        repairSuccess = true;
                    }
                } else {
                    throw new IOException("PDF repair failed with available tools");
                }
            }
            // Read the repaired PDF file
            byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.getFile());
            // Return the repaired PDF as a response
            String outputFilename =
                    Filenames.toSimpleFileName(inputFile.getOriginalFilename())
                                    .replaceFirst("[.][^.]+$", "")
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java
@ -19,6 +19,18 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
            defaultValue = "[\"eng\"]")
    private List<String> languages;
    @Schema(description = "Include OCR text in a sidecar text file if set to true")
    private boolean sidecar;
    @Schema(description = "Deskew the input file if set to true")
    private boolean deskew;
    @Schema(description = "Clean the input file if set to true")
    private boolean clean;
    @Schema(description = "Clean the final output if set to true")
    private boolean cleanFinal;
    @Schema(
            description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'",
            requiredMode = Schema.RequiredMode.REQUIRED,
@ -31,4 +43,7 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
            allowableValues = {"hocr", "sandwich"},
            defaultValue = "hocr")
    private String ocrRenderType = "hocr";
    @Schema(description = "Remove images from the output PDF if set to true")
    private boolean removeImagesAfter;
 }
--- a/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html
+++ b/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html
@ -79,6 +79,30 @@
                  </select>
                </div>
                <br>
                <div class="mb-3" th:if>
                  <label class="form-label">OCR Options</label>
                  <div class="form-check">
                    <input type="checkbox" class="form-check-input" id="sidecar" name="sidecar" value="true">
                    <label class="form-check-label" for="sidecar">Include OCR text in sidecar text file</label>
                  </div>
                  <div class="form-check">
                    <input type="checkbox" class="form-check-input" id="deskew" name="deskew" value="true">
                    <label class="form-check-label" for="deskew">Deskew input file</label>
                  </div>
                  <div class="form-check">
                    <input type="checkbox" class="form-check-input" id="clean" name="clean" value="true">
                    <label class="form-check-label" for="clean">Clean input file</label>
                  </div>
                  <div class="form-check">
                    <input type="checkbox" class="form-check-input" id="cleanFinal" name="cleanFinal" value="true">
                    <label class="form-check-label" for="cleanFinal">Clean final output</label>
                  </div>
                  <div class="form-check">
                    <input type="checkbox" class="form-check-input" id="removeImagesAfter" name="removeImagesAfter" value="true">
                    <label class="form-check-label" for="removeImagesAfter">Remove images from output PDF</label>
                  </div>
                </div>
                <br>
                <button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{ocr.submit}"></button>
              </form>
              <script th:inline="javascript">