restore OCRMyPDF and ghostscript compression

2025-08-27 14:49:23 +00:00 · 2025-06-30 22:27:45 +01:00 · 2025-06-30 22:27:45 +01:00 · 782c30f934
commit 782c30f934
parent 11e3ccd19f
13 changed files with 646 additions and 143 deletions
--- a/3
+++ b/3
@ -51,7 +51,6 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
    tini \
    bash \
    curl \
-    qpdf \
    shadow \
    su-exec \
    openssl \
@ -69,9 +68,11 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
 	tesseract-ocr-data-deu \
 	tesseract-ocr-data-fra \
 	tesseract-ocr-data-por \
+    unpaper \
    # CV
    py3-opencv \
    python3 \
+    ocrmypdf \
    py3-pip \
    py3-pillow@testing \
    py3-pdf2image@testing && \
--- a/Dockerfile.fat
+++ b/Dockerfile.fat
@ -76,16 +76,17 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
    # pdftohtml
    poppler-utils \
    # OCR MY PDF (unpaper for deskew and other advanced features)
-    qpdf \
    tesseract-ocr-data-eng \
    tesseract-ocr-data-chi_sim \
 	tesseract-ocr-data-deu \
 	tesseract-ocr-data-fra \
 	tesseract-ocr-data-por \
+    unpaper \
    font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \
    # CV
    py3-opencv \
    python3 \
+    ocrmypdf \
    py3-pip \
    py3-pillow@testing \
    py3-pdf2image@testing && \
--- a/common/src/main/java/stirling/software/common/model/ApplicationProperties.java
+++ b/common/src/main/java/stirling/software/common/model/ApplicationProperties.java
@ -545,6 +545,8 @@ public class ApplicationProperties {
            private int calibreSessionLimit;
            private int qpdfSessionLimit;
            private int tesseractSessionLimit;
+            private int ghostscriptSessionLimit;
+            private int ocrMyPdfSessionLimit;

            public int getQpdfSessionLimit() {
                return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2;
@ -577,6 +579,14 @@ public class ApplicationProperties {
            public int getCalibreSessionLimit() {
                return calibreSessionLimit > 0 ? calibreSessionLimit : 1;
            }
+
+            /**
+             * Returns the configured Ghostscript session limit; a non-positive configured value
+             * falls back to the default of 8.
+             */
+            public int getGhostscriptSessionLimit() {
+                return ghostscriptSessionLimit > 0 ? ghostscriptSessionLimit : 8;
+            }
+
+            /**
+             * Returns the configured OCRmyPDF session limit; a non-positive configured value
+             * falls back to the default of 2.
+             */
+            public int getOcrMyPdfSessionLimit() {
+                return ocrMyPdfSessionLimit > 0 ? ocrMyPdfSessionLimit : 2;
+            }
        }

        @Data
@ -589,6 +599,8 @@ public class ApplicationProperties {
            private long calibreTimeoutMinutes;
            private long tesseractTimeoutMinutes;
            private long qpdfTimeoutMinutes;
+            private long ghostscriptTimeoutMinutes;
+            private long ocrMyPdfTimeoutMinutes;

            public long getTesseractTimeoutMinutes() {
                return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30;
@ -621,6 +633,14 @@ public class ApplicationProperties {
            public long getCalibreTimeoutMinutes() {
                return calibreTimeoutMinutes > 0 ? calibreTimeoutMinutes : 30;
            }
+
+            /**
+             * Returns the configured Ghostscript timeout in minutes; a non-positive configured
+             * value falls back to the default of 30.
+             */
+            public long getGhostscriptTimeoutMinutes() {
+                return ghostscriptTimeoutMinutes > 0 ? ghostscriptTimeoutMinutes : 30;
+            }
+
+            /**
+             * Returns the configured OCRmyPDF timeout in minutes; a non-positive configured
+             * value falls back to the default of 30.
+             */
+            public long getOcrMyPdfTimeoutMinutes() {
+                return ocrMyPdfTimeoutMinutes > 0 ? ocrMyPdfTimeoutMinutes : 30;
+            }
        }
    }
 }
--- a/common/src/main/java/stirling/software/common/util/ProcessExecutor.java
+++ b/common/src/main/java/stirling/software/common/util/ProcessExecutor.java
@ -84,6 +84,16 @@ public class ProcessExecutor {
                                                .getProcessExecutor()
                                                .getSessionLimit()
                                                .getCalibreSessionLimit();
+                                case GHOSTSCRIPT ->
+                                        applicationProperties
+                                                .getProcessExecutor()
+                                                .getSessionLimit()
+                                                .getGhostscriptSessionLimit();
+                                case OCR_MY_PDF ->
+                                        applicationProperties
+                                                .getProcessExecutor()
+                                                .getSessionLimit()
+                                                .getOcrMyPdfSessionLimit();
                            };

                    long timeoutMinutes =
@ -128,6 +138,16 @@ public class ProcessExecutor {
                                                .getProcessExecutor()
                                                .getTimeoutMinutes()
                                                .getCalibreTimeoutMinutes();
+                                case GHOSTSCRIPT ->
+                                        applicationProperties
+                                                .getProcessExecutor()
+                                                .getTimeoutMinutes()
+                                                .getGhostscriptTimeoutMinutes();
+                                case OCR_MY_PDF ->
+                                        applicationProperties
+                                                .getProcessExecutor()
+                                                .getTimeoutMinutes()
+                                                .getOcrMyPdfTimeoutMinutes();
                            };
                    return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes);
                });
@ -278,7 +298,9 @@ public class ProcessExecutor {
        INSTALL_APP,
        CALIBRE,
        TESSERACT,
-        QPDF
+        QPDF,
+        GHOSTSCRIPT,
+        OCR_MY_PDF
    }

    public class ProcessExecutorResult {
--- a/common/src/main/java/stirling/software/common/util/TempFileRegistry.java
+++ b/common/src/main/java/stirling/software/common/util/TempFileRegistry.java
@ -9,7 +9,6 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ConcurrentSkipListSet;
 import java.util.stream.Collectors;

 import org.springframework.stereotype.Component;
@ -27,8 +26,7 @@ public class TempFileRegistry {
    private final ConcurrentMap<Path, Instant> registeredFiles = new ConcurrentHashMap<>();
    private final Set<Path> thirdPartyTempFiles =
            Collections.newSetFromMap(new ConcurrentHashMap<>());
-	     private final Set<Path> tempDirectories =
-	            Collections.newSetFromMap(new ConcurrentHashMap<>());
+    private final Set<Path> tempDirectories = Collections.newSetFromMap(new ConcurrentHashMap<>());

    /**
     * Register a temporary file with the registry.
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java
@ -21,6 +21,8 @@ public class EndpointConfiguration {
    private final ApplicationProperties applicationProperties;
    private Map<String, Boolean> endpointStatuses = new ConcurrentHashMap<>();
    private Map<String, Set<String>> endpointGroups = new ConcurrentHashMap<>();
+    private Set<String> disabledGroups = new HashSet<>();
+    private Map<String, Set<String>> endpointAlternatives = new ConcurrentHashMap<>();
    private final boolean runningProOrHigher;

    public EndpointConfiguration(
@ -51,16 +53,36 @@ public class EndpointConfiguration {
        if (endpoint.startsWith("/")) {
            endpoint = endpoint.substring(1);
        }
+        
+        // Check if endpoint has alternatives (multiple tools can handle it)
+        Set<String> alternatives = endpointAlternatives.get(endpoint);
+        if (alternatives != null && !alternatives.isEmpty()) {
+            // Endpoint is enabled if ANY of its alternative tools are enabled
+            for (String toolGroup : alternatives) {
+                if (isGroupEnabled(toolGroup)) {
+                    return true;
+                }
+            }
+            return false; // All alternative tools are disabled
+        }
+        
+        // Fallback to standard endpoint status check
        return endpointStatuses.getOrDefault(endpoint, true);
    }

    public boolean isGroupEnabled(String group) {
+        // Check if group is explicitly disabled first
+        if (disabledGroups.contains(group)) {
+            return false;
+        }
+
        Set<String> endpoints = endpointGroups.get(group);
        if (endpoints == null || endpoints.isEmpty()) {
            log.debug("Group '{}' does not exist or has no endpoints", group);
            return false;
        }

+        // Additional check: if any endpoint in the group is disabled, consider the group disabled
        for (String endpoint : endpoints) {
            if (!isEndpointEnabled(endpoint)) {
                return false;
@ -74,7 +96,22 @@ public class EndpointConfiguration {
        endpointGroups.computeIfAbsent(group, k -> new HashSet<>()).add(endpoint);
    }
    
+    /**
+     * Registers {@code toolGroup} as one of the tool groups that can serve {@code endpoint}.
+     * Repeated calls for the same endpoint accumulate the groups in a set.
+     *
+     * @param endpoint the endpoint name used as the lookup key
+     * @param toolGroup the tool group to add as an alternative for that endpoint
+     */
+    public void addEndpointAlternative(String endpoint, String toolGroup) {
+        endpointAlternatives.computeIfAbsent(endpoint, k -> new HashSet<>()).add(toolGroup);
+    }
+
+    /**
+     * Records {@code group} as explicitly disabled and disables every endpoint currently
+     * registered under it.
+     *
+     * @param group the group name to disable
+     */
+    public void disableGroup(String group) {
+        // Recorded before the endpoint lookup, so the group is marked disabled even when it
+        // has no registered endpoints.
+        disabledGroups.add(group);
+        Set<String> endpoints = endpointGroups.get(group);
+        if (endpoints != null) {
+            for (String endpoint : endpoints) {
+                disableEndpoint(endpoint);
+            }
+        }
+    }
+
    public void enableGroup(String group) {
+        disabledGroups.remove(group);
        Set<String> endpoints = endpointGroups.get(group);
        if (endpoints != null) {
            for (String endpoint : endpoints) {
@ -83,13 +120,8 @@ public class EndpointConfiguration {
        }
    }

-    public void disableGroup(String group) {
-        Set<String> endpoints = endpointGroups.get(group);
-        if (endpoints != null) {
-            for (String endpoint : endpoints) {
-                disableEndpoint(endpoint);
-            }
-        }
+    public Set<String> getDisabledGroups() {
+        return new HashSet<>(disabledGroups);
    }

    public void logDisabledEndpointsSummary() {
@ -101,6 +133,12 @@ public class EndpointConfiguration {
                        .sorted()
                        .toList();

+        if (!disabledGroups.isEmpty()) {
+            log.info(
+                    "Disabled groups: {}",
+                    String.join(", ", disabledGroups.stream().sorted().toList()));
+        }
+
        if (!disabledList.isEmpty()) {
            log.info(
                    "Total disabled endpoints: {}. Disabled endpoints: {}",
@ -212,7 +250,6 @@ public class EndpointConfiguration {
        // Unoconvert
        addEndpointToGroup("Unoconvert", "file-to-pdf");

-        addEndpointToGroup("tesseract", "ocr-pdf");

        // Java
        addEndpointToGroup("Java", "merge-pdfs");
@ -261,8 +298,13 @@ public class EndpointConfiguration {
        addEndpointToGroup("Javascript", "compare");
        addEndpointToGroup("Javascript", "adjust-contrast");

-        // qpdf dependent endpoints
-        addEndpointToGroup("qpdf", "repair");
+        // Multi-tool endpoints - endpoints that can be handled by multiple tools
+        addEndpointAlternative("repair", "qpdf");
+        addEndpointAlternative("repair", "Ghostscript");
+        addEndpointAlternative("compress-pdf", "qpdf");
+        addEndpointAlternative("compress-pdf", "Ghostscript");
+        addEndpointAlternative("ocr-pdf", "tesseract");
+        addEndpointAlternative("ocr-pdf", "OCRmyPDF");

        // Weasyprint dependent endpoints
        addEndpointToGroup("Weasyprint", "html-to-pdf");
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java
@ -34,6 +34,8 @@ public class ExternalAppDepConfig {
                new HashMap<>() {

                    {
+                        put("gs", List.of("Ghostscript"));
+                        put("ocrmypdf", List.of("OCRmyPDF"));
                        put("soffice", List.of("LibreOffice"));
                        put(weasyprintPath, List.of("Weasyprint"));
                        put("pdftohtml", List.of("Pdftohtml"));
@ -109,6 +111,8 @@ public class ExternalAppDepConfig {
    @PostConstruct
    public void checkDependencies() {
        // Check core dependencies
+        checkDependencyAndDisableGroup("gs");
+        checkDependencyAndDisableGroup("ocrmypdf");
        checkDependencyAndDisableGroup("tesseract");
        checkDependencyAndDisableGroup("soffice");
        checkDependencyAndDisableGroup("qpdf");
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java
@ -65,12 +65,14 @@ public class CompressController {

    private final CustomPDFDocumentFactory pdfDocumentFactory;
    private final boolean qpdfEnabled;
+    private final boolean ghostscriptEnabled;

    public CompressController(
            CustomPDFDocumentFactory pdfDocumentFactory,
            EndpointConfiguration endpointConfiguration) {
        this.pdfDocumentFactory = pdfDocumentFactory;
        this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
+        this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
    }

    @Data
@ -697,25 +699,69 @@ public class CompressController {

            boolean sizeMet = false;
            boolean imageCompressionApplied = false;
-            boolean qpdfCompressionApplied = false;
-
-            if (qpdfEnabled && optimizeLevel <= 3) {
-                optimizeLevel = 4;
-            }
+            boolean externalCompressionApplied = false;

            while (!sizeMet && optimizeLevel <= 9) {
-                // Apply image compression for levels 4-9
-                if ((optimizeLevel >= 3 || Boolean.TRUE.equals(convertToGrayscale))
-                        && !imageCompressionApplied) {
-                    double scaleFactor = getScaleFactorForLevel(optimizeLevel);
-                    float jpegQuality = getJpegQualityForLevel(optimizeLevel);
+                // Apply external compression first
+                if (!externalCompressionApplied) {
+                    boolean ghostscriptSuccess = false;

-                    // Compress images
+                    // Try Ghostscript first if available - for ANY compression level
+                    if (ghostscriptEnabled) {
+                        try {
+                            applyGhostscriptCompression(
+                                    request, optimizeLevel, currentFile, tempFiles);
+                            log.info("Ghostscript compression applied successfully");
+                            ghostscriptSuccess = true;
+                        } catch (IOException e) {
+                            log.warn("Ghostscript compression failed, trying fallback methods");
+                        }
+                    }
+
+                    // Fallback to QPDF if Ghostscript failed or not available (levels 1-3 only)
+                    if (!ghostscriptSuccess && qpdfEnabled && optimizeLevel <= 3) {
+                        try {
+                            applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
+                            log.info("QPDF compression applied successfully");
+                        } catch (IOException e) {
+                            log.warn("QPDF compression also failed");
+                        }
+                    }
+
+                    if (!ghostscriptSuccess && !qpdfEnabled) {
+                        log.info(
+                                "No external compression tools available, using image compression only");
+                    }
+
+                    externalCompressionApplied = true;
+
+                    // Skip image compression if Ghostscript succeeded
+                    if (ghostscriptSuccess) {
+                        imageCompressionApplied = true;
+                    }
+                }
+
+                // Apply image compression for levels 4+ (or when grayscale conversion is
+                // requested), unless Ghostscript already succeeded for this level
+                if ((optimizeLevel >= 4 || Boolean.TRUE.equals(convertToGrayscale))
+                        && !imageCompressionApplied) {
+                    // Use different scale factors based on level
+                    double scaleFactor =
+                            switch (optimizeLevel) {
+                                case 4 -> 0.95; // 95% of original size
+                                case 5 -> 0.9; // 90% of original size
+                                case 6 -> 0.8; // 80% of original size
+                                case 7 -> 0.7; // 70% of original size
+                                case 8 -> 0.65; // 65% of original size
+                                case 9 -> 0.5; // 50% of original size
+                                default -> 1.0;
+                            };
+
+                    log.info("Applying image compression with scale factor: {}", scaleFactor);
                    Path compressedImageFile =
                            compressImagesInPDF(
                                    currentFile,
                                    scaleFactor,
-                                    jpegQuality,
+                                    0.7f, // Default JPEG quality
                                    Boolean.TRUE.equals(convertToGrayscale));

                    tempFiles.add(compressedImageFile);
@ -723,18 +769,6 @@ public class CompressController {
                    imageCompressionApplied = true;
                }

-                // Apply QPDF compression for all levels
-                if (!qpdfCompressionApplied && qpdfEnabled) {
-                    applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
-                    qpdfCompressionApplied = true;
-                } else if (!qpdfCompressionApplied) {
-                    // If QPDF is disabled, mark as applied and log
-                    if (!qpdfEnabled) {
-                        log.info("Skipping QPDF compression as QPDF group is disabled");
-                    }
-                    qpdfCompressionApplied = true;
-                }
-
                // Check if target size reached or not in auto mode
                long outputFileSize = Files.size(currentFile);
                if (outputFileSize <= expectedOutputSize || !autoMode) {
@ -754,7 +788,7 @@ public class CompressController {
                    } else {
                        // Reset flags for next iteration with higher optimization level
                        imageCompressionApplied = false;
-                        qpdfCompressionApplied = false;
+                        externalCompressionApplied = false;
                        optimizeLevel = newOptimizeLevel;
                    }
                }
@ -788,6 +822,96 @@ public class CompressController {
        }
    }

+    /**
+     * Compresses {@code currentFile} by running it through Ghostscript's {@code pdfwrite}
+     * device and, on success, copying the Ghostscript output back over {@code currentFile}.
+     *
+     * <p>The optimization level selects the {@code -dPDFSETTINGS} preset (1 = prepress,
+     * 2 = printer, 3 = ebook, anything else = screen); levels 6 to 10 additionally pass
+     * explicit colour/gray/mono image resolutions.
+     *
+     * @param request the original optimize request (not read by this method)
+     * @param optimizeLevel compression level used to pick the Ghostscript settings
+     * @param currentFile the working PDF; overwritten with the compressed output on success
+     * @param tempFiles collector for temporary files; the Ghostscript output file is added to it
+     * @throws IOException if Ghostscript cannot be run, the call is interrupted, Ghostscript
+     *     exits with a non-zero code, or the output cannot be copied back
+     */
+    private void applyGhostscriptCompression(
+            OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
+            throws IOException {
+
+        long preGsSize = Files.size(currentFile);
+        log.info("Pre-Ghostscript file size: {}", GeneralUtils.formatBytes(preGsSize));
+
+        // Create output file for Ghostscript
+        Path gsOutputFile = Files.createTempFile("gs_output_", ".pdf");
+        tempFiles.add(gsOutputFile);
+
+        // Build Ghostscript command based on optimization level
+        List<String> command = new ArrayList<>();
+        command.add("gs");
+        command.add("-sDEVICE=pdfwrite");
+        command.add("-dCompatibilityLevel=1.5");
+        command.add("-dNOPAUSE");
+        command.add("-dQUIET");
+        command.add("-dBATCH");
+
+        // Map optimization levels to a Ghostscript quality preset
+        String preset =
+                switch (optimizeLevel) {
+                    case 1 -> "/prepress";
+                    case 2 -> "/printer";
+                    case 3 -> "/ebook";
+                    default -> "/screen";
+                };
+        command.add("-dPDFSETTINGS=" + preset);
+
+        // Higher levels also cap image resolution: {colour and gray dpi, mono dpi}
+        int[] imageDpi =
+                switch (optimizeLevel) {
+                    case 6, 7 -> new int[] {150, 300};
+                    case 8, 9 -> new int[] {100, 200};
+                    case 10 -> new int[] {72, 150};
+                    default -> null;
+                };
+        if (imageDpi != null) {
+            command.add("-dColorImageResolution=" + imageDpi[0]);
+            command.add("-dGrayImageResolution=" + imageDpi[0]);
+            command.add("-dMonoImageResolution=" + imageDpi[1]);
+        }
+
+        command.add("-sOutputFile=" + gsOutputFile.toString());
+        command.add(currentFile.toString());
+
+        ProcessExecutorResult result;
+        try {
+            result =
+                    ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
+                            .runCommandWithOutputHandling(command);
+        } catch (Exception e) {
+            // Keep the interrupt visible to callers even though only IOException is declared
+            if (e instanceof InterruptedException) {
+                Thread.currentThread().interrupt();
+            }
+            log.warn("Ghostscript compression failed, will fallback to other methods", e);
+            throw new IOException("Ghostscript compression failed", e);
+        }
+
+        // Checked outside the try block so this failure is logged and thrown exactly once
+        // instead of being caught and re-wrapped by the handler above.
+        if (result.getRc() != 0) {
+            log.warn("Ghostscript compression failed with return code: {}", result.getRc());
+            throw new IOException(
+                    "Ghostscript compression failed with return code: " + result.getRc());
+        }
+
+        // Update current file to the Ghostscript output
+        Files.copy(gsOutputFile, currentFile, StandardCopyOption.REPLACE_EXISTING);
+
+        long postGsSize = Files.size(currentFile);
+        // Guard the percentage against a zero-byte input (would otherwise print NaN/Infinity)
+        double gsReduction = preGsSize > 0 ? 100.0 - ((postGsSize * 100.0) / preGsSize) : 0.0;
+        log.info(
+                "Post-Ghostscript file size: {} (reduced by {}%)",
+                GeneralUtils.formatBytes(postGsSize), String.format("%.1f", gsReduction));
+    }
+
    // Run QPDF compression
    private void applyQpdfCompression(
            OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java
@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.misc;

 import java.awt.image.BufferedImage;
 import java.io.*;
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
 import java.util.zip.ZipEntry;
@ -26,26 +27,42 @@ import io.github.pixee.security.Filenames;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;

-import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;

+import stirling.software.SPDF.config.EndpointConfiguration;
 import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
 import stirling.software.common.model.ApplicationProperties;
 import stirling.software.common.service.CustomPDFDocumentFactory;
 import stirling.software.common.util.ProcessExecutor;
 import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
+import stirling.software.common.util.TempDirectory;
+import stirling.software.common.util.TempFile;
 import stirling.software.common.util.TempFileManager;
+import stirling.software.common.util.WebResponseUtils;

@RestController
@RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs")
@Slf4j
-@RequiredArgsConstructor
 public class OCRController {

    private final ApplicationProperties applicationProperties;
    private final CustomPDFDocumentFactory pdfDocumentFactory;
    private final TempFileManager tempFileManager;
+    private final boolean ocrMyPdfEnabled;
+    private final boolean tesseractEnabled;
+
+    /**
+     * Creates the controller and captures, once at construction time, whether the
+     * {@code OCRmyPDF} and {@code tesseract} groups are enabled.
+     *
+     * @param applicationProperties application configuration
+     * @param pdfDocumentFactory factory stored for later PDF handling
+     * @param tempFileManager manager stored for later temporary-file handling
+     * @param endpointConfiguration queried here for the two group flags; not retained
+     */
+    public OCRController(
+            ApplicationProperties applicationProperties,
+            CustomPDFDocumentFactory pdfDocumentFactory,
+            TempFileManager tempFileManager,
+            EndpointConfiguration endpointConfiguration) {
+        this.applicationProperties = applicationProperties;
+        this.pdfDocumentFactory = pdfDocumentFactory;
+        this.tempFileManager = tempFileManager;
+        // NOTE(review): isGroupEnabled returns false for a group with no registered endpoints;
+        // confirm "OCRmyPDF" and "tesseract" still have endpoints added via addEndpointToGroup,
+        // otherwise both flags are always false.
+        this.ocrMyPdfEnabled = endpointConfiguration.isGroupEnabled("OCRmyPDF");
+        this.tesseractEnabled = endpointConfiguration.isGroupEnabled("tesseract");
+    }

    /** Gets the list of available Tesseract languages from the tessdata directory */
    public List<String> getAvailableTesseractLanguages() {
@ -63,39 +80,261 @@ public class OCRController {

+    /**
+     * Runs OCR on the uploaded PDF and returns the processed document.
+     *
+     * <p>The request must name at least one language and an {@code ocrRenderType} of
+     * {@code hocr} or {@code sandwich}; requested languages that are not reported by
+     * {@link #getAvailableTesseractLanguages()} are dropped. OCRmyPDF is used when its endpoint
+     * group is enabled, otherwise Tesseract; if neither is enabled the call fails.
+     *
+     * @param request the uploaded file together with the OCR options
+     * @return the OCR'd PDF, or a zip holding the PDF and the sidecar text file when a sidecar
+     *     was requested and created on the OCRmyPDF path
+     * @throws IOException on invalid options, when no OCR tool is available, or on I/O failure
+     * @throws InterruptedException NOTE(review): presumably propagated from the external
+     *     process execution; confirm against ProcessExecutor
+     */
    @PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
    @Operation(
-            summary = "Process PDF files with OCR using Tesseract",
+            summary = "Process a PDF file with OCR",
            description =
-                    "Takes a PDF file as input, performs OCR using specified languages and OCR type"
-                            + " (skip-text/force-ocr), and returns the processed PDF. Input:PDF"
-                            + " Output:PDF Type:SISO")
+                    "This endpoint processes a PDF file using OCR (Optical Character Recognition). "
+                            + "Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. "
+                            + "Uses OCRmyPDF if available, falls back to Tesseract. Input:PDF Output:PDF Type:SI-Conditional")
    public ResponseEntity<byte[]> processPdfWithOCR(
            @ModelAttribute ProcessPdfWithOcrRequest request)
            throws IOException, InterruptedException {
        MultipartFile inputFile = request.getFileInput();
-        List<String> languages = request.getLanguages();
+        List<String> selectedLanguages = request.getLanguages();
+        Boolean sidecar = request.isSidecar();
+        Boolean deskew = request.isDeskew();
+        Boolean clean = request.isClean();
+        Boolean cleanFinal = request.isCleanFinal();
        String ocrType = request.getOcrType();
+        String ocrRenderType = request.getOcrRenderType();
+        Boolean removeImagesAfter = request.isRemoveImagesAfter();

-        // Create a temp directory using TempFileManager directly
-        Path tempDirPath = tempFileManager.createTempDirectory();
-        File tempDir = tempDirPath.toFile();
+        if (selectedLanguages == null || selectedLanguages.isEmpty()) {
+            throw new IOException("Please select at least one language.");
+        }
+
+        if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) {
+            throw new IOException("ocrRenderType wrong");
+        }
+
+        // Get available Tesseract languages
+        List<String> availableLanguages = getAvailableTesseractLanguages();
+
+        // Validate selected languages
+        selectedLanguages =
+                selectedLanguages.stream().filter(availableLanguages::contains).toList();
+
+        if (selectedLanguages.isEmpty()) {
+            throw new IOException("None of the selected languages are valid.");
+        }
+
+        // Use try-with-resources for proper temp file management
+        try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
+                TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
+
+            inputFile.transferTo(tempInputFile.getFile());
+
+            // Created only on the OCRmyPDF path when a sidecar is requested, so it is closed
+            // manually in the finally block below instead of in the resource header.
+            TempFile sidecarTextFile = null;

            try {
-            File tempInputFile = new File(tempDir, "input.pdf");
-            File tempOutputDir = new File(tempDir, "output");
-            File tempImagesDir = new File(tempDir, "images");
-            File finalOutputFile = new File(tempDir, "final_output.pdf");
+                // Use OCRmyPDF if available (no fallback - error if it fails)
+                if (ocrMyPdfEnabled) {
+                    if (sidecar != null && sidecar) {
+                        sidecarTextFile = new TempFile(tempFileManager, ".txt");
+                    }
+
+                    processWithOcrMyPdf(
+                            selectedLanguages,
+                            sidecar,
+                            deskew,
+                            clean,
+                            cleanFinal,
+                            ocrType,
+                            ocrRenderType,
+                            removeImagesAfter,
+                            tempInputFile.getPath(),
+                            tempOutputFile.getPath(),
+                            sidecarTextFile != null ? sidecarTextFile.getPath() : null);
+                    log.info("OCRmyPDF processing completed successfully");
+                }
+                // Use Tesseract only if OCRmyPDF is not available
+                else if (tesseractEnabled) {
+                    // Only the languages and ocrType are forwarded on this path; sidecar, deskew,
+                    // clean, cleanFinal, ocrRenderType and removeImagesAfter are not passed on.
+                    processWithTesseract(
+                            selectedLanguages,
+                            ocrType,
+                            tempInputFile.getPath(),
+                            tempOutputFile.getPath());
+                    log.info("Tesseract processing completed successfully");
+                } else {
+                    throw new IOException("No OCR tools are available");
+                }
+
+                // Read the processed PDF file
+                byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath());
+
+                // Return the OCR processed PDF as a response
+                String outputFilename =
+                        Filenames.toSimpleFileName(inputFile.getOriginalFilename())
+                                        .replaceFirst("[.][^.]+$", "")
+                                + "_OCR.pdf";
+
+                if (sidecar != null && sidecar && sidecarTextFile != null) {
+                    // Create a zip file containing both the PDF and the text file
+                    String outputZipFilename =
+                            Filenames.toSimpleFileName(inputFile.getOriginalFilename())
+                                            .replaceFirst("[.][^.]+$", "")
+                                    + "_OCR.zip";
+
+                    try (TempFile tempZipFile = new TempFile(tempFileManager, ".zip");
+                            ZipOutputStream zipOut =
+                                    new ZipOutputStream(
+                                            Files.newOutputStream(tempZipFile.getPath()))) {
+
+                        // Add PDF file to the zip
+                        ZipEntry pdfEntry = new ZipEntry(outputFilename);
+                        zipOut.putNextEntry(pdfEntry);
+                        zipOut.write(pdfBytes);
+                        zipOut.closeEntry();
+
+                        // Add text file to the zip
+                        ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
+                        zipOut.putNextEntry(txtEntry);
+                        Files.copy(sidecarTextFile.getPath(), zipOut);
+                        zipOut.closeEntry();
+
+                        // finish() is called before the archive is read back below, while the
+                        // stream itself is still open inside this try-with-resources.
+                        zipOut.finish();
+
+                        byte[] zipBytes = Files.readAllBytes(tempZipFile.getPath());
+
+                        // Return the zip file containing both the PDF and the text file
+                        return WebResponseUtils.bytesToWebResponse(
+                                zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
+                    }
+                } else {
+                    // Return the OCR processed PDF as a response
+                    return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
+                }
+
+            } finally {
+                // Clean up sidecar temp file if created
+                if (sidecarTextFile != null) {
+                    try {
+                        sidecarTextFile.close();
+                    } catch (Exception e) {
+                        log.warn("Failed to close sidecar temp file", e);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Runs the {@code ocrmypdf} CLI on {@code tempInputFile} and writes the result to
+     * {@code tempOutputFile}.
+     *
+     * <p>If the first run fails with the "multiprocessing/synchronize.py" / "Errno 38" error
+     * signature, the command is retried once with {@code --jobs 1}. When
+     * {@code removeImagesAfter} is set, the OCR output is additionally passed through Ghostscript
+     * with {@code -dFILTERIMAGE} and the filtered file replaces {@code tempOutputFile}; the OCR
+     * output is left untouched if Ghostscript reports a failure.
+     *
+     * @param selectedLanguages languages, joined with "+" for {@code --language}
+     * @param sidecar adds {@code --sidecar} when true and {@code sidecarTextPath} is non-null
+     * @param deskew adds {@code --deskew} when true
+     * @param clean adds {@code --clean} when true
+     * @param cleanFinal adds {@code --clean-final} when true
+     * @param ocrType {@code "skip-text"} or {@code "force-ocr"}; any other value adds no flag
+     * @param ocrRenderType value passed to {@code --pdf-renderer}
+     * @param removeImagesAfter strips images from the OCR output via Ghostscript when true
+     * @param tempInputFile PDF to process
+     * @param tempOutputFile destination of the processed PDF
+     * @param sidecarTextPath destination of the sidecar text, may be {@code null}
+     * @throws IOException if OCRmyPDF or Ghostscript ends with a non-zero return code, or on
+     *     I/O failure
+     * @throws InterruptedException if propagated by the underlying process execution
+     */
+    private void processWithOcrMyPdf(
+            List<String> selectedLanguages,
+            Boolean sidecar,
+            Boolean deskew,
+            Boolean clean,
+            Boolean cleanFinal,
+            String ocrType,
+            String ocrRenderType,
+            Boolean removeImagesAfter,
+            Path tempInputFile,
+            Path tempOutputFile,
+            Path sidecarTextPath)
+            throws IOException, InterruptedException {
+
+        // Build OCRmyPDF command
+        List<String> command =
+                new ArrayList<>(
+                        Arrays.asList(
+                                "ocrmypdf",
+                                "--verbose",
+                                "2",
+                                "--output-type",
+                                "pdf",
+                                "--pdf-renderer",
+                                ocrRenderType));
+
+        // The flags are nullable Booleans; Boolean.TRUE.equals treats null as "off".
+        if (Boolean.TRUE.equals(sidecar) && sidecarTextPath != null) {
+            command.add("--sidecar");
+            command.add(sidecarTextPath.toString());
+        }
+        if (Boolean.TRUE.equals(deskew)) {
+            command.add("--deskew");
+        }
+        if (Boolean.TRUE.equals(clean)) {
+            command.add("--clean");
+        }
+        if (Boolean.TRUE.equals(cleanFinal)) {
+            command.add("--clean-final");
+        }
+
+        // Any other ocrType (null, empty, "Normal", ...) adds no flag.
+        if ("skip-text".equals(ocrType)) {
+            command.add("--skip-text");
+        } else if ("force-ocr".equals(ocrType)) {
+            command.add("--force-ocr");
+        }
+
+        command.addAll(
+                Arrays.asList(
+                        "--language",
+                        String.join("+", selectedLanguages),
+                        tempInputFile.toString(),
+                        tempOutputFile.toString()));
+
+        // Run CLI command
+        ProcessExecutorResult result =
+                ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
+                        .runCommandWithOutputHandling(command);
+
+        // Retry single-threaded when the multiprocessing primitives are not usable on the host.
+        if (result.getRc() != 0
+                && result.getMessages().contains("multiprocessing/synchronize.py")
+                && result.getMessages().contains("OSError: [Errno 38] Function not implemented")) {
+            command.add("--jobs");
+            command.add("1");
+            result =
+                    ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
+                            .runCommandWithOutputHandling(command);
+        }
+
+        if (result.getRc() != 0) {
+            throw new IOException("OCRmyPDF failed with return code: " + result.getRc());
+        }
+
+        // Remove images from the OCR processed PDF if the flag is set to true
+        if (Boolean.TRUE.equals(removeImagesAfter)) {
+            try (TempFile tempPdfWithoutImages = new TempFile(tempFileManager, "_no_images.pdf")) {
+                List<String> gsCommand =
+                        Arrays.asList(
+                                "gs",
+                                "-sDEVICE=pdfwrite",
+                                "-dFILTERIMAGE",
+                                "-o",
+                                tempPdfWithoutImages.getPath().toString(),
+                                tempOutputFile.toString());
+
+                ProcessExecutorResult gsResult =
+                        ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
+                                .runCommandWithOutputHandling(gsCommand);
+
+                // Never overwrite a good OCR result with the output of a failed Ghostscript run.
+                if (gsResult.getRc() != 0) {
+                    throw new IOException(
+                            "Ghostscript image removal failed with return code: "
+                                    + gsResult.getRc());
+                }
+
+                // Replace output file with version without images
+                Files.copy(
+                        tempPdfWithoutImages.getPath(),
+                        tempOutputFile,
+                        java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+            }
+        }
+    }
+
+    private void processWithTesseract(
+            List<String> selectedLanguages, String ocrType, Path tempInputFile, Path tempOutputFile)
+            throws IOException, InterruptedException {
+
+        // Create temp directory for Tesseract processing
+        try (TempDirectory tempDir = new TempDirectory(tempFileManager)) {
+            File tempOutputDir = new File(tempDir.getPath().toFile(), "output");
+            File tempImagesDir = new File(tempDir.getPath().toFile(), "images");
+            File finalOutputFile = new File(tempDir.getPath().toFile(), "final_output.pdf");

            // Create directories
            tempOutputDir.mkdirs();
            tempImagesDir.mkdirs();

-            // Save input file
-            inputFile.transferTo(tempInputFile);
-
            PDFMergerUtility merger = new PDFMergerUtility();
            merger.setDestinationFileName(finalOutputFile.toString());

-            try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) {
+            try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
                PDFRenderer pdfRenderer = new PDFRenderer(document);
                int pageCount = document.getNumberOfPages();

@ -135,35 +374,20 @@ public class OCRController {
                                new File(tempOutputDir, String.format("page_%d", pageNum))
                                        .toString());
                        command.add("-l");
-                        command.add(String.join("+", languages));
-                        // Always output PDF
-                        command.add("pdf");
+                        command.add(String.join("+", selectedLanguages));
+                        command.add("pdf"); // Always output PDF

-                        // Use ProcessExecutor to run tesseract command
-                        try {
                        ProcessExecutorResult result =
                                ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
                                        .runCommandWithOutputHandling(command);

-                            log.debug(
-                                    "Tesseract OCR completed for page {} with exit code {}",
-                                    pageNum,
-                                    result.getRc());
+                        if (result.getRc() != 0) {
+                            throw new RuntimeException(
+                                    "Tesseract failed with exit code: " + result.getRc());
+                        }

                        // Add OCR'd PDF to merger
                        merger.addSource(pageOutputPath);
-                        } catch (IOException | InterruptedException e) {
-                            log.error(
-                                    "Error processing page {} with tesseract: {}",
-                                    pageNum,
-                                    e.getMessage());
-                            // If OCR fails, fall back to the original page
-                            try (PDDocument pageDoc = new PDDocument()) {
-                                pageDoc.addPage(page);
-                                pageDoc.save(pageOutputPath);
-                                merger.addSource(pageOutputPath);
-                            }
-                        }
                    } else {
                        // Save original page without OCR
                        try (PDDocument pageDoc = new PDDocument()) {
@ -178,40 +402,11 @@ public class OCRController {
            // Merge all pages into final PDF
            merger.mergeDocuments(null);

-            // Read the final PDF file
-            byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath());
-            String outputFilename =
-                    Filenames.toSimpleFileName(inputFile.getOriginalFilename())
-                                    .replaceFirst("[.][^.]+$", "")
-                            + "_OCR.pdf";
-
-            return ResponseEntity.ok()
-                    .header(
-                            "Content-Disposition",
-                            "attachment; filename=\"" + outputFilename + "\"")
-                    .contentType(MediaType.APPLICATION_PDF)
-                    .body(pdfContent);
-        } finally {
-            // Clean up the temp directory and all its contents
-            tempFileManager.deleteTempDirectory(tempDirPath);
-        }
-    }
-
-    private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
-            throws IOException {
-        if (!file.exists()) {
-            log.warn("File {} does not exist, skipping", file);
-            return;
-        }
-        try (FileInputStream fis = new FileInputStream(file)) {
-            ZipEntry zipEntry = new ZipEntry(filename);
-            zipOut.putNextEntry(zipEntry);
-            byte[] buffer = new byte[1024];
-            int length;
-            while ((length = fis.read(buffer)) >= 0) {
-                zipOut.write(buffer, 0, length);
-            }
-            zipOut.closeEntry();
+            // Copy final output to the expected location
+            Files.copy(
+                    finalOutputFile.toPath(),
+                    tempOutputFile,
+                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
        }
    }
 }
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java
@ -15,8 +15,7 @@ import io.github.pixee.security.Filenames;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;

-import lombok.RequiredArgsConstructor;
-
+import stirling.software.SPDF.config.EndpointConfiguration;
 import stirling.software.common.model.api.PDFFile;
 import stirling.software.common.service.CustomPDFDocumentFactory;
 import stirling.software.common.util.ProcessExecutor;
@ -28,17 +27,28 @@ import stirling.software.common.util.WebResponseUtils;
@RestController
@RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs")
-@RequiredArgsConstructor
 public class RepairController {

    private final CustomPDFDocumentFactory pdfDocumentFactory;
    private final TempFileManager tempFileManager;
+    private final boolean ghostscriptEnabled;
+    private final boolean qpdfEnabled;
+
+    /**
+     * Creates the controller and records which repair tool groups are enabled.
+     *
+     * <p>The "Ghostscript" and "qpdf" group flags are read from {@code endpointConfiguration}
+     * once, here, and kept in final fields for the lifetime of the controller.
+     */
+    public RepairController(
+            CustomPDFDocumentFactory pdfDocumentFactory,
+            TempFileManager tempFileManager,
+            EndpointConfiguration endpointConfiguration) {
+        // Resolve tool availability up front (same lookup order as before).
+        final boolean ghostscriptGroupOn = endpointConfiguration.isGroupEnabled("Ghostscript");
+        final boolean qpdfGroupOn = endpointConfiguration.isGroupEnabled("qpdf");
+
+        this.tempFileManager = tempFileManager;
+        this.pdfDocumentFactory = pdfDocumentFactory;
+        this.ghostscriptEnabled = ghostscriptGroupOn;
+        this.qpdfEnabled = qpdfGroupOn;
+    }

    @PostMapping(consumes = "multipart/form-data", value = "/repair")
    @Operation(
            summary = "Repair a PDF file",
            description =
-                    "This endpoint repairs a given PDF file by running qpdf command. The PDF is"
+                    "This endpoint repairs a given PDF file by running Ghostscript (primary), qpdf (fallback), or PDFBox (if no external tools available). The PDF is"
                            + " first saved to a temporary location, repaired, read back, and then"
                            + " returned as a response. Input:PDF Output:PDF Type:SISO")
    public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile file)
@ -46,25 +56,72 @@ public class RepairController {
        MultipartFile inputFile = file.getFileInput();

        // Use TempFile with try-with-resources for automatic cleanup
-        try (TempFile tempFile = new TempFile(tempFileManager, ".pdf")) {
+        try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
+                TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
+
            // Save the uploaded file to the temporary location
-            inputFile.transferTo(tempFile.getFile());
+            inputFile.transferTo(tempInputFile.getFile());

-            List<String> command = new ArrayList<>();
-            command.add("qpdf");
-            command.add("--replace-input"); // Automatically fixes problems it can
-            command.add("--qdf"); // Linearizes and normalizes PDF structure
-            command.add("--object-streams=disable"); // Can help with some corruptions
-            command.add(tempFile.getFile().getAbsolutePath());
+            boolean repairSuccess = false;

-            ProcessExecutorResult returnCode =
+            // Try Ghostscript first if available
+            if (ghostscriptEnabled) {
+                try {
+                    List<String> gsCommand = new ArrayList<>();
+                    gsCommand.add("gs");
+                    gsCommand.add("-o");
+                    gsCommand.add(tempOutputFile.getPath().toString());
+                    gsCommand.add("-sDEVICE=pdfwrite");
+                    gsCommand.add(tempInputFile.getPath().toString());
+
+                    ProcessExecutorResult gsResult =
+                            ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
+                                    .runCommandWithOutputHandling(gsCommand);
+
+                    if (gsResult.getRc() == 0) {
+                        repairSuccess = true;
+                    }
+                } catch (Exception e) {
+                    // Log and continue to QPDF fallback
+                    System.out.println(
+                            "Ghostscript repair failed, trying QPDF fallback: " + e.getMessage());
+                }
+            }
+
+            // Fallback to QPDF if Ghostscript failed or not available
+            if (!repairSuccess && qpdfEnabled) {
+                List<String> qpdfCommand = new ArrayList<>();
+                qpdfCommand.add("qpdf");
+                qpdfCommand.add("--replace-input"); // Automatically fixes problems it can
+                qpdfCommand.add("--qdf"); // Linearizes and normalizes PDF structure
+                qpdfCommand.add("--object-streams=disable"); // Can help with some corruptions
+                qpdfCommand.add(tempInputFile.getPath().toString());
+                qpdfCommand.add(tempOutputFile.getPath().toString());
+
+                ProcessExecutorResult qpdfResult =
                        ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
-                            .runCommandWithOutputHandling(command);
+                                .runCommandWithOutputHandling(qpdfCommand);

-            // Read the optimized PDF file
-            byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempFile.getFile());
+                repairSuccess = true;
+            }

-            // Return the optimized PDF as a response
+            // Use PDFBox as last resort if no external tools are available
+            if (!repairSuccess) {
+                if (!ghostscriptEnabled && !qpdfEnabled) {
+                    // Basic PDFBox repair - load and save to fix structural issues
+                    try (var document = pdfDocumentFactory.load(tempInputFile.getFile())) {
+                        document.save(tempOutputFile.getFile());
+                        repairSuccess = true;
+                    }
+                } else {
+                    throw new IOException("PDF repair failed with available tools");
+                }
+            }
+
+            // Read the repaired PDF file
+            byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.getFile());
+
+            // Return the repaired PDF as a response
            String outputFilename =
                    Filenames.toSimpleFileName(inputFile.getOriginalFilename())
                                    .replaceFirst("[.][^.]+$", "")
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java
@ -19,6 +19,18 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
            defaultValue = "[\"eng\"]")
    private List<String> languages;

+    @Schema(description = "Include OCR text in a sidecar text file if set to true")
+    private boolean sidecar;
+
+    @Schema(description = "Deskew the input file if set to true")
+    private boolean deskew;
+
+    @Schema(description = "Clean the input file if set to true")
+    private boolean clean;
+
+    @Schema(description = "Clean the final output if set to true")
+    private boolean cleanFinal;
+
    @Schema(
            description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'",
            requiredMode = Schema.RequiredMode.REQUIRED,
@ -31,4 +43,7 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
            allowableValues = {"hocr", "sandwich"},
            defaultValue = "hocr")
    private String ocrRenderType = "hocr";
+
+    @Schema(description = "Remove images from the output PDF if set to true")
+    private boolean removeImagesAfter;
 }
--- a/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html
+++ b/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html
@ -79,6 +79,30 @@
                  </select>
                </div>
                <br>
+                <div class="mb-3" th:if>
+                  <label class="form-label">OCR Options</label>
+                  <div class="form-check">
+                    <input type="checkbox" class="form-check-input" id="sidecar" name="sidecar" value="true">
+                    <label class="form-check-label" for="sidecar">Include OCR text in sidecar text file</label>
+                  </div>
+                  <div class="form-check">
+                    <input type="checkbox" class="form-check-input" id="deskew" name="deskew" value="true">
+                    <label class="form-check-label" for="deskew">Deskew input file</label>
+                  </div>
+                  <div class="form-check">
+                    <input type="checkbox" class="form-check-input" id="clean" name="clean" value="true">
+                    <label class="form-check-label" for="clean">Clean input file</label>
+                  </div>
+                  <div class="form-check">
+                    <input type="checkbox" class="form-check-input" id="cleanFinal" name="cleanFinal" value="true">
+                    <label class="form-check-label" for="cleanFinal">Clean final output</label>
+                  </div>
+                  <div class="form-check">
+                    <input type="checkbox" class="form-check-input" id="removeImagesAfter" name="removeImagesAfter" value="true">
+                    <label class="form-check-label" for="removeImagesAfter">Remove images from output PDF</label>
+                  </div>
+                </div>
+                <br>
                <button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{ocr.submit}"></button>
              </form>
              <script th:inline="javascript">