diff --git a/Dockerfile b/Dockerfile index fd02b29f7..1edf05841 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,7 +51,6 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a tini \ bash \ curl \ - qpdf \ shadow \ su-exec \ openssl \ @@ -69,9 +68,11 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a tesseract-ocr-data-deu \ tesseract-ocr-data-fra \ tesseract-ocr-data-por \ + unpaper \ # CV py3-opencv \ python3 \ + ocrmypdf \ py3-pip \ py3-pillow@testing \ py3-pdf2image@testing && \ diff --git a/Dockerfile.fat b/Dockerfile.fat index 666ba98be..976c1ee17 100644 --- a/Dockerfile.fat +++ b/Dockerfile.fat @@ -76,16 +76,17 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a # pdftohtml poppler-utils \ # OCR MY PDF (unpaper for descew and other advanced featues) - qpdf \ tesseract-ocr-data-eng \ tesseract-ocr-data-chi_sim \ tesseract-ocr-data-deu \ tesseract-ocr-data-fra \ tesseract-ocr-data-por \ + unpaper \ font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \ # CV py3-opencv \ python3 \ + ocrmypdf \ py3-pip \ py3-pillow@testing \ py3-pdf2image@testing && \ diff --git a/common/src/main/java/stirling/software/common/model/ApplicationProperties.java b/common/src/main/java/stirling/software/common/model/ApplicationProperties.java index 0017fa34a..e4edf2baa 100644 --- a/common/src/main/java/stirling/software/common/model/ApplicationProperties.java +++ b/common/src/main/java/stirling/software/common/model/ApplicationProperties.java @@ -545,6 +545,8 @@ public class ApplicationProperties { private int calibreSessionLimit; private int qpdfSessionLimit; private int tesseractSessionLimit; + private int ghostscriptSessionLimit; + private int ocrMyPdfSessionLimit; public int getQpdfSessionLimit() { return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2; @@ -577,6 +579,14 @@ public class ApplicationProperties { public int getCalibreSessionLimit() { return calibreSessionLimit > 0 ? calibreSessionLimit : 1; } + + public int getGhostscriptSessionLimit() { + return ghostscriptSessionLimit > 0 ? ghostscriptSessionLimit : 8; + } + + public int getOcrMyPdfSessionLimit() { + return ocrMyPdfSessionLimit > 0 ? ocrMyPdfSessionLimit : 2; + } } @Data @@ -589,6 +599,8 @@ public class ApplicationProperties { private long calibreTimeoutMinutes; private long tesseractTimeoutMinutes; private long qpdfTimeoutMinutes; + private long ghostscriptTimeoutMinutes; + private long ocrMyPdfTimeoutMinutes; public long getTesseractTimeoutMinutes() { return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30; @@ -621,6 +633,14 @@ public class ApplicationProperties { public long getCalibreTimeoutMinutes() { return calibreTimeoutMinutes > 0 ? calibreTimeoutMinutes : 30; } + + public long getGhostscriptTimeoutMinutes() { + return ghostscriptTimeoutMinutes > 0 ? ghostscriptTimeoutMinutes : 30; + } + + public long getOcrMyPdfTimeoutMinutes() { + return ocrMyPdfTimeoutMinutes > 0 ? ocrMyPdfTimeoutMinutes : 30; + } } } } diff --git a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java index d53c4ea84..895aa70de 100644 --- a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java +++ b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java @@ -308,7 +308,7 @@ public class TempFileCleanupService { } java.util.List subdirectories = new java.util.ArrayList<>(); - + try (Stream pathStream = Files.list(directory)) { pathStream.forEach( path -> { @@ -347,7 +347,7 @@ public class TempFileCleanupService { } }); } - + for (Path subdirectory : subdirectories) { try { cleanupDirectoryStreaming( diff --git a/common/src/main/java/stirling/software/common/util/ProcessExecutor.java b/common/src/main/java/stirling/software/common/util/ProcessExecutor.java index 09c5ff675..ee7297153 100644 --- a/common/src/main/java/stirling/software/common/util/ProcessExecutor.java +++ b/common/src/main/java/stirling/software/common/util/ProcessExecutor.java @@ -84,6 +84,16 @@ public class ProcessExecutor { .getProcessExecutor() .getSessionLimit() .getCalibreSessionLimit(); + case GHOSTSCRIPT -> + applicationProperties + .getProcessExecutor() + .getSessionLimit() + .getGhostscriptSessionLimit(); + case OCR_MY_PDF -> + applicationProperties + .getProcessExecutor() + .getSessionLimit() + .getOcrMyPdfSessionLimit(); }; long timeoutMinutes = @@ -128,6 +138,16 @@ public class ProcessExecutor { .getProcessExecutor() .getTimeoutMinutes() .getCalibreTimeoutMinutes(); + case GHOSTSCRIPT -> + applicationProperties + .getProcessExecutor() + .getTimeoutMinutes() + .getGhostscriptTimeoutMinutes(); + case OCR_MY_PDF -> + applicationProperties + .getProcessExecutor() + .getTimeoutMinutes() + .getOcrMyPdfTimeoutMinutes(); }; return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes); }); @@ -278,7 +298,9 @@ public class ProcessExecutor { INSTALL_APP, CALIBRE, TESSERACT, - QPDF + QPDF, + GHOSTSCRIPT, + OCR_MY_PDF } public class ProcessExecutorResult { diff --git a/common/src/main/java/stirling/software/common/util/TempFileRegistry.java b/common/src/main/java/stirling/software/common/util/TempFileRegistry.java index 1e55c6b15..323b3bff3 100644 --- a/common/src/main/java/stirling/software/common/util/TempFileRegistry.java +++ b/common/src/main/java/stirling/software/common/util/TempFileRegistry.java @@ -9,7 +9,6 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.ConcurrentSkipListSet; import java.util.stream.Collectors; import org.springframework.stereotype.Component; @@ -24,11 +23,10 @@ import lombok.extern.slf4j.Slf4j; @Component public class TempFileRegistry { - private final ConcurrentMap registeredFiles = new ConcurrentHashMap<>(); - private final Set thirdPartyTempFiles = - Collections.newSetFromMap(new ConcurrentHashMap<>()); - private final Set tempDirectories = - Collections.newSetFromMap(new ConcurrentHashMap<>()); + private final ConcurrentMap registeredFiles = new ConcurrentHashMap<>(); + private final Set thirdPartyTempFiles = + Collections.newSetFromMap(new ConcurrentHashMap<>()); + private final Set tempDirectories = Collections.newSetFromMap(new ConcurrentHashMap<>()); /** * Register a temporary file with the registry. diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java b/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java index 361eeace3..83ff4d39f 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java @@ -21,6 +21,8 @@ public class EndpointConfiguration { private final ApplicationProperties applicationProperties; private Map endpointStatuses = new ConcurrentHashMap<>(); private Map> endpointGroups = new ConcurrentHashMap<>(); + private Set disabledGroups = new HashSet<>(); + private Map> endpointAlternatives = new ConcurrentHashMap<>(); private final boolean runningProOrHigher; public EndpointConfiguration( @@ -34,13 +36,14 @@ public class EndpointConfiguration { public void enableEndpoint(String endpoint) { endpointStatuses.put(endpoint, true); + log.debug("Enabled endpoint: {}", endpoint); } public void disableEndpoint(String endpoint) { - if (!endpointStatuses.containsKey(endpoint) || endpointStatuses.get(endpoint) != false) { - log.debug("Disabling {}", endpoint); - endpointStatuses.put(endpoint, false); + if (!Boolean.FALSE.equals(endpointStatuses.get(endpoint))) { + log.debug("Disabling endpoint: {}", endpoint); } + endpointStatuses.put(endpoint, false); } public Map getEndpointStatuses() { @@ -48,25 +51,82 @@ public class EndpointConfiguration { } public boolean isEndpointEnabled(String endpoint) { + String original = endpoint; if (endpoint.startsWith("/")) { endpoint = endpoint.substring(1); } - return endpointStatuses.getOrDefault(endpoint, true); - } - public boolean isGroupEnabled(String group) { - Set endpoints = endpointGroups.get(group); - if (endpoints == null || endpoints.isEmpty()) { - log.debug("Group '{}' does not exist or has no endpoints", group); + // Rule 1: Explicit flag wins - if disabled via disableEndpoint(), stay disabled + Boolean explicitStatus = endpointStatuses.get(endpoint); + if (Boolean.FALSE.equals(explicitStatus)) { + log.debug("isEndpointEnabled('{}') -> false (explicitly disabled)", original); return false; } + // Rule 2: Functional-group override - check if endpoint belongs to any disabled functional + // group + for (String group : endpointGroups.keySet()) { + if (disabledGroups.contains(group) && endpointGroups.get(group).contains(endpoint)) { + // Skip tool groups (qpdf, OCRmyPDF, Ghostscript, LibreOffice, etc.) + if (!isToolGroup(group)) { + log.debug( + "isEndpointEnabled('{}') -> false (functional group '{}' disabled)", + original, + group); + return false; + } + } + } + + // Rule 3: Tool-group fallback - check if at least one alternative tool group is enabled + Set alternatives = endpointAlternatives.get(endpoint); + if (alternatives != null && !alternatives.isEmpty()) { + boolean hasEnabledToolGroup = + alternatives.stream() + .anyMatch(toolGroup -> !disabledGroups.contains(toolGroup)); + log.debug( + "isEndpointEnabled('{}') -> {} (tool groups check)", + original, + hasEnabledToolGroup); + return hasEnabledToolGroup; + } + + // Default: enabled if not explicitly disabled + boolean enabled = !Boolean.FALSE.equals(explicitStatus); + log.debug("isEndpointEnabled('{}') -> {} (default)", original, enabled); + return enabled; + } + + public boolean isGroupEnabled(String group) { + // Rule 1: If group is explicitly disabled, it stays disabled + if (disabledGroups.contains(group)) { + log.debug("isGroupEnabled('{}') -> false (explicitly disabled)", group); + return false; + } + + Set endpoints = endpointGroups.get(group); + if (endpoints == null || endpoints.isEmpty()) { + log.debug("isGroupEnabled('{}') -> false (no endpoints)", group); + return false; + } + + // Rule 2: For functional groups, check if all endpoints are enabled + // Rule 3: For tool groups, they're enabled unless explicitly disabled (handled above) + if (isToolGroup(group)) { + log.debug("isGroupEnabled('{}') -> true (tool group not disabled)", group); + return true; + } + + // For functional groups, check each endpoint individually for (String endpoint : endpoints) { - if (!isEndpointEnabled(endpoint)) { + if (!isEndpointEnabledDirectly(endpoint)) { + log.debug( + "isGroupEnabled('{}') -> false (endpoint '{}' disabled)", group, endpoint); return false; } } + log.debug("isGroupEnabled('{}') -> true (all endpoints enabled)", group); return true; } @@ -74,33 +134,48 @@ public class EndpointConfiguration { endpointGroups.computeIfAbsent(group, k -> new HashSet<>()).add(endpoint); } - public void enableGroup(String group) { - Set endpoints = endpointGroups.get(group); - if (endpoints != null) { - for (String endpoint : endpoints) { - enableEndpoint(endpoint); - } - } + public void addEndpointAlternative(String endpoint, String toolGroup) { + endpointAlternatives.computeIfAbsent(endpoint, k -> new HashSet<>()).add(toolGroup); } public void disableGroup(String group) { + if (disabledGroups.add(group)) { + log.debug("Disabling group: {}", group); + } Set endpoints = endpointGroups.get(group); if (endpoints != null) { - for (String endpoint : endpoints) { - disableEndpoint(endpoint); - } + endpoints.forEach(this::disableEndpoint); } } + public void enableGroup(String group) { + if (disabledGroups.remove(group)) { + log.debug("Enabling group: {}", group); + } + Set endpoints = endpointGroups.get(group); + if (endpoints != null) { + endpoints.forEach(this::enableEndpoint); + } + } + + public Set getDisabledGroups() { + return new HashSet<>(disabledGroups); + } + public void logDisabledEndpointsSummary() { List disabledList = endpointStatuses.entrySet().stream() - .filter(entry -> !entry.getValue()) // only get disabled endpoints (value - // is false) + .filter(entry -> Boolean.FALSE.equals(entry.getValue())) .map(Map.Entry::getKey) .sorted() .toList(); + if (!disabledGroups.isEmpty()) { + log.info( + "Disabled groups: {}", + String.join(", ", disabledGroups.stream().sorted().toList())); + } + if (!disabledList.isEmpty()) { log.info( "Total disabled endpoints: {}. Disabled endpoints: {}", @@ -212,8 +287,6 @@ public class EndpointConfiguration { // Unoconvert addEndpointToGroup("Unoconvert", "file-to-pdf"); - addEndpointToGroup("tesseract", "ocr-pdf"); - // Java addEndpointToGroup("Java", "merge-pdfs"); addEndpointToGroup("Java", "remove-pages"); @@ -261,8 +334,27 @@ public class EndpointConfiguration { addEndpointToGroup("Javascript", "compare"); addEndpointToGroup("Javascript", "adjust-contrast"); - // qpdf dependent endpoints + /* qpdf */ addEndpointToGroup("qpdf", "repair"); + addEndpointToGroup("qpdf", "compress-pdf"); + + /* Ghostscript */ + addEndpointToGroup("Ghostscript", "repair"); + addEndpointToGroup("Ghostscript", "compress-pdf"); + + /* tesseract */ + addEndpointToGroup("tesseract", "ocr-pdf"); + + /* OCRmyPDF */ + addEndpointToGroup("OCRmyPDF", "ocr-pdf"); + + // Multi-tool endpoints - endpoints that can be handled by multiple tools + addEndpointAlternative("repair", "qpdf"); + addEndpointAlternative("repair", "Ghostscript"); + addEndpointAlternative("compress-pdf", "qpdf"); + addEndpointAlternative("compress-pdf", "Ghostscript"); + addEndpointAlternative("ocr-pdf", "tesseract"); + addEndpointAlternative("ocr-pdf", "OCRmyPDF"); // Weasyprint dependent endpoints addEndpointToGroup("Weasyprint", "html-to-pdf"); @@ -304,4 +396,43 @@ public class EndpointConfiguration { public Set getEndpointsForGroup(String group) { return endpointGroups.getOrDefault(group, new HashSet<>()); } + + private boolean isToolGroup(String group) { + return "qpdf".equals(group) + || "OCRmyPDF".equals(group) + || "Ghostscript".equals(group) + || "LibreOffice".equals(group) + || "tesseract".equals(group) + || "CLI".equals(group) + || "Python".equals(group) + || "OpenCV".equals(group) + || "Unoconvert".equals(group) + || "Java".equals(group) + || "Javascript".equals(group) + || "Weasyprint".equals(group) + || "Pdftohtml".equals(group); + } + + private boolean isEndpointEnabledDirectly(String endpoint) { + if (endpoint.startsWith("/")) { + endpoint = endpoint.substring(1); + } + + // Check explicit disable flag + Boolean explicitStatus = endpointStatuses.get(endpoint); + if (Boolean.FALSE.equals(explicitStatus)) { + return false; + } + + // Check if endpoint belongs to any disabled functional group + for (String group : endpointGroups.keySet()) { + if (disabledGroups.contains(group) && endpointGroups.get(group).contains(endpoint)) { + if (!isToolGroup(group)) { + return false; + } + } + } + + return true; + } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java b/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java index 7dd6d2b3b..6d857c679 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java @@ -34,6 +34,8 @@ public class ExternalAppDepConfig { new HashMap<>() { { + put("gs", List.of("Ghostscript")); + put("ocrmypdf", List.of("OCRmyPDF")); put("soffice", List.of("LibreOffice")); put(weasyprintPath, List.of("Weasyprint")); put("pdftohtml", List.of("Pdftohtml")); @@ -109,6 +111,8 @@ public class ExternalAppDepConfig { @PostConstruct public void checkDependencies() { // Check core dependencies + checkDependencyAndDisableGroup("gs"); + checkDependencyAndDisableGroup("ocrmypdf"); checkDependencyAndDisableGroup("tesseract"); checkDependencyAndDisableGroup("soffice"); checkDependencyAndDisableGroup("qpdf"); diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java index 8509f5056..b70edecd1 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java @@ -47,6 +47,7 @@ import lombok.AllArgsConstructor; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; +import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.config.EndpointConfiguration; @@ -61,16 +62,18 @@ import stirling.software.common.util.WebResponseUtils; @RequestMapping("/api/v1/misc") @Slf4j @Tag(name = "Misc", description = "Miscellaneous APIs") +@RequiredArgsConstructor public class CompressController { private final CustomPDFDocumentFactory pdfDocumentFactory; - private final boolean qpdfEnabled; + private final EndpointConfiguration endpointConfiguration; - public CompressController( - CustomPDFDocumentFactory pdfDocumentFactory, - EndpointConfiguration endpointConfiguration) { - this.pdfDocumentFactory = pdfDocumentFactory; - this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf"); + private boolean isQpdfEnabled() { + return endpointConfiguration.isGroupEnabled("qpdf"); + } + + private boolean isGhostscriptEnabled() { + return endpointConfiguration.isGroupEnabled("Ghostscript"); } @Data @@ -697,25 +700,69 @@ public class CompressController { boolean sizeMet = false; boolean imageCompressionApplied = false; - boolean qpdfCompressionApplied = false; - - if (qpdfEnabled && optimizeLevel <= 3) { - optimizeLevel = 4; - } + boolean externalCompressionApplied = false; while (!sizeMet && optimizeLevel <= 9) { - // Apply image compression for levels 4-9 - if ((optimizeLevel >= 3 || Boolean.TRUE.equals(convertToGrayscale)) - && !imageCompressionApplied) { - double scaleFactor = getScaleFactorForLevel(optimizeLevel); - float jpegQuality = getJpegQualityForLevel(optimizeLevel); + // Apply external compression first + if (!externalCompressionApplied) { + boolean ghostscriptSuccess = false; - // Compress images + // Try Ghostscript first if available - for ANY compression level + if (isGhostscriptEnabled()) { + try { + applyGhostscriptCompression( + request, optimizeLevel, currentFile, tempFiles); + log.info("Ghostscript compression applied successfully"); + ghostscriptSuccess = true; + } catch (IOException e) { + log.warn("Ghostscript compression failed, trying fallback methods"); + } + } + + // Fallback to QPDF if Ghostscript failed or not available (levels 1-3 only) + if (!ghostscriptSuccess && isQpdfEnabled() && optimizeLevel <= 3) { + try { + applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles); + log.info("QPDF compression applied successfully"); + } catch (IOException e) { + log.warn("QPDF compression also failed"); + } + } + + if (!ghostscriptSuccess && !isQpdfEnabled()) { + log.info( + "No external compression tools available, using image compression only"); + } + + externalCompressionApplied = true; + + // Skip image compression if Ghostscript succeeded + if (ghostscriptSuccess) { + imageCompressionApplied = true; + } + } + + // Apply image compression for levels 4+ only if Ghostscript didn't run + if ((optimizeLevel >= 4 || Boolean.TRUE.equals(convertToGrayscale)) + && !imageCompressionApplied) { + // Use different scale factors based on level + double scaleFactor = + switch (optimizeLevel) { + case 4 -> 0.95; // 95% of original size + case 5 -> 0.9; // 90% of original size + case 6 -> 0.8; // 80% of original size + case 7 -> 0.7; // 70% of original size + case 8 -> 0.65; // 65% of original size + case 9 -> 0.5; // 50% of original size + default -> 1.0; + }; + + log.info("Applying image compression with scale factor: {}", scaleFactor); Path compressedImageFile = compressImagesInPDF( currentFile, scaleFactor, - jpegQuality, + 0.7f, // Default JPEG quality Boolean.TRUE.equals(convertToGrayscale)); tempFiles.add(compressedImageFile); @@ -723,18 +770,6 @@ public class CompressController { imageCompressionApplied = true; } - // Apply QPDF compression for all levels - if (!qpdfCompressionApplied && qpdfEnabled) { - applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles); - qpdfCompressionApplied = true; - } else if (!qpdfCompressionApplied) { - // If QPDF is disabled, mark as applied and log - if (!qpdfEnabled) { - log.info("Skipping QPDF compression as QPDF group is disabled"); - } - qpdfCompressionApplied = true; - } - // Check if target size reached or not in auto mode long outputFileSize = Files.size(currentFile); if (outputFileSize <= expectedOutputSize || !autoMode) { @@ -754,7 +789,7 @@ public class CompressController { } else { // Reset flags for next iteration with higher optimization level imageCompressionApplied = false; - qpdfCompressionApplied = false; + externalCompressionApplied = false; optimizeLevel = newOptimizeLevel; } } @@ -788,6 +823,96 @@ public class CompressController { } } + // Run Ghostscript compression + private void applyGhostscriptCompression( + OptimizePdfRequest request, int optimizeLevel, Path currentFile, List tempFiles) + throws IOException { + + long preGsSize = Files.size(currentFile); + log.info("Pre-Ghostscript file size: {}", GeneralUtils.formatBytes(preGsSize)); + + // Create output file for Ghostscript + Path gsOutputFile = Files.createTempFile("gs_output_", ".pdf"); + tempFiles.add(gsOutputFile); + + // Build Ghostscript command based on optimization level + List command = new ArrayList<>(); + command.add("gs"); + command.add("-sDEVICE=pdfwrite"); + command.add("-dCompatibilityLevel=1.5"); + command.add("-dNOPAUSE"); + command.add("-dQUIET"); + command.add("-dBATCH"); + + // Map optimization levels to Ghostscript settings + switch (optimizeLevel) { + case 1: + command.add("-dPDFSETTINGS=/prepress"); + break; + case 2: + command.add("-dPDFSETTINGS=/printer"); + break; + case 3: + command.add("-dPDFSETTINGS=/ebook"); + break; + case 4: + case 5: + command.add("-dPDFSETTINGS=/screen"); + break; + case 6: + case 7: + command.add("-dPDFSETTINGS=/screen"); + command.add("-dColorImageResolution=150"); + command.add("-dGrayImageResolution=150"); + command.add("-dMonoImageResolution=300"); + break; + case 8: + case 9: + command.add("-dPDFSETTINGS=/screen"); + command.add("-dColorImageResolution=100"); + command.add("-dGrayImageResolution=100"); + command.add("-dMonoImageResolution=200"); + break; + case 10: + command.add("-dPDFSETTINGS=/screen"); + command.add("-dColorImageResolution=72"); + command.add("-dGrayImageResolution=72"); + command.add("-dMonoImageResolution=150"); + break; + default: + command.add("-dPDFSETTINGS=/screen"); + break; + } + + command.add("-sOutputFile=" + gsOutputFile.toString()); + command.add(currentFile.toString()); + + ProcessExecutorResult returnCode = null; + try { + returnCode = + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(command); + + if (returnCode.getRc() == 0) { + // Update current file to the Ghostscript output + Files.copy(gsOutputFile, currentFile, StandardCopyOption.REPLACE_EXISTING); + + long postGsSize = Files.size(currentFile); + double gsReduction = 100.0 - ((postGsSize * 100.0) / preGsSize); + log.info( + "Post-Ghostscript file size: {} (reduced by {}%)", + GeneralUtils.formatBytes(postGsSize), String.format("%.1f", gsReduction)); + } else { + log.warn("Ghostscript compression failed with return code: {}", returnCode.getRc()); + throw new IOException("Ghostscript compression failed"); + } + + } catch (Exception e) { + log.warn("Ghostscript compression failed, will fallback to other methods", e); + throw new IOException("Ghostscript compression failed", e); + } + } + // Run QPDF compression private void applyQpdfCompression( OptimizePdfRequest request, int optimizeLevel, Path currentFile, List tempFiles) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java index 93061b570..57367d921 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java @@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.misc; import java.awt.image.BufferedImage; import java.io.*; +import java.nio.file.Files; import java.nio.file.Path; import java.util.*; import java.util.zip.ZipEntry; @@ -29,12 +30,16 @@ import io.swagger.v3.oas.annotations.tags.Tag; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import stirling.software.SPDF.config.EndpointConfiguration; import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest; import stirling.software.common.model.ApplicationProperties; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.ProcessExecutor; import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult; +import stirling.software.common.util.TempDirectory; +import stirling.software.common.util.TempFile; import stirling.software.common.util.TempFileManager; +import stirling.software.common.util.WebResponseUtils; @RestController @RequestMapping("/api/v1/misc") @@ -46,6 +51,15 @@ public class OCRController { private final ApplicationProperties applicationProperties; private final CustomPDFDocumentFactory pdfDocumentFactory; private final TempFileManager tempFileManager; + private final EndpointConfiguration endpointConfiguration; + + private boolean isOcrMyPdfEnabled() { + return endpointConfiguration.isGroupEnabled("OCRmyPDF"); + } + + private boolean isTesseractEnabled() { + return endpointConfiguration.isGroupEnabled("tesseract"); + } /** Gets the list of available Tesseract languages from the tessdata directory */ public List getAvailableTesseractLanguages() { @@ -63,39 +77,261 @@ public class OCRController { @PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf") @Operation( - summary = "Process PDF files with OCR using Tesseract", + summary = "Process a PDF file with OCR", description = - "Takes a PDF file as input, performs OCR using specified languages and OCR type" - + " (skip-text/force-ocr), and returns the processed PDF. Input:PDF" - + " Output:PDF Type:SISO") + "This endpoint processes a PDF file using OCR (Optical Character Recognition). " + + "Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. " + + "Uses OCRmyPDF if available, falls back to Tesseract. Input:PDF Output:PDF Type:SI-Conditional") public ResponseEntity processPdfWithOCR( @ModelAttribute ProcessPdfWithOcrRequest request) throws IOException, InterruptedException { MultipartFile inputFile = request.getFileInput(); - List languages = request.getLanguages(); + List selectedLanguages = request.getLanguages(); + Boolean sidecar = request.isSidecar(); + Boolean deskew = request.isDeskew(); + Boolean clean = request.isClean(); + Boolean cleanFinal = request.isCleanFinal(); String ocrType = request.getOcrType(); + String ocrRenderType = request.getOcrRenderType(); + Boolean removeImagesAfter = request.isRemoveImagesAfter(); - // Create a temp directory using TempFileManager directly - Path tempDirPath = tempFileManager.createTempDirectory(); - File tempDir = tempDirPath.toFile(); + if (selectedLanguages == null || selectedLanguages.isEmpty()) { + throw new IOException("Please select at least one language."); + } - try { - File tempInputFile = new File(tempDir, "input.pdf"); - File tempOutputDir = new File(tempDir, "output"); - File tempImagesDir = new File(tempDir, "images"); - File finalOutputFile = new File(tempDir, "final_output.pdf"); + if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) { + throw new IOException("ocrRenderType wrong"); + } + + // Get available Tesseract languages + List availableLanguages = getAvailableTesseractLanguages(); + + // Validate selected languages + selectedLanguages = + selectedLanguages.stream().filter(availableLanguages::contains).toList(); + + if (selectedLanguages.isEmpty()) { + throw new IOException("None of the selected languages are valid."); + } + + // Use try-with-resources for proper temp file management + try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf"); + TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) { + + inputFile.transferTo(tempInputFile.getFile()); + + TempFile sidecarTextFile = null; + + try { + // Use OCRmyPDF if available (no fallback - error if it fails) + if (isOcrMyPdfEnabled()) { + if (sidecar != null && sidecar) { + sidecarTextFile = new TempFile(tempFileManager, ".txt"); + } + + processWithOcrMyPdf( + selectedLanguages, + sidecar, + deskew, + clean, + cleanFinal, + ocrType, + ocrRenderType, + removeImagesAfter, + tempInputFile.getPath(), + tempOutputFile.getPath(), + sidecarTextFile != null ? sidecarTextFile.getPath() : null); + log.info("OCRmyPDF processing completed successfully"); + } + // Use Tesseract only if OCRmyPDF is not available + else if (isTesseractEnabled()) { + processWithTesseract( + selectedLanguages, + ocrType, + tempInputFile.getPath(), + tempOutputFile.getPath()); + log.info("Tesseract processing completed successfully"); + } else { + throw new IOException("No OCR tools are available"); + } + + // Read the processed PDF file + byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath()); + + // Return the OCR processed PDF as a response + String outputFilename = + Filenames.toSimpleFileName(inputFile.getOriginalFilename()) + .replaceFirst("[.][^.]+$", "") + + "_OCR.pdf"; + + if (sidecar != null && sidecar && sidecarTextFile != null) { + // Create a zip file containing both the PDF and the text file + String outputZipFilename = + Filenames.toSimpleFileName(inputFile.getOriginalFilename()) + .replaceFirst("[.][^.]+$", "") + + "_OCR.zip"; + + try (TempFile tempZipFile = new TempFile(tempFileManager, ".zip"); + ZipOutputStream zipOut = + new ZipOutputStream( + Files.newOutputStream(tempZipFile.getPath()))) { + + // Add PDF file to the zip + ZipEntry pdfEntry = new ZipEntry(outputFilename); + zipOut.putNextEntry(pdfEntry); + zipOut.write(pdfBytes); + zipOut.closeEntry(); + + // Add text file to the zip + ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt")); + zipOut.putNextEntry(txtEntry); + Files.copy(sidecarTextFile.getPath(), zipOut); + zipOut.closeEntry(); + + zipOut.finish(); + + byte[] zipBytes = Files.readAllBytes(tempZipFile.getPath()); + + // Return the zip file containing both the PDF and the text file + return WebResponseUtils.bytesToWebResponse( + zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM); + } + } else { + // Return the OCR processed PDF as a response + return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename); + } + + } finally { + // Clean up sidecar temp file if created + if (sidecarTextFile != null) { + try { + sidecarTextFile.close(); + } catch (Exception e) { + log.warn("Failed to close sidecar temp file", e); + } + } + } + } + } + + private void processWithOcrMyPdf( + List selectedLanguages, + Boolean sidecar, + Boolean deskew, + Boolean clean, + Boolean cleanFinal, + String ocrType, + String ocrRenderType, + Boolean removeImagesAfter, + Path tempInputFile, + Path tempOutputFile, + Path sidecarTextPath) + throws IOException, InterruptedException { + + // Build OCRmyPDF command + String languageOption = String.join("+", selectedLanguages); + + List command = + new ArrayList<>( + Arrays.asList( + "ocrmypdf", + "--verbose", + "2", + "--output-type", + "pdf", + "--pdf-renderer", + ocrRenderType)); + + if (sidecar != null && sidecar && sidecarTextPath != null) { + command.add("--sidecar"); + command.add(sidecarTextPath.toString()); + } + + if (deskew != null && deskew) { + command.add("--deskew"); + } + if (clean != null && clean) { + command.add("--clean"); + } + if (cleanFinal != null && cleanFinal) { + command.add("--clean-final"); + } + if (ocrType != null && !"".equals(ocrType)) { + if ("skip-text".equals(ocrType)) { + command.add("--skip-text"); + } else if ("force-ocr".equals(ocrType)) { + command.add("--force-ocr"); + } + } + + command.addAll( + Arrays.asList( + "--language", + languageOption, + tempInputFile.toString(), + tempOutputFile.toString())); + + // Run CLI command + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF) + .runCommandWithOutputHandling(command); + + if (result.getRc() != 0 + && result.getMessages().contains("multiprocessing/synchronize.py") + && result.getMessages().contains("OSError: [Errno 38] Function not implemented")) { + command.add("--jobs"); + command.add("1"); + result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF) + .runCommandWithOutputHandling(command); + } + + if (result.getRc() != 0) { + throw new IOException("OCRmyPDF failed with return code: " + result.getRc()); + } + + // Remove images from the OCR processed PDF if the flag is set to true + if (removeImagesAfter != null && removeImagesAfter) { + try (TempFile tempPdfWithoutImages = new TempFile(tempFileManager, "_no_images.pdf")) { + List gsCommand = + Arrays.asList( + "gs", + "-sDEVICE=pdfwrite", + "-dFILTERIMAGE", + "-o", + tempPdfWithoutImages.getPath().toString(), + tempOutputFile.toString()); + + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(gsCommand); + + // Replace output file with version without images + Files.copy( + tempPdfWithoutImages.getPath(), + tempOutputFile, + java.nio.file.StandardCopyOption.REPLACE_EXISTING); + } + } + } + + private void processWithTesseract( + List selectedLanguages, String ocrType, Path tempInputFile, Path tempOutputFile) + throws IOException, InterruptedException { + + // Create temp directory for Tesseract processing + try (TempDirectory tempDir = new TempDirectory(tempFileManager)) { + File tempOutputDir = new File(tempDir.getPath().toFile(), "output"); + File tempImagesDir = new File(tempDir.getPath().toFile(), "images"); + File finalOutputFile = new File(tempDir.getPath().toFile(), "final_output.pdf"); // Create directories tempOutputDir.mkdirs(); tempImagesDir.mkdirs(); - // Save input file - inputFile.transferTo(tempInputFile); - PDFMergerUtility merger = new PDFMergerUtility(); merger.setDestinationFileName(finalOutputFile.toString()); - try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) { + try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) { PDFRenderer pdfRenderer = new PDFRenderer(document); int pageCount = document.getNumberOfPages(); @@ -135,35 +371,20 @@ public class OCRController { new File(tempOutputDir, String.format("page_%d", pageNum)) .toString()); command.add("-l"); - command.add(String.join("+", languages)); - // Always output PDF - command.add("pdf"); + command.add(String.join("+", selectedLanguages)); + command.add("pdf"); // Always output PDF - // Use ProcessExecutor to run tesseract command - try { - ProcessExecutorResult result = - ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT) - .runCommandWithOutputHandling(command); + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT) + .runCommandWithOutputHandling(command); - log.debug( - "Tesseract OCR completed for page {} with exit code {}", - pageNum, - result.getRc()); - - // Add OCR'd PDF to merger - merger.addSource(pageOutputPath); - } catch (IOException | InterruptedException e) { - log.error( - "Error processing page {} with tesseract: {}", - pageNum, - e.getMessage()); - // If OCR fails, fall back to the original page - try (PDDocument pageDoc = new PDDocument()) { - pageDoc.addPage(page); - pageDoc.save(pageOutputPath); - merger.addSource(pageOutputPath); - } + if (result.getRc() != 0) { + throw new RuntimeException( + "Tesseract failed with exit code: " + result.getRc()); } + + // Add OCR'd PDF to merger + merger.addSource(pageOutputPath); } else { // Save original page without OCR try (PDDocument pageDoc = new PDDocument()) { @@ -178,40 +399,11 @@ public class OCRController { // Merge all pages into final PDF merger.mergeDocuments(null); - // Read the final PDF file - byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath()); - String outputFilename = - Filenames.toSimpleFileName(inputFile.getOriginalFilename()) - .replaceFirst("[.][^.]+$", "") - + "_OCR.pdf"; - - return ResponseEntity.ok() - .header( - "Content-Disposition", - "attachment; filename=\"" + outputFilename + "\"") - .contentType(MediaType.APPLICATION_PDF) - .body(pdfContent); - } finally { - // Clean up the temp directory and all its contents - tempFileManager.deleteTempDirectory(tempDirPath); - } - } - - private void addFileToZip(File file, String filename, ZipOutputStream zipOut) - throws IOException { - if (!file.exists()) { - log.warn("File {} does not exist, skipping", file); - return; - } - try (FileInputStream fis = new FileInputStream(file)) { - ZipEntry zipEntry = new ZipEntry(filename); - zipOut.putNextEntry(zipEntry); - byte[] buffer = new byte[1024]; - int length; - while ((length = fis.read(buffer)) >= 0) { - zipOut.write(buffer, 0, length); - } - zipOut.closeEntry(); + // Copy final output to the expected location + Files.copy( + finalOutputFile.toPath(), + tempOutputFile, + java.nio.file.StandardCopyOption.REPLACE_EXISTING); } } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java index b8c347ef1..7cde1d078 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java @@ -16,7 +16,9 @@ import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import stirling.software.SPDF.config.EndpointConfiguration; import stirling.software.common.model.api.PDFFile; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.ProcessExecutor; @@ -28,17 +30,27 @@ import stirling.software.common.util.WebResponseUtils; @RestController @RequestMapping("/api/v1/misc") @Tag(name = "Misc", description = "Miscellaneous APIs") +@Slf4j @RequiredArgsConstructor public class RepairController { private final CustomPDFDocumentFactory pdfDocumentFactory; private final TempFileManager tempFileManager; + private final EndpointConfiguration endpointConfiguration; + + private boolean isGhostscriptEnabled() { + return endpointConfiguration.isGroupEnabled("Ghostscript"); + } + + private boolean isQpdfEnabled() { + return endpointConfiguration.isGroupEnabled("qpdf"); + } @PostMapping(consumes = "multipart/form-data", value = "/repair") @Operation( summary = "Repair a PDF file", description = - "This endpoint repairs a given PDF file by running qpdf command. The PDF is" + "This endpoint repairs a given PDF file by running Ghostscript (primary), qpdf (fallback), or PDFBox (if no external tools available). The PDF is" + " first saved to a temporary location, repaired, read back, and then" + " returned as a response. Input:PDF Output:PDF Type:SISO") public ResponseEntity repairPdf(@ModelAttribute PDFFile file) @@ -46,25 +58,71 @@ public class RepairController { MultipartFile inputFile = file.getFileInput(); // Use TempFile with try-with-resources for automatic cleanup - try (TempFile tempFile = new TempFile(tempFileManager, ".pdf")) { + try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf"); + TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) { + // Save the uploaded file to the temporary location - inputFile.transferTo(tempFile.getFile()); + inputFile.transferTo(tempInputFile.getFile()); - List command = new ArrayList<>(); - command.add("qpdf"); - command.add("--replace-input"); // Automatically fixes problems it can - command.add("--qdf"); // Linearizes and normalizes PDF structure - command.add("--object-streams=disable"); // Can help with some corruptions - command.add(tempFile.getFile().getAbsolutePath()); + boolean repairSuccess = false; - ProcessExecutorResult returnCode = - ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF) - .runCommandWithOutputHandling(command); + // Try Ghostscript first if available + if (isGhostscriptEnabled()) { + try { + List gsCommand = new ArrayList<>(); + gsCommand.add("gs"); + gsCommand.add("-o"); + gsCommand.add(tempOutputFile.getPath().toString()); + gsCommand.add("-sDEVICE=pdfwrite"); + gsCommand.add(tempInputFile.getPath().toString()); - // Read the optimized PDF file - byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempFile.getFile()); + ProcessExecutorResult gsResult = + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(gsCommand); - // Return the optimized PDF as a response + if (gsResult.getRc() == 0) { + repairSuccess = true; + } + } catch (Exception e) { + // Log and continue to QPDF fallback + log.warn("Ghostscript repair failed, trying QPDF fallback: ", e); + } + } + + // Fallback to QPDF if Ghostscript failed or not available + if (!repairSuccess && isQpdfEnabled()) { + List qpdfCommand = new ArrayList<>(); + qpdfCommand.add("qpdf"); + qpdfCommand.add("--replace-input"); // Automatically fixes problems it can + qpdfCommand.add("--qdf"); // Linearizes and normalizes PDF structure + qpdfCommand.add("--object-streams=disable"); // Can help with some corruptions + qpdfCommand.add(tempInputFile.getPath().toString()); + qpdfCommand.add(tempOutputFile.getPath().toString()); + + ProcessExecutorResult qpdfResult = + ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF) + .runCommandWithOutputHandling(qpdfCommand); + + repairSuccess = true; + } + + // Use PDFBox as last resort if no external tools are available + if (!repairSuccess) { + if (!isGhostscriptEnabled() && !isQpdfEnabled()) { + // Basic PDFBox repair - load and save to fix structural issues + try (var document = pdfDocumentFactory.load(tempInputFile.getFile())) { + document.save(tempOutputFile.getFile()); + repairSuccess = true; + } + } else { + throw new IOException("PDF repair failed with available tools"); + } + } + + // Read the repaired PDF file + byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.getFile()); + + // Return the repaired PDF as a response String outputFilename = Filenames.toSimpleFileName(inputFile.getOriginalFilename()) .replaceFirst("[.][^.]+$", "") diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java b/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java index 00279eb96..2955d7160 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/model/api/misc/ProcessPdfWithOcrRequest.java @@ -19,6 +19,18 @@ public class ProcessPdfWithOcrRequest extends PDFFile { defaultValue = "[\"eng\"]") private List languages; + @Schema(description = "Include OCR text in a sidecar text file if set to true") + private boolean sidecar; + + @Schema(description = "Deskew the input file if set to true") + private boolean deskew; + + @Schema(description = "Clean the input file if set to true") + private boolean clean; + + @Schema(description = "Clean the final output if set to true") + private boolean cleanFinal; + @Schema( description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'", requiredMode = Schema.RequiredMode.REQUIRED, @@ -31,4 +43,7 @@ public class ProcessPdfWithOcrRequest extends PDFFile { allowableValues = {"hocr", "sandwich"}, defaultValue = "hocr") private String ocrRenderType = "hocr"; + + @Schema(description = "Remove images from the output PDF if set to true") + private boolean removeImagesAfter; } diff --git a/stirling-pdf/src/main/resources/settings.yml.template b/stirling-pdf/src/main/resources/settings.yml.template index d45b8482b..75f23f03f 100644 --- a/stirling-pdf/src/main/resources/settings.yml.template +++ b/stirling-pdf/src/main/resources/settings.yml.template @@ -164,6 +164,8 @@ processExecutor: weasyPrintSessionLimit: 16 installAppSessionLimit: 1 calibreSessionLimit: 1 + ghostscriptSessionLimit: 8 + ocrMyPdfSessionLimit: 2 timeoutMinutes: # Process executor timeout in minutes libreOfficetimeoutMinutes: 30 pdfToHtmltimeoutMinutes: 20 @@ -172,3 +174,6 @@ processExecutor: installApptimeoutMinutes: 60 calibretimeoutMinutes: 30 tesseractTimeoutMinutes: 30 + qpdfTimeoutMinutes: 30 + ghostscriptTimeoutMinutes: 30 + ocrMyPdfTimeoutMinutes: 30 diff --git a/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html b/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html index f98c20d29..161f4181e 100644 --- a/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html +++ b/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html @@ -79,6 +79,30 @@
+
+ +
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+