mirror of
https://github.com/Stirling-Tools/Stirling-PDF.git
synced 2025-08-27 14:49:23 +00:00
restore OCRMyPDF and ghostscript compression
This commit is contained in:
parent
11e3ccd19f
commit
782c30f934
@ -51,7 +51,6 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
|
||||
tini \
|
||||
bash \
|
||||
curl \
|
||||
qpdf \
|
||||
shadow \
|
||||
su-exec \
|
||||
openssl \
|
||||
@ -69,9 +68,11 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
|
||||
tesseract-ocr-data-deu \
|
||||
tesseract-ocr-data-fra \
|
||||
tesseract-ocr-data-por \
|
||||
unpaper \
|
||||
# CV
|
||||
py3-opencv \
|
||||
python3 \
|
||||
ocrmypdf \
|
||||
py3-pip \
|
||||
py3-pillow@testing \
|
||||
py3-pdf2image@testing && \
|
||||
|
@ -76,16 +76,17 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
|
||||
# pdftohtml
|
||||
poppler-utils \
|
||||
# OCR MY PDF (unpaper for deskew and other advanced features)
|
||||
qpdf \
|
||||
tesseract-ocr-data-eng \
|
||||
tesseract-ocr-data-chi_sim \
|
||||
tesseract-ocr-data-deu \
|
||||
tesseract-ocr-data-fra \
|
||||
tesseract-ocr-data-por \
|
||||
unpaper \
|
||||
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \
|
||||
# CV
|
||||
py3-opencv \
|
||||
python3 \
|
||||
ocrmypdf \
|
||||
py3-pip \
|
||||
py3-pillow@testing \
|
||||
py3-pdf2image@testing && \
|
||||
|
@ -545,6 +545,8 @@ public class ApplicationProperties {
|
||||
private int calibreSessionLimit;
|
||||
private int qpdfSessionLimit;
|
||||
private int tesseractSessionLimit;
|
||||
private int ghostscriptSessionLimit;
|
||||
private int ocrMyPdfSessionLimit;
|
||||
|
||||
public int getQpdfSessionLimit() {
|
||||
return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2;
|
||||
@ -577,6 +579,14 @@ public class ApplicationProperties {
|
||||
public int getCalibreSessionLimit() {
|
||||
return calibreSessionLimit > 0 ? calibreSessionLimit : 1;
|
||||
}
|
||||
|
||||
public int getGhostscriptSessionLimit() {
|
||||
return ghostscriptSessionLimit > 0 ? ghostscriptSessionLimit : 8;
|
||||
}
|
||||
|
||||
public int getOcrMyPdfSessionLimit() {
|
||||
return ocrMyPdfSessionLimit > 0 ? ocrMyPdfSessionLimit : 2;
|
||||
}
|
||||
}
|
||||
|
||||
@Data
|
||||
@ -589,6 +599,8 @@ public class ApplicationProperties {
|
||||
private long calibreTimeoutMinutes;
|
||||
private long tesseractTimeoutMinutes;
|
||||
private long qpdfTimeoutMinutes;
|
||||
private long ghostscriptTimeoutMinutes;
|
||||
private long ocrMyPdfTimeoutMinutes;
|
||||
|
||||
public long getTesseractTimeoutMinutes() {
|
||||
return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30;
|
||||
@ -621,6 +633,14 @@ public class ApplicationProperties {
|
||||
public long getCalibreTimeoutMinutes() {
|
||||
return calibreTimeoutMinutes > 0 ? calibreTimeoutMinutes : 30;
|
||||
}
|
||||
|
||||
public long getGhostscriptTimeoutMinutes() {
|
||||
return ghostscriptTimeoutMinutes > 0 ? ghostscriptTimeoutMinutes : 30;
|
||||
}
|
||||
|
||||
public long getOcrMyPdfTimeoutMinutes() {
|
||||
return ocrMyPdfTimeoutMinutes > 0 ? ocrMyPdfTimeoutMinutes : 30;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -84,6 +84,16 @@ public class ProcessExecutor {
|
||||
.getProcessExecutor()
|
||||
.getSessionLimit()
|
||||
.getCalibreSessionLimit();
|
||||
case GHOSTSCRIPT ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getSessionLimit()
|
||||
.getGhostscriptSessionLimit();
|
||||
case OCR_MY_PDF ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getSessionLimit()
|
||||
.getOcrMyPdfSessionLimit();
|
||||
};
|
||||
|
||||
long timeoutMinutes =
|
||||
@ -128,6 +138,16 @@ public class ProcessExecutor {
|
||||
.getProcessExecutor()
|
||||
.getTimeoutMinutes()
|
||||
.getCalibreTimeoutMinutes();
|
||||
case GHOSTSCRIPT ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getTimeoutMinutes()
|
||||
.getGhostscriptTimeoutMinutes();
|
||||
case OCR_MY_PDF ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getTimeoutMinutes()
|
||||
.getOcrMyPdfTimeoutMinutes();
|
||||
};
|
||||
return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes);
|
||||
});
|
||||
@ -278,7 +298,9 @@ public class ProcessExecutor {
|
||||
INSTALL_APP,
|
||||
CALIBRE,
|
||||
TESSERACT,
|
||||
QPDF
|
||||
QPDF,
|
||||
GHOSTSCRIPT,
|
||||
OCR_MY_PDF
|
||||
}
|
||||
|
||||
public class ProcessExecutorResult {
|
||||
|
@ -9,7 +9,6 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.ConcurrentSkipListSet;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Component;
|
||||
@ -27,8 +26,7 @@ public class TempFileRegistry {
|
||||
private final ConcurrentMap<Path, Instant> registeredFiles = new ConcurrentHashMap<>();
|
||||
private final Set<Path> thirdPartyTempFiles =
|
||||
Collections.newSetFromMap(new ConcurrentHashMap<>());
|
||||
private final Set<Path> tempDirectories =
|
||||
Collections.newSetFromMap(new ConcurrentHashMap<>());
|
||||
private final Set<Path> tempDirectories = Collections.newSetFromMap(new ConcurrentHashMap<>());
|
||||
|
||||
/**
|
||||
* Register a temporary file with the registry.
|
||||
|
@ -21,6 +21,8 @@ public class EndpointConfiguration {
|
||||
private final ApplicationProperties applicationProperties;
|
||||
private Map<String, Boolean> endpointStatuses = new ConcurrentHashMap<>();
|
||||
private Map<String, Set<String>> endpointGroups = new ConcurrentHashMap<>();
|
||||
private Set<String> disabledGroups = new HashSet<>();
|
||||
private Map<String, Set<String>> endpointAlternatives = new ConcurrentHashMap<>();
|
||||
private final boolean runningProOrHigher;
|
||||
|
||||
public EndpointConfiguration(
|
||||
@ -51,16 +53,36 @@ public class EndpointConfiguration {
|
||||
if (endpoint.startsWith("/")) {
|
||||
endpoint = endpoint.substring(1);
|
||||
}
|
||||
|
||||
// Check if endpoint has alternatives (multiple tools can handle it)
|
||||
Set<String> alternatives = endpointAlternatives.get(endpoint);
|
||||
if (alternatives != null && !alternatives.isEmpty()) {
|
||||
// Endpoint is enabled if ANY of its alternative tools are enabled
|
||||
for (String toolGroup : alternatives) {
|
||||
if (isGroupEnabled(toolGroup)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false; // All alternative tools are disabled
|
||||
}
|
||||
|
||||
// Fallback to standard endpoint status check
|
||||
return endpointStatuses.getOrDefault(endpoint, true);
|
||||
}
|
||||
|
||||
public boolean isGroupEnabled(String group) {
|
||||
// Check if group is explicitly disabled first
|
||||
if (disabledGroups.contains(group)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Set<String> endpoints = endpointGroups.get(group);
|
||||
if (endpoints == null || endpoints.isEmpty()) {
|
||||
log.debug("Group '{}' does not exist or has no endpoints", group);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Additional check: if all endpoints in group are disabled, consider group disabled
|
||||
for (String endpoint : endpoints) {
|
||||
if (!isEndpointEnabled(endpoint)) {
|
||||
return false;
|
||||
@ -74,7 +96,22 @@ public class EndpointConfiguration {
|
||||
endpointGroups.computeIfAbsent(group, k -> new HashSet<>()).add(endpoint);
|
||||
}
|
||||
|
||||
public void addEndpointAlternative(String endpoint, String toolGroup) {
|
||||
endpointAlternatives.computeIfAbsent(endpoint, k -> new HashSet<>()).add(toolGroup);
|
||||
}
|
||||
|
||||
public void disableGroup(String group) {
|
||||
disabledGroups.add(group);
|
||||
Set<String> endpoints = endpointGroups.get(group);
|
||||
if (endpoints != null) {
|
||||
for (String endpoint : endpoints) {
|
||||
disableEndpoint(endpoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void enableGroup(String group) {
|
||||
disabledGroups.remove(group);
|
||||
Set<String> endpoints = endpointGroups.get(group);
|
||||
if (endpoints != null) {
|
||||
for (String endpoint : endpoints) {
|
||||
@ -83,13 +120,8 @@ public class EndpointConfiguration {
|
||||
}
|
||||
}
|
||||
|
||||
public void disableGroup(String group) {
|
||||
Set<String> endpoints = endpointGroups.get(group);
|
||||
if (endpoints != null) {
|
||||
for (String endpoint : endpoints) {
|
||||
disableEndpoint(endpoint);
|
||||
}
|
||||
}
|
||||
public Set<String> getDisabledGroups() {
|
||||
return new HashSet<>(disabledGroups);
|
||||
}
|
||||
|
||||
public void logDisabledEndpointsSummary() {
|
||||
@ -101,6 +133,12 @@ public class EndpointConfiguration {
|
||||
.sorted()
|
||||
.toList();
|
||||
|
||||
if (!disabledGroups.isEmpty()) {
|
||||
log.info(
|
||||
"Disabled groups: {}",
|
||||
String.join(", ", disabledGroups.stream().sorted().toList()));
|
||||
}
|
||||
|
||||
if (!disabledList.isEmpty()) {
|
||||
log.info(
|
||||
"Total disabled endpoints: {}. Disabled endpoints: {}",
|
||||
@ -212,7 +250,6 @@ public class EndpointConfiguration {
|
||||
// Unoconvert
|
||||
addEndpointToGroup("Unoconvert", "file-to-pdf");
|
||||
|
||||
addEndpointToGroup("tesseract", "ocr-pdf");
|
||||
|
||||
// Java
|
||||
addEndpointToGroup("Java", "merge-pdfs");
|
||||
@ -261,8 +298,13 @@ public class EndpointConfiguration {
|
||||
addEndpointToGroup("Javascript", "compare");
|
||||
addEndpointToGroup("Javascript", "adjust-contrast");
|
||||
|
||||
// qpdf dependent endpoints
|
||||
addEndpointToGroup("qpdf", "repair");
|
||||
// Multi-tool endpoints - endpoints that can be handled by multiple tools
|
||||
addEndpointAlternative("repair", "qpdf");
|
||||
addEndpointAlternative("repair", "Ghostscript");
|
||||
addEndpointAlternative("compress-pdf", "qpdf");
|
||||
addEndpointAlternative("compress-pdf", "Ghostscript");
|
||||
addEndpointAlternative("ocr-pdf", "tesseract");
|
||||
addEndpointAlternative("ocr-pdf", "OCRmyPDF");
|
||||
|
||||
// Weasyprint dependent endpoints
|
||||
addEndpointToGroup("Weasyprint", "html-to-pdf");
|
||||
|
@ -34,6 +34,8 @@ public class ExternalAppDepConfig {
|
||||
new HashMap<>() {
|
||||
|
||||
{
|
||||
put("gs", List.of("Ghostscript"));
|
||||
put("ocrmypdf", List.of("OCRmyPDF"));
|
||||
put("soffice", List.of("LibreOffice"));
|
||||
put(weasyprintPath, List.of("Weasyprint"));
|
||||
put("pdftohtml", List.of("Pdftohtml"));
|
||||
@ -109,6 +111,8 @@ public class ExternalAppDepConfig {
|
||||
@PostConstruct
|
||||
public void checkDependencies() {
|
||||
// Check core dependencies
|
||||
checkDependencyAndDisableGroup("gs");
|
||||
checkDependencyAndDisableGroup("ocrmypdf");
|
||||
checkDependencyAndDisableGroup("tesseract");
|
||||
checkDependencyAndDisableGroup("soffice");
|
||||
checkDependencyAndDisableGroup("qpdf");
|
||||
|
@ -65,12 +65,14 @@ public class CompressController {
|
||||
|
||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||
private final boolean qpdfEnabled;
|
||||
private final boolean ghostscriptEnabled;
|
||||
|
||||
public CompressController(
|
||||
CustomPDFDocumentFactory pdfDocumentFactory,
|
||||
EndpointConfiguration endpointConfiguration) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
|
||||
this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
|
||||
}
|
||||
|
||||
@Data
|
||||
@ -697,25 +699,69 @@ public class CompressController {
|
||||
|
||||
boolean sizeMet = false;
|
||||
boolean imageCompressionApplied = false;
|
||||
boolean qpdfCompressionApplied = false;
|
||||
|
||||
if (qpdfEnabled && optimizeLevel <= 3) {
|
||||
optimizeLevel = 4;
|
||||
}
|
||||
boolean externalCompressionApplied = false;
|
||||
|
||||
while (!sizeMet && optimizeLevel <= 9) {
|
||||
// Apply image compression for levels 4-9
|
||||
if ((optimizeLevel >= 3 || Boolean.TRUE.equals(convertToGrayscale))
|
||||
&& !imageCompressionApplied) {
|
||||
double scaleFactor = getScaleFactorForLevel(optimizeLevel);
|
||||
float jpegQuality = getJpegQualityForLevel(optimizeLevel);
|
||||
// Apply external compression first
|
||||
if (!externalCompressionApplied) {
|
||||
boolean ghostscriptSuccess = false;
|
||||
|
||||
// Compress images
|
||||
// Try Ghostscript first if available - for ANY compression level
|
||||
if (ghostscriptEnabled) {
|
||||
try {
|
||||
applyGhostscriptCompression(
|
||||
request, optimizeLevel, currentFile, tempFiles);
|
||||
log.info("Ghostscript compression applied successfully");
|
||||
ghostscriptSuccess = true;
|
||||
} catch (IOException e) {
|
||||
log.warn("Ghostscript compression failed, trying fallback methods");
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to QPDF if Ghostscript failed or not available (levels 1-3 only)
|
||||
if (!ghostscriptSuccess && qpdfEnabled && optimizeLevel <= 3) {
|
||||
try {
|
||||
applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
|
||||
log.info("QPDF compression applied successfully");
|
||||
} catch (IOException e) {
|
||||
log.warn("QPDF compression also failed");
|
||||
}
|
||||
}
|
||||
|
||||
if (!ghostscriptSuccess && !qpdfEnabled) {
|
||||
log.info(
|
||||
"No external compression tools available, using image compression only");
|
||||
}
|
||||
|
||||
externalCompressionApplied = true;
|
||||
|
||||
// Skip image compression if Ghostscript succeeded
|
||||
if (ghostscriptSuccess) {
|
||||
imageCompressionApplied = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Apply image compression for levels 4+ only if Ghostscript didn't run
|
||||
if ((optimizeLevel >= 4 || Boolean.TRUE.equals(convertToGrayscale))
|
||||
&& !imageCompressionApplied) {
|
||||
// Use different scale factors based on level
|
||||
double scaleFactor =
|
||||
switch (optimizeLevel) {
|
||||
case 4 -> 0.95; // 95% of original size
|
||||
case 5 -> 0.9; // 90% of original size
|
||||
case 6 -> 0.8; // 80% of original size
|
||||
case 7 -> 0.7; // 70% of original size
|
||||
case 8 -> 0.65; // 65% of original size
|
||||
case 9 -> 0.5; // 50% of original size
|
||||
default -> 1.0;
|
||||
};
|
||||
|
||||
log.info("Applying image compression with scale factor: {}", scaleFactor);
|
||||
Path compressedImageFile =
|
||||
compressImagesInPDF(
|
||||
currentFile,
|
||||
scaleFactor,
|
||||
jpegQuality,
|
||||
0.7f, // Default JPEG quality
|
||||
Boolean.TRUE.equals(convertToGrayscale));
|
||||
|
||||
tempFiles.add(compressedImageFile);
|
||||
@ -723,18 +769,6 @@ public class CompressController {
|
||||
imageCompressionApplied = true;
|
||||
}
|
||||
|
||||
// Apply QPDF compression for all levels
|
||||
if (!qpdfCompressionApplied && qpdfEnabled) {
|
||||
applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
|
||||
qpdfCompressionApplied = true;
|
||||
} else if (!qpdfCompressionApplied) {
|
||||
// If QPDF is disabled, mark as applied and log
|
||||
if (!qpdfEnabled) {
|
||||
log.info("Skipping QPDF compression as QPDF group is disabled");
|
||||
}
|
||||
qpdfCompressionApplied = true;
|
||||
}
|
||||
|
||||
// Check if target size reached or not in auto mode
|
||||
long outputFileSize = Files.size(currentFile);
|
||||
if (outputFileSize <= expectedOutputSize || !autoMode) {
|
||||
@ -754,7 +788,7 @@ public class CompressController {
|
||||
} else {
|
||||
// Reset flags for next iteration with higher optimization level
|
||||
imageCompressionApplied = false;
|
||||
qpdfCompressionApplied = false;
|
||||
externalCompressionApplied = false;
|
||||
optimizeLevel = newOptimizeLevel;
|
||||
}
|
||||
}
|
||||
@ -788,6 +822,96 @@ public class CompressController {
|
||||
}
|
||||
}
|
||||
|
||||
// Run Ghostscript compression
|
||||
private void applyGhostscriptCompression(
|
||||
OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
|
||||
throws IOException {
|
||||
|
||||
long preGsSize = Files.size(currentFile);
|
||||
log.info("Pre-Ghostscript file size: {}", GeneralUtils.formatBytes(preGsSize));
|
||||
|
||||
// Create output file for Ghostscript
|
||||
Path gsOutputFile = Files.createTempFile("gs_output_", ".pdf");
|
||||
tempFiles.add(gsOutputFile);
|
||||
|
||||
// Build Ghostscript command based on optimization level
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("gs");
|
||||
command.add("-sDEVICE=pdfwrite");
|
||||
command.add("-dCompatibilityLevel=1.5");
|
||||
command.add("-dNOPAUSE");
|
||||
command.add("-dQUIET");
|
||||
command.add("-dBATCH");
|
||||
|
||||
// Map optimization levels to Ghostscript settings
|
||||
switch (optimizeLevel) {
|
||||
case 1:
|
||||
command.add("-dPDFSETTINGS=/prepress");
|
||||
break;
|
||||
case 2:
|
||||
command.add("-dPDFSETTINGS=/printer");
|
||||
break;
|
||||
case 3:
|
||||
command.add("-dPDFSETTINGS=/ebook");
|
||||
break;
|
||||
case 4:
|
||||
case 5:
|
||||
command.add("-dPDFSETTINGS=/screen");
|
||||
break;
|
||||
case 6:
|
||||
case 7:
|
||||
command.add("-dPDFSETTINGS=/screen");
|
||||
command.add("-dColorImageResolution=150");
|
||||
command.add("-dGrayImageResolution=150");
|
||||
command.add("-dMonoImageResolution=300");
|
||||
break;
|
||||
case 8:
|
||||
case 9:
|
||||
command.add("-dPDFSETTINGS=/screen");
|
||||
command.add("-dColorImageResolution=100");
|
||||
command.add("-dGrayImageResolution=100");
|
||||
command.add("-dMonoImageResolution=200");
|
||||
break;
|
||||
case 10:
|
||||
command.add("-dPDFSETTINGS=/screen");
|
||||
command.add("-dColorImageResolution=72");
|
||||
command.add("-dGrayImageResolution=72");
|
||||
command.add("-dMonoImageResolution=150");
|
||||
break;
|
||||
default:
|
||||
command.add("-dPDFSETTINGS=/screen");
|
||||
break;
|
||||
}
|
||||
|
||||
command.add("-sOutputFile=" + gsOutputFile.toString());
|
||||
command.add(currentFile.toString());
|
||||
|
||||
ProcessExecutorResult returnCode = null;
|
||||
try {
|
||||
returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
if (returnCode.getRc() == 0) {
|
||||
// Update current file to the Ghostscript output
|
||||
Files.copy(gsOutputFile, currentFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
|
||||
long postGsSize = Files.size(currentFile);
|
||||
double gsReduction = 100.0 - ((postGsSize * 100.0) / preGsSize);
|
||||
log.info(
|
||||
"Post-Ghostscript file size: {} (reduced by {}%)",
|
||||
GeneralUtils.formatBytes(postGsSize), String.format("%.1f", gsReduction));
|
||||
} else {
|
||||
log.warn("Ghostscript compression failed with return code: {}", returnCode.getRc());
|
||||
throw new IOException("Ghostscript compression failed");
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.warn("Ghostscript compression failed, will fallback to other methods", e);
|
||||
throw new IOException("Ghostscript compression failed", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Run QPDF compression
|
||||
private void applyQpdfCompression(
|
||||
OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
|
||||
|
@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.misc;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.*;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.zip.ZipEntry;
|
||||
@ -26,26 +27,42 @@ import io.github.pixee.security.Filenames;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.config.EndpointConfiguration;
|
||||
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
|
||||
import stirling.software.common.model.ApplicationProperties;
|
||||
import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||
import stirling.software.common.util.ProcessExecutor;
|
||||
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
|
||||
import stirling.software.common.util.TempDirectory;
|
||||
import stirling.software.common.util.TempFile;
|
||||
import stirling.software.common.util.TempFileManager;
|
||||
import stirling.software.common.util.WebResponseUtils;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/misc")
|
||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
public class OCRController {
|
||||
|
||||
private final ApplicationProperties applicationProperties;
|
||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||
private final TempFileManager tempFileManager;
|
||||
private final boolean ocrMyPdfEnabled;
|
||||
private final boolean tesseractEnabled;
|
||||
|
||||
public OCRController(
|
||||
ApplicationProperties applicationProperties,
|
||||
CustomPDFDocumentFactory pdfDocumentFactory,
|
||||
TempFileManager tempFileManager,
|
||||
EndpointConfiguration endpointConfiguration) {
|
||||
this.applicationProperties = applicationProperties;
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
this.tempFileManager = tempFileManager;
|
||||
this.ocrMyPdfEnabled = endpointConfiguration.isGroupEnabled("OCRmyPDF");
|
||||
this.tesseractEnabled = endpointConfiguration.isGroupEnabled("tesseract");
|
||||
}
|
||||
|
||||
/** Gets the list of available Tesseract languages from the tessdata directory */
|
||||
public List<String> getAvailableTesseractLanguages() {
|
||||
@ -63,39 +80,261 @@ public class OCRController {
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
|
||||
@Operation(
|
||||
summary = "Process PDF files with OCR using Tesseract",
|
||||
summary = "Process a PDF file with OCR",
|
||||
description =
|
||||
"Takes a PDF file as input, performs OCR using specified languages and OCR type"
|
||||
+ " (skip-text/force-ocr), and returns the processed PDF. Input:PDF"
|
||||
+ " Output:PDF Type:SISO")
|
||||
"This endpoint processes a PDF file using OCR (Optical Character Recognition). "
|
||||
+ "Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. "
|
||||
+ "Uses OCRmyPDF if available, falls back to Tesseract. Input:PDF Output:PDF Type:SI-Conditional")
|
||||
public ResponseEntity<byte[]> processPdfWithOCR(
|
||||
@ModelAttribute ProcessPdfWithOcrRequest request)
|
||||
throws IOException, InterruptedException {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
List<String> languages = request.getLanguages();
|
||||
List<String> selectedLanguages = request.getLanguages();
|
||||
Boolean sidecar = request.isSidecar();
|
||||
Boolean deskew = request.isDeskew();
|
||||
Boolean clean = request.isClean();
|
||||
Boolean cleanFinal = request.isCleanFinal();
|
||||
String ocrType = request.getOcrType();
|
||||
String ocrRenderType = request.getOcrRenderType();
|
||||
Boolean removeImagesAfter = request.isRemoveImagesAfter();
|
||||
|
||||
// Create a temp directory using TempFileManager directly
|
||||
Path tempDirPath = tempFileManager.createTempDirectory();
|
||||
File tempDir = tempDirPath.toFile();
|
||||
if (selectedLanguages == null || selectedLanguages.isEmpty()) {
|
||||
throw new IOException("Please select at least one language.");
|
||||
}
|
||||
|
||||
if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) {
|
||||
throw new IOException("ocrRenderType wrong");
|
||||
}
|
||||
|
||||
// Get available Tesseract languages
|
||||
List<String> availableLanguages = getAvailableTesseractLanguages();
|
||||
|
||||
// Validate selected languages
|
||||
selectedLanguages =
|
||||
selectedLanguages.stream().filter(availableLanguages::contains).toList();
|
||||
|
||||
if (selectedLanguages.isEmpty()) {
|
||||
throw new IOException("None of the selected languages are valid.");
|
||||
}
|
||||
|
||||
// Use try-with-resources for proper temp file management
|
||||
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
|
||||
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
|
||||
|
||||
inputFile.transferTo(tempInputFile.getFile());
|
||||
|
||||
TempFile sidecarTextFile = null;
|
||||
|
||||
try {
|
||||
File tempInputFile = new File(tempDir, "input.pdf");
|
||||
File tempOutputDir = new File(tempDir, "output");
|
||||
File tempImagesDir = new File(tempDir, "images");
|
||||
File finalOutputFile = new File(tempDir, "final_output.pdf");
|
||||
// Use OCRmyPDF if available (no fallback - error if it fails)
|
||||
if (ocrMyPdfEnabled) {
|
||||
if (sidecar != null && sidecar) {
|
||||
sidecarTextFile = new TempFile(tempFileManager, ".txt");
|
||||
}
|
||||
|
||||
processWithOcrMyPdf(
|
||||
selectedLanguages,
|
||||
sidecar,
|
||||
deskew,
|
||||
clean,
|
||||
cleanFinal,
|
||||
ocrType,
|
||||
ocrRenderType,
|
||||
removeImagesAfter,
|
||||
tempInputFile.getPath(),
|
||||
tempOutputFile.getPath(),
|
||||
sidecarTextFile != null ? sidecarTextFile.getPath() : null);
|
||||
log.info("OCRmyPDF processing completed successfully");
|
||||
}
|
||||
// Use Tesseract only if OCRmyPDF is not available
|
||||
else if (tesseractEnabled) {
|
||||
processWithTesseract(
|
||||
selectedLanguages,
|
||||
ocrType,
|
||||
tempInputFile.getPath(),
|
||||
tempOutputFile.getPath());
|
||||
log.info("Tesseract processing completed successfully");
|
||||
} else {
|
||||
throw new IOException("No OCR tools are available");
|
||||
}
|
||||
|
||||
// Read the processed PDF file
|
||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath());
|
||||
|
||||
// Return the OCR processed PDF as a response
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_OCR.pdf";
|
||||
|
||||
if (sidecar != null && sidecar && sidecarTextFile != null) {
|
||||
// Create a zip file containing both the PDF and the text file
|
||||
String outputZipFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_OCR.zip";
|
||||
|
||||
try (TempFile tempZipFile = new TempFile(tempFileManager, ".zip");
|
||||
ZipOutputStream zipOut =
|
||||
new ZipOutputStream(
|
||||
Files.newOutputStream(tempZipFile.getPath()))) {
|
||||
|
||||
// Add PDF file to the zip
|
||||
ZipEntry pdfEntry = new ZipEntry(outputFilename);
|
||||
zipOut.putNextEntry(pdfEntry);
|
||||
zipOut.write(pdfBytes);
|
||||
zipOut.closeEntry();
|
||||
|
||||
// Add text file to the zip
|
||||
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
|
||||
zipOut.putNextEntry(txtEntry);
|
||||
Files.copy(sidecarTextFile.getPath(), zipOut);
|
||||
zipOut.closeEntry();
|
||||
|
||||
zipOut.finish();
|
||||
|
||||
byte[] zipBytes = Files.readAllBytes(tempZipFile.getPath());
|
||||
|
||||
// Return the zip file containing both the PDF and the text file
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
|
||||
}
|
||||
} else {
|
||||
// Return the OCR processed PDF as a response
|
||||
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
||||
}
|
||||
|
||||
} finally {
|
||||
// Clean up sidecar temp file if created
|
||||
if (sidecarTextFile != null) {
|
||||
try {
|
||||
sidecarTextFile.close();
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to close sidecar temp file", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void processWithOcrMyPdf(
|
||||
List<String> selectedLanguages,
|
||||
Boolean sidecar,
|
||||
Boolean deskew,
|
||||
Boolean clean,
|
||||
Boolean cleanFinal,
|
||||
String ocrType,
|
||||
String ocrRenderType,
|
||||
Boolean removeImagesAfter,
|
||||
Path tempInputFile,
|
||||
Path tempOutputFile,
|
||||
Path sidecarTextPath)
|
||||
throws IOException, InterruptedException {
|
||||
|
||||
// Build OCRmyPDF command
|
||||
String languageOption = String.join("+", selectedLanguages);
|
||||
|
||||
List<String> command =
|
||||
new ArrayList<>(
|
||||
Arrays.asList(
|
||||
"ocrmypdf",
|
||||
"--verbose",
|
||||
"2",
|
||||
"--output-type",
|
||||
"pdf",
|
||||
"--pdf-renderer",
|
||||
ocrRenderType));
|
||||
|
||||
if (sidecar != null && sidecar && sidecarTextPath != null) {
|
||||
command.add("--sidecar");
|
||||
command.add(sidecarTextPath.toString());
|
||||
}
|
||||
|
||||
if (deskew != null && deskew) {
|
||||
command.add("--deskew");
|
||||
}
|
||||
if (clean != null && clean) {
|
||||
command.add("--clean");
|
||||
}
|
||||
if (cleanFinal != null && cleanFinal) {
|
||||
command.add("--clean-final");
|
||||
}
|
||||
if (ocrType != null && !"".equals(ocrType)) {
|
||||
if ("skip-text".equals(ocrType)) {
|
||||
command.add("--skip-text");
|
||||
} else if ("force-ocr".equals(ocrType)) {
|
||||
command.add("--force-ocr");
|
||||
}
|
||||
}
|
||||
|
||||
command.addAll(
|
||||
Arrays.asList(
|
||||
"--language",
|
||||
languageOption,
|
||||
tempInputFile.toString(),
|
||||
tempOutputFile.toString()));
|
||||
|
||||
// Run CLI command
|
||||
ProcessExecutorResult result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
if (result.getRc() != 0
|
||||
&& result.getMessages().contains("multiprocessing/synchronize.py")
|
||||
&& result.getMessages().contains("OSError: [Errno 38] Function not implemented")) {
|
||||
command.add("--jobs");
|
||||
command.add("1");
|
||||
result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
}
|
||||
|
||||
if (result.getRc() != 0) {
|
||||
throw new IOException("OCRmyPDF failed with return code: " + result.getRc());
|
||||
}
|
||||
|
||||
// Remove images from the OCR processed PDF if the flag is set to true
|
||||
if (removeImagesAfter != null && removeImagesAfter) {
|
||||
try (TempFile tempPdfWithoutImages = new TempFile(tempFileManager, "_no_images.pdf")) {
|
||||
List<String> gsCommand =
|
||||
Arrays.asList(
|
||||
"gs",
|
||||
"-sDEVICE=pdfwrite",
|
||||
"-dFILTERIMAGE",
|
||||
"-o",
|
||||
tempPdfWithoutImages.getPath().toString(),
|
||||
tempOutputFile.toString());
|
||||
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
.runCommandWithOutputHandling(gsCommand);
|
||||
|
||||
// Replace output file with version without images
|
||||
Files.copy(
|
||||
tempPdfWithoutImages.getPath(),
|
||||
tempOutputFile,
|
||||
java.nio.file.StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void processWithTesseract(
|
||||
List<String> selectedLanguages, String ocrType, Path tempInputFile, Path tempOutputFile)
|
||||
throws IOException, InterruptedException {
|
||||
|
||||
// Create temp directory for Tesseract processing
|
||||
try (TempDirectory tempDir = new TempDirectory(tempFileManager)) {
|
||||
File tempOutputDir = new File(tempDir.getPath().toFile(), "output");
|
||||
File tempImagesDir = new File(tempDir.getPath().toFile(), "images");
|
||||
File finalOutputFile = new File(tempDir.getPath().toFile(), "final_output.pdf");
|
||||
|
||||
// Create directories
|
||||
tempOutputDir.mkdirs();
|
||||
tempImagesDir.mkdirs();
|
||||
|
||||
// Save input file
|
||||
inputFile.transferTo(tempInputFile);
|
||||
|
||||
PDFMergerUtility merger = new PDFMergerUtility();
|
||||
merger.setDestinationFileName(finalOutputFile.toString());
|
||||
|
||||
try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
int pageCount = document.getNumberOfPages();
|
||||
|
||||
@ -135,35 +374,20 @@ public class OCRController {
|
||||
new File(tempOutputDir, String.format("page_%d", pageNum))
|
||||
.toString());
|
||||
command.add("-l");
|
||||
command.add(String.join("+", languages));
|
||||
// Always output PDF
|
||||
command.add("pdf");
|
||||
command.add(String.join("+", selectedLanguages));
|
||||
command.add("pdf"); // Always output PDF
|
||||
|
||||
// Use ProcessExecutor to run tesseract command
|
||||
try {
|
||||
ProcessExecutorResult result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
log.debug(
|
||||
"Tesseract OCR completed for page {} with exit code {}",
|
||||
pageNum,
|
||||
result.getRc());
|
||||
if (result.getRc() != 0) {
|
||||
throw new RuntimeException(
|
||||
"Tesseract failed with exit code: " + result.getRc());
|
||||
}
|
||||
|
||||
// Add OCR'd PDF to merger
|
||||
merger.addSource(pageOutputPath);
|
||||
} catch (IOException | InterruptedException e) {
|
||||
log.error(
|
||||
"Error processing page {} with tesseract: {}",
|
||||
pageNum,
|
||||
e.getMessage());
|
||||
// If OCR fails, fall back to the original page
|
||||
try (PDDocument pageDoc = new PDDocument()) {
|
||||
pageDoc.addPage(page);
|
||||
pageDoc.save(pageOutputPath);
|
||||
merger.addSource(pageOutputPath);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Save original page without OCR
|
||||
try (PDDocument pageDoc = new PDDocument()) {
|
||||
@ -178,40 +402,11 @@ public class OCRController {
|
||||
// Merge all pages into final PDF
|
||||
merger.mergeDocuments(null);
|
||||
|
||||
// Read the final PDF file
|
||||
byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath());
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_OCR.pdf";
|
||||
|
||||
return ResponseEntity.ok()
|
||||
.header(
|
||||
"Content-Disposition",
|
||||
"attachment; filename=\"" + outputFilename + "\"")
|
||||
.contentType(MediaType.APPLICATION_PDF)
|
||||
.body(pdfContent);
|
||||
} finally {
|
||||
// Clean up the temp directory and all its contents
|
||||
tempFileManager.deleteTempDirectory(tempDirPath);
|
||||
}
|
||||
}
|
||||
|
||||
private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
|
||||
throws IOException {
|
||||
if (!file.exists()) {
|
||||
log.warn("File {} does not exist, skipping", file);
|
||||
return;
|
||||
}
|
||||
try (FileInputStream fis = new FileInputStream(file)) {
|
||||
ZipEntry zipEntry = new ZipEntry(filename);
|
||||
zipOut.putNextEntry(zipEntry);
|
||||
byte[] buffer = new byte[1024];
|
||||
int length;
|
||||
while ((length = fis.read(buffer)) >= 0) {
|
||||
zipOut.write(buffer, 0, length);
|
||||
}
|
||||
zipOut.closeEntry();
|
||||
// Copy final output to the expected location
|
||||
Files.copy(
|
||||
finalOutputFile.toPath(),
|
||||
tempOutputFile,
|
||||
java.nio.file.StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -15,8 +15,7 @@ import io.github.pixee.security.Filenames;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
import stirling.software.SPDF.config.EndpointConfiguration;
|
||||
import stirling.software.common.model.api.PDFFile;
|
||||
import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||
import stirling.software.common.util.ProcessExecutor;
|
||||
@ -28,17 +27,28 @@ import stirling.software.common.util.WebResponseUtils;
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/misc")
|
||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||
@RequiredArgsConstructor
|
||||
public class RepairController {
|
||||
|
||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||
private final TempFileManager tempFileManager;
|
||||
private final boolean ghostscriptEnabled;
|
||||
private final boolean qpdfEnabled;
|
||||
|
||||
public RepairController(
|
||||
CustomPDFDocumentFactory pdfDocumentFactory,
|
||||
TempFileManager tempFileManager,
|
||||
EndpointConfiguration endpointConfiguration) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
this.tempFileManager = tempFileManager;
|
||||
this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
|
||||
this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/repair")
|
||||
@Operation(
|
||||
summary = "Repair a PDF file",
|
||||
description =
|
||||
"This endpoint repairs a given PDF file by running qpdf command. The PDF is"
|
||||
"This endpoint repairs a given PDF file by running Ghostscript (primary), qpdf (fallback), or PDFBox (if no external tools available). The PDF is"
|
||||
+ " first saved to a temporary location, repaired, read back, and then"
|
||||
+ " returned as a response. Input:PDF Output:PDF Type:SISO")
|
||||
public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile file)
|
||||
@ -46,25 +56,72 @@ public class RepairController {
|
||||
MultipartFile inputFile = file.getFileInput();
|
||||
|
||||
// Use TempFile with try-with-resources for automatic cleanup
|
||||
try (TempFile tempFile = new TempFile(tempFileManager, ".pdf")) {
|
||||
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
|
||||
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
|
||||
|
||||
// Save the uploaded file to the temporary location
|
||||
inputFile.transferTo(tempFile.getFile());
|
||||
inputFile.transferTo(tempInputFile.getFile());
|
||||
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("qpdf");
|
||||
command.add("--replace-input"); // Automatically fixes problems it can
|
||||
command.add("--qdf"); // Linearizes and normalizes PDF structure
|
||||
command.add("--object-streams=disable"); // Can help with some corruptions
|
||||
command.add(tempFile.getFile().getAbsolutePath());
|
||||
boolean repairSuccess = false;
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
// Try Ghostscript first if available
|
||||
if (ghostscriptEnabled) {
|
||||
try {
|
||||
List<String> gsCommand = new ArrayList<>();
|
||||
gsCommand.add("gs");
|
||||
gsCommand.add("-o");
|
||||
gsCommand.add(tempOutputFile.getPath().toString());
|
||||
gsCommand.add("-sDEVICE=pdfwrite");
|
||||
gsCommand.add(tempInputFile.getPath().toString());
|
||||
|
||||
ProcessExecutorResult gsResult =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
.runCommandWithOutputHandling(gsCommand);
|
||||
|
||||
if (gsResult.getRc() == 0) {
|
||||
repairSuccess = true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// Log and continue to QPDF fallback
|
||||
System.out.println(
|
||||
"Ghostscript repair failed, trying QPDF fallback: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to QPDF if Ghostscript failed or not available
|
||||
if (!repairSuccess && qpdfEnabled) {
|
||||
List<String> qpdfCommand = new ArrayList<>();
|
||||
qpdfCommand.add("qpdf");
|
||||
qpdfCommand.add("--replace-input"); // Automatically fixes problems it can
|
||||
qpdfCommand.add("--qdf"); // Linearizes and normalizes PDF structure
|
||||
qpdfCommand.add("--object-streams=disable"); // Can help with some corruptions
|
||||
qpdfCommand.add(tempInputFile.getPath().toString());
|
||||
qpdfCommand.add(tempOutputFile.getPath().toString());
|
||||
|
||||
ProcessExecutorResult qpdfResult =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
.runCommandWithOutputHandling(qpdfCommand);
|
||||
|
||||
// Read the optimized PDF file
|
||||
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempFile.getFile());
|
||||
repairSuccess = true;
|
||||
}
|
||||
|
||||
// Return the optimized PDF as a response
|
||||
// Use PDFBox as last resort if no external tools are available
|
||||
if (!repairSuccess) {
|
||||
if (!ghostscriptEnabled && !qpdfEnabled) {
|
||||
// Basic PDFBox repair - load and save to fix structural issues
|
||||
try (var document = pdfDocumentFactory.load(tempInputFile.getFile())) {
|
||||
document.save(tempOutputFile.getFile());
|
||||
repairSuccess = true;
|
||||
}
|
||||
} else {
|
||||
throw new IOException("PDF repair failed with available tools");
|
||||
}
|
||||
}
|
||||
|
||||
// Read the repaired PDF file
|
||||
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.getFile());
|
||||
|
||||
// Return the repaired PDF as a response
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
|
@ -19,6 +19,18 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
|
||||
defaultValue = "[\"eng\"]")
|
||||
private List<String> languages;
|
||||
|
||||
@Schema(description = "Include OCR text in a sidecar text file if set to true")
|
||||
private boolean sidecar;
|
||||
|
||||
@Schema(description = "Deskew the input file if set to true")
|
||||
private boolean deskew;
|
||||
|
||||
@Schema(description = "Clean the input file if set to true")
|
||||
private boolean clean;
|
||||
|
||||
@Schema(description = "Clean the final output if set to true")
|
||||
private boolean cleanFinal;
|
||||
|
||||
@Schema(
|
||||
description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'",
|
||||
requiredMode = Schema.RequiredMode.REQUIRED,
|
||||
@ -31,4 +43,7 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
|
||||
allowableValues = {"hocr", "sandwich"},
|
||||
defaultValue = "hocr")
|
||||
private String ocrRenderType = "hocr";
|
||||
|
||||
@Schema(description = "Remove images from the output PDF if set to true")
|
||||
private boolean removeImagesAfter;
|
||||
}
|
||||
|
@ -79,6 +79,30 @@
|
||||
</select>
|
||||
</div>
|
||||
<br>
|
||||
<div class="mb-3" th:if>
|
||||
<label class="form-label">OCR Options</label>
|
||||
<div class="form-check">
|
||||
<input type="checkbox" class="form-check-input" id="sidecar" name="sidecar" value="true">
|
||||
<label class="form-check-label" for="sidecar">Include OCR text in sidecar text file</label>
|
||||
</div>
|
||||
<div class="form-check">
|
||||
<input type="checkbox" class="form-check-input" id="deskew" name="deskew" value="true">
|
||||
<label class="form-check-label" for="deskew">Deskew input file</label>
|
||||
</div>
|
||||
<div class="form-check">
|
||||
<input type="checkbox" class="form-check-input" id="clean" name="clean" value="true">
|
||||
<label class="form-check-label" for="clean">Clean input file</label>
|
||||
</div>
|
||||
<div class="form-check">
|
||||
<input type="checkbox" class="form-check-input" id="cleanFinal" name="cleanFinal" value="true">
|
||||
<label class="form-check-label" for="cleanFinal">Clean final output</label>
|
||||
</div>
|
||||
<div class="form-check">
|
||||
<input type="checkbox" class="form-check-input" id="removeImagesAfter" name="removeImagesAfter" value="true">
|
||||
<label class="form-check-label" for="removeImagesAfter">Remove images from output PDF</label>
|
||||
</div>
|
||||
</div>
|
||||
<br>
|
||||
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{ocr.submit}"></button>
|
||||
</form>
|
||||
<script th:inline="javascript">
|
||||
|
Loading…
x
Reference in New Issue
Block a user