mirror of
https://github.com/Stirling-Tools/Stirling-PDF.git
synced 2025-08-27 14:49:23 +00:00
restore OCRMyPDF and ghostscript compression
This commit is contained in:
parent
11e3ccd19f
commit
782c30f934
@ -51,7 +51,6 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
|
|||||||
tini \
|
tini \
|
||||||
bash \
|
bash \
|
||||||
curl \
|
curl \
|
||||||
qpdf \
|
|
||||||
shadow \
|
shadow \
|
||||||
su-exec \
|
su-exec \
|
||||||
openssl \
|
openssl \
|
||||||
@ -69,9 +68,11 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
|
|||||||
tesseract-ocr-data-deu \
|
tesseract-ocr-data-deu \
|
||||||
tesseract-ocr-data-fra \
|
tesseract-ocr-data-fra \
|
||||||
tesseract-ocr-data-por \
|
tesseract-ocr-data-por \
|
||||||
|
unpaper \
|
||||||
# CV
|
# CV
|
||||||
py3-opencv \
|
py3-opencv \
|
||||||
python3 \
|
python3 \
|
||||||
|
ocrmypdf \
|
||||||
py3-pip \
|
py3-pip \
|
||||||
py3-pillow@testing \
|
py3-pillow@testing \
|
||||||
py3-pdf2image@testing && \
|
py3-pdf2image@testing && \
|
||||||
|
@ -76,16 +76,17 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
|
|||||||
# pdftohtml
|
# pdftohtml
|
||||||
poppler-utils \
|
poppler-utils \
|
||||||
# OCR MY PDF (unpaper for deskew and other advanced features)
|
# OCR MY PDF (unpaper for deskew and other advanced features)
|
||||||
qpdf \
|
|
||||||
tesseract-ocr-data-eng \
|
tesseract-ocr-data-eng \
|
||||||
tesseract-ocr-data-chi_sim \
|
tesseract-ocr-data-chi_sim \
|
||||||
tesseract-ocr-data-deu \
|
tesseract-ocr-data-deu \
|
||||||
tesseract-ocr-data-fra \
|
tesseract-ocr-data-fra \
|
||||||
tesseract-ocr-data-por \
|
tesseract-ocr-data-por \
|
||||||
|
unpaper \
|
||||||
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \
|
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \
|
||||||
# CV
|
# CV
|
||||||
py3-opencv \
|
py3-opencv \
|
||||||
python3 \
|
python3 \
|
||||||
|
ocrmypdf \
|
||||||
py3-pip \
|
py3-pip \
|
||||||
py3-pillow@testing \
|
py3-pillow@testing \
|
||||||
py3-pdf2image@testing && \
|
py3-pdf2image@testing && \
|
||||||
|
@ -545,6 +545,8 @@ public class ApplicationProperties {
|
|||||||
private int calibreSessionLimit;
|
private int calibreSessionLimit;
|
||||||
private int qpdfSessionLimit;
|
private int qpdfSessionLimit;
|
||||||
private int tesseractSessionLimit;
|
private int tesseractSessionLimit;
|
||||||
|
private int ghostscriptSessionLimit;
|
||||||
|
private int ocrMyPdfSessionLimit;
|
||||||
|
|
||||||
public int getQpdfSessionLimit() {
|
public int getQpdfSessionLimit() {
|
||||||
return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2;
|
return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2;
|
||||||
@ -577,6 +579,14 @@ public class ApplicationProperties {
|
|||||||
public int getCalibreSessionLimit() {
|
public int getCalibreSessionLimit() {
|
||||||
return calibreSessionLimit > 0 ? calibreSessionLimit : 1;
|
return calibreSessionLimit > 0 ? calibreSessionLimit : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int getGhostscriptSessionLimit() {
|
||||||
|
return ghostscriptSessionLimit > 0 ? ghostscriptSessionLimit : 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getOcrMyPdfSessionLimit() {
|
||||||
|
return ocrMyPdfSessionLimit > 0 ? ocrMyPdfSessionLimit : 2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@ -589,6 +599,8 @@ public class ApplicationProperties {
|
|||||||
private long calibreTimeoutMinutes;
|
private long calibreTimeoutMinutes;
|
||||||
private long tesseractTimeoutMinutes;
|
private long tesseractTimeoutMinutes;
|
||||||
private long qpdfTimeoutMinutes;
|
private long qpdfTimeoutMinutes;
|
||||||
|
private long ghostscriptTimeoutMinutes;
|
||||||
|
private long ocrMyPdfTimeoutMinutes;
|
||||||
|
|
||||||
public long getTesseractTimeoutMinutes() {
|
public long getTesseractTimeoutMinutes() {
|
||||||
return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30;
|
return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30;
|
||||||
@ -621,6 +633,14 @@ public class ApplicationProperties {
|
|||||||
public long getCalibreTimeoutMinutes() {
|
public long getCalibreTimeoutMinutes() {
|
||||||
return calibreTimeoutMinutes > 0 ? calibreTimeoutMinutes : 30;
|
return calibreTimeoutMinutes > 0 ? calibreTimeoutMinutes : 30;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public long getGhostscriptTimeoutMinutes() {
|
||||||
|
return ghostscriptTimeoutMinutes > 0 ? ghostscriptTimeoutMinutes : 30;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long getOcrMyPdfTimeoutMinutes() {
|
||||||
|
return ocrMyPdfTimeoutMinutes > 0 ? ocrMyPdfTimeoutMinutes : 30;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -308,7 +308,7 @@ public class TempFileCleanupService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
java.util.List<Path> subdirectories = new java.util.ArrayList<>();
|
java.util.List<Path> subdirectories = new java.util.ArrayList<>();
|
||||||
|
|
||||||
try (Stream<Path> pathStream = Files.list(directory)) {
|
try (Stream<Path> pathStream = Files.list(directory)) {
|
||||||
pathStream.forEach(
|
pathStream.forEach(
|
||||||
path -> {
|
path -> {
|
||||||
@ -347,7 +347,7 @@ public class TempFileCleanupService {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Path subdirectory : subdirectories) {
|
for (Path subdirectory : subdirectories) {
|
||||||
try {
|
try {
|
||||||
cleanupDirectoryStreaming(
|
cleanupDirectoryStreaming(
|
||||||
|
@ -84,6 +84,16 @@ public class ProcessExecutor {
|
|||||||
.getProcessExecutor()
|
.getProcessExecutor()
|
||||||
.getSessionLimit()
|
.getSessionLimit()
|
||||||
.getCalibreSessionLimit();
|
.getCalibreSessionLimit();
|
||||||
|
case GHOSTSCRIPT ->
|
||||||
|
applicationProperties
|
||||||
|
.getProcessExecutor()
|
||||||
|
.getSessionLimit()
|
||||||
|
.getGhostscriptSessionLimit();
|
||||||
|
case OCR_MY_PDF ->
|
||||||
|
applicationProperties
|
||||||
|
.getProcessExecutor()
|
||||||
|
.getSessionLimit()
|
||||||
|
.getOcrMyPdfSessionLimit();
|
||||||
};
|
};
|
||||||
|
|
||||||
long timeoutMinutes =
|
long timeoutMinutes =
|
||||||
@ -128,6 +138,16 @@ public class ProcessExecutor {
|
|||||||
.getProcessExecutor()
|
.getProcessExecutor()
|
||||||
.getTimeoutMinutes()
|
.getTimeoutMinutes()
|
||||||
.getCalibreTimeoutMinutes();
|
.getCalibreTimeoutMinutes();
|
||||||
|
case GHOSTSCRIPT ->
|
||||||
|
applicationProperties
|
||||||
|
.getProcessExecutor()
|
||||||
|
.getTimeoutMinutes()
|
||||||
|
.getGhostscriptTimeoutMinutes();
|
||||||
|
case OCR_MY_PDF ->
|
||||||
|
applicationProperties
|
||||||
|
.getProcessExecutor()
|
||||||
|
.getTimeoutMinutes()
|
||||||
|
.getOcrMyPdfTimeoutMinutes();
|
||||||
};
|
};
|
||||||
return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes);
|
return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes);
|
||||||
});
|
});
|
||||||
@ -278,7 +298,9 @@ public class ProcessExecutor {
|
|||||||
INSTALL_APP,
|
INSTALL_APP,
|
||||||
CALIBRE,
|
CALIBRE,
|
||||||
TESSERACT,
|
TESSERACT,
|
||||||
QPDF
|
QPDF,
|
||||||
|
GHOSTSCRIPT,
|
||||||
|
OCR_MY_PDF
|
||||||
}
|
}
|
||||||
|
|
||||||
public class ProcessExecutorResult {
|
public class ProcessExecutorResult {
|
||||||
|
@ -9,7 +9,6 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.ConcurrentMap;
|
import java.util.concurrent.ConcurrentMap;
|
||||||
import java.util.concurrent.ConcurrentSkipListSet;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
@ -24,11 +23,10 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Component
|
@Component
|
||||||
public class TempFileRegistry {
|
public class TempFileRegistry {
|
||||||
|
|
||||||
private final ConcurrentMap<Path, Instant> registeredFiles = new ConcurrentHashMap<>();
|
private final ConcurrentMap<Path, Instant> registeredFiles = new ConcurrentHashMap<>();
|
||||||
private final Set<Path> thirdPartyTempFiles =
|
private final Set<Path> thirdPartyTempFiles =
|
||||||
Collections.newSetFromMap(new ConcurrentHashMap<>());
|
Collections.newSetFromMap(new ConcurrentHashMap<>());
|
||||||
private final Set<Path> tempDirectories =
|
private final Set<Path> tempDirectories = Collections.newSetFromMap(new ConcurrentHashMap<>());
|
||||||
Collections.newSetFromMap(new ConcurrentHashMap<>());
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Register a temporary file with the registry.
|
* Register a temporary file with the registry.
|
||||||
|
@ -21,6 +21,8 @@ public class EndpointConfiguration {
|
|||||||
private final ApplicationProperties applicationProperties;
|
private final ApplicationProperties applicationProperties;
|
||||||
private Map<String, Boolean> endpointStatuses = new ConcurrentHashMap<>();
|
private Map<String, Boolean> endpointStatuses = new ConcurrentHashMap<>();
|
||||||
private Map<String, Set<String>> endpointGroups = new ConcurrentHashMap<>();
|
private Map<String, Set<String>> endpointGroups = new ConcurrentHashMap<>();
|
||||||
|
private Set<String> disabledGroups = new HashSet<>();
|
||||||
|
private Map<String, Set<String>> endpointAlternatives = new ConcurrentHashMap<>();
|
||||||
private final boolean runningProOrHigher;
|
private final boolean runningProOrHigher;
|
||||||
|
|
||||||
public EndpointConfiguration(
|
public EndpointConfiguration(
|
||||||
@ -51,16 +53,36 @@ public class EndpointConfiguration {
|
|||||||
if (endpoint.startsWith("/")) {
|
if (endpoint.startsWith("/")) {
|
||||||
endpoint = endpoint.substring(1);
|
endpoint = endpoint.substring(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if endpoint has alternatives (multiple tools can handle it)
|
||||||
|
Set<String> alternatives = endpointAlternatives.get(endpoint);
|
||||||
|
if (alternatives != null && !alternatives.isEmpty()) {
|
||||||
|
// Endpoint is enabled if ANY of its alternative tools are enabled
|
||||||
|
for (String toolGroup : alternatives) {
|
||||||
|
if (isGroupEnabled(toolGroup)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false; // All alternative tools are disabled
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to standard endpoint status check
|
||||||
return endpointStatuses.getOrDefault(endpoint, true);
|
return endpointStatuses.getOrDefault(endpoint, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isGroupEnabled(String group) {
|
public boolean isGroupEnabled(String group) {
|
||||||
|
// Check if group is explicitly disabled first
|
||||||
|
if (disabledGroups.contains(group)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
Set<String> endpoints = endpointGroups.get(group);
|
Set<String> endpoints = endpointGroups.get(group);
|
||||||
if (endpoints == null || endpoints.isEmpty()) {
|
if (endpoints == null || endpoints.isEmpty()) {
|
||||||
log.debug("Group '{}' does not exist or has no endpoints", group);
|
log.debug("Group '{}' does not exist or has no endpoints", group);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Additional check: if all endpoints in group are disabled, consider group disabled
|
||||||
for (String endpoint : endpoints) {
|
for (String endpoint : endpoints) {
|
||||||
if (!isEndpointEnabled(endpoint)) {
|
if (!isEndpointEnabled(endpoint)) {
|
||||||
return false;
|
return false;
|
||||||
@ -73,8 +95,23 @@ public class EndpointConfiguration {
|
|||||||
public void addEndpointToGroup(String group, String endpoint) {
|
public void addEndpointToGroup(String group, String endpoint) {
|
||||||
endpointGroups.computeIfAbsent(group, k -> new HashSet<>()).add(endpoint);
|
endpointGroups.computeIfAbsent(group, k -> new HashSet<>()).add(endpoint);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void addEndpointAlternative(String endpoint, String toolGroup) {
|
||||||
|
endpointAlternatives.computeIfAbsent(endpoint, k -> new HashSet<>()).add(toolGroup);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void disableGroup(String group) {
|
||||||
|
disabledGroups.add(group);
|
||||||
|
Set<String> endpoints = endpointGroups.get(group);
|
||||||
|
if (endpoints != null) {
|
||||||
|
for (String endpoint : endpoints) {
|
||||||
|
disableEndpoint(endpoint);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void enableGroup(String group) {
|
public void enableGroup(String group) {
|
||||||
|
disabledGroups.remove(group);
|
||||||
Set<String> endpoints = endpointGroups.get(group);
|
Set<String> endpoints = endpointGroups.get(group);
|
||||||
if (endpoints != null) {
|
if (endpoints != null) {
|
||||||
for (String endpoint : endpoints) {
|
for (String endpoint : endpoints) {
|
||||||
@ -83,13 +120,8 @@ public class EndpointConfiguration {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void disableGroup(String group) {
|
public Set<String> getDisabledGroups() {
|
||||||
Set<String> endpoints = endpointGroups.get(group);
|
return new HashSet<>(disabledGroups);
|
||||||
if (endpoints != null) {
|
|
||||||
for (String endpoint : endpoints) {
|
|
||||||
disableEndpoint(endpoint);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void logDisabledEndpointsSummary() {
|
public void logDisabledEndpointsSummary() {
|
||||||
@ -101,6 +133,12 @@ public class EndpointConfiguration {
|
|||||||
.sorted()
|
.sorted()
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
|
if (!disabledGroups.isEmpty()) {
|
||||||
|
log.info(
|
||||||
|
"Disabled groups: {}",
|
||||||
|
String.join(", ", disabledGroups.stream().sorted().toList()));
|
||||||
|
}
|
||||||
|
|
||||||
if (!disabledList.isEmpty()) {
|
if (!disabledList.isEmpty()) {
|
||||||
log.info(
|
log.info(
|
||||||
"Total disabled endpoints: {}. Disabled endpoints: {}",
|
"Total disabled endpoints: {}. Disabled endpoints: {}",
|
||||||
@ -212,7 +250,6 @@ public class EndpointConfiguration {
|
|||||||
// Unoconvert
|
// Unoconvert
|
||||||
addEndpointToGroup("Unoconvert", "file-to-pdf");
|
addEndpointToGroup("Unoconvert", "file-to-pdf");
|
||||||
|
|
||||||
addEndpointToGroup("tesseract", "ocr-pdf");
|
|
||||||
|
|
||||||
// Java
|
// Java
|
||||||
addEndpointToGroup("Java", "merge-pdfs");
|
addEndpointToGroup("Java", "merge-pdfs");
|
||||||
@ -261,8 +298,13 @@ public class EndpointConfiguration {
|
|||||||
addEndpointToGroup("Javascript", "compare");
|
addEndpointToGroup("Javascript", "compare");
|
||||||
addEndpointToGroup("Javascript", "adjust-contrast");
|
addEndpointToGroup("Javascript", "adjust-contrast");
|
||||||
|
|
||||||
// qpdf dependent endpoints
|
// Multi-tool endpoints - endpoints that can be handled by multiple tools
|
||||||
addEndpointToGroup("qpdf", "repair");
|
addEndpointAlternative("repair", "qpdf");
|
||||||
|
addEndpointAlternative("repair", "Ghostscript");
|
||||||
|
addEndpointAlternative("compress-pdf", "qpdf");
|
||||||
|
addEndpointAlternative("compress-pdf", "Ghostscript");
|
||||||
|
addEndpointAlternative("ocr-pdf", "tesseract");
|
||||||
|
addEndpointAlternative("ocr-pdf", "OCRmyPDF");
|
||||||
|
|
||||||
// Weasyprint dependent endpoints
|
// Weasyprint dependent endpoints
|
||||||
addEndpointToGroup("Weasyprint", "html-to-pdf");
|
addEndpointToGroup("Weasyprint", "html-to-pdf");
|
||||||
|
@ -34,6 +34,8 @@ public class ExternalAppDepConfig {
|
|||||||
new HashMap<>() {
|
new HashMap<>() {
|
||||||
|
|
||||||
{
|
{
|
||||||
|
put("gs", List.of("Ghostscript"));
|
||||||
|
put("ocrmypdf", List.of("OCRmyPDF"));
|
||||||
put("soffice", List.of("LibreOffice"));
|
put("soffice", List.of("LibreOffice"));
|
||||||
put(weasyprintPath, List.of("Weasyprint"));
|
put(weasyprintPath, List.of("Weasyprint"));
|
||||||
put("pdftohtml", List.of("Pdftohtml"));
|
put("pdftohtml", List.of("Pdftohtml"));
|
||||||
@ -109,6 +111,8 @@ public class ExternalAppDepConfig {
|
|||||||
@PostConstruct
|
@PostConstruct
|
||||||
public void checkDependencies() {
|
public void checkDependencies() {
|
||||||
// Check core dependencies
|
// Check core dependencies
|
||||||
|
checkDependencyAndDisableGroup("gs");
|
||||||
|
checkDependencyAndDisableGroup("ocrmypdf");
|
||||||
checkDependencyAndDisableGroup("tesseract");
|
checkDependencyAndDisableGroup("tesseract");
|
||||||
checkDependencyAndDisableGroup("soffice");
|
checkDependencyAndDisableGroup("soffice");
|
||||||
checkDependencyAndDisableGroup("qpdf");
|
checkDependencyAndDisableGroup("qpdf");
|
||||||
|
@ -65,12 +65,14 @@ public class CompressController {
|
|||||||
|
|
||||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||||
private final boolean qpdfEnabled;
|
private final boolean qpdfEnabled;
|
||||||
|
private final boolean ghostscriptEnabled;
|
||||||
|
|
||||||
public CompressController(
|
public CompressController(
|
||||||
CustomPDFDocumentFactory pdfDocumentFactory,
|
CustomPDFDocumentFactory pdfDocumentFactory,
|
||||||
EndpointConfiguration endpointConfiguration) {
|
EndpointConfiguration endpointConfiguration) {
|
||||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||||
this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
|
this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
|
||||||
|
this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@ -697,25 +699,69 @@ public class CompressController {
|
|||||||
|
|
||||||
boolean sizeMet = false;
|
boolean sizeMet = false;
|
||||||
boolean imageCompressionApplied = false;
|
boolean imageCompressionApplied = false;
|
||||||
boolean qpdfCompressionApplied = false;
|
boolean externalCompressionApplied = false;
|
||||||
|
|
||||||
if (qpdfEnabled && optimizeLevel <= 3) {
|
|
||||||
optimizeLevel = 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (!sizeMet && optimizeLevel <= 9) {
|
while (!sizeMet && optimizeLevel <= 9) {
|
||||||
// Apply image compression for levels 4-9
|
// Apply external compression first
|
||||||
if ((optimizeLevel >= 3 || Boolean.TRUE.equals(convertToGrayscale))
|
if (!externalCompressionApplied) {
|
||||||
&& !imageCompressionApplied) {
|
boolean ghostscriptSuccess = false;
|
||||||
double scaleFactor = getScaleFactorForLevel(optimizeLevel);
|
|
||||||
float jpegQuality = getJpegQualityForLevel(optimizeLevel);
|
|
||||||
|
|
||||||
// Compress images
|
// Try Ghostscript first if available - for ANY compression level
|
||||||
|
if (ghostscriptEnabled) {
|
||||||
|
try {
|
||||||
|
applyGhostscriptCompression(
|
||||||
|
request, optimizeLevel, currentFile, tempFiles);
|
||||||
|
log.info("Ghostscript compression applied successfully");
|
||||||
|
ghostscriptSuccess = true;
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.warn("Ghostscript compression failed, trying fallback methods");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to QPDF if Ghostscript failed or not available (levels 1-3 only)
|
||||||
|
if (!ghostscriptSuccess && qpdfEnabled && optimizeLevel <= 3) {
|
||||||
|
try {
|
||||||
|
applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
|
||||||
|
log.info("QPDF compression applied successfully");
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.warn("QPDF compression also failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ghostscriptSuccess && !qpdfEnabled) {
|
||||||
|
log.info(
|
||||||
|
"No external compression tools available, using image compression only");
|
||||||
|
}
|
||||||
|
|
||||||
|
externalCompressionApplied = true;
|
||||||
|
|
||||||
|
// Skip image compression if Ghostscript succeeded
|
||||||
|
if (ghostscriptSuccess) {
|
||||||
|
imageCompressionApplied = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply image compression for levels 4+ only if Ghostscript didn't run
|
||||||
|
if ((optimizeLevel >= 4 || Boolean.TRUE.equals(convertToGrayscale))
|
||||||
|
&& !imageCompressionApplied) {
|
||||||
|
// Use different scale factors based on level
|
||||||
|
double scaleFactor =
|
||||||
|
switch (optimizeLevel) {
|
||||||
|
case 4 -> 0.95; // 95% of original size
|
||||||
|
case 5 -> 0.9; // 90% of original size
|
||||||
|
case 6 -> 0.8; // 80% of original size
|
||||||
|
case 7 -> 0.7; // 70% of original size
|
||||||
|
case 8 -> 0.65; // 65% of original size
|
||||||
|
case 9 -> 0.5; // 50% of original size
|
||||||
|
default -> 1.0;
|
||||||
|
};
|
||||||
|
|
||||||
|
log.info("Applying image compression with scale factor: {}", scaleFactor);
|
||||||
Path compressedImageFile =
|
Path compressedImageFile =
|
||||||
compressImagesInPDF(
|
compressImagesInPDF(
|
||||||
currentFile,
|
currentFile,
|
||||||
scaleFactor,
|
scaleFactor,
|
||||||
jpegQuality,
|
0.7f, // Default JPEG quality
|
||||||
Boolean.TRUE.equals(convertToGrayscale));
|
Boolean.TRUE.equals(convertToGrayscale));
|
||||||
|
|
||||||
tempFiles.add(compressedImageFile);
|
tempFiles.add(compressedImageFile);
|
||||||
@ -723,18 +769,6 @@ public class CompressController {
|
|||||||
imageCompressionApplied = true;
|
imageCompressionApplied = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply QPDF compression for all levels
|
|
||||||
if (!qpdfCompressionApplied && qpdfEnabled) {
|
|
||||||
applyQpdfCompression(request, optimizeLevel, currentFile, tempFiles);
|
|
||||||
qpdfCompressionApplied = true;
|
|
||||||
} else if (!qpdfCompressionApplied) {
|
|
||||||
// If QPDF is disabled, mark as applied and log
|
|
||||||
if (!qpdfEnabled) {
|
|
||||||
log.info("Skipping QPDF compression as QPDF group is disabled");
|
|
||||||
}
|
|
||||||
qpdfCompressionApplied = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if target size reached or not in auto mode
|
// Check if target size reached or not in auto mode
|
||||||
long outputFileSize = Files.size(currentFile);
|
long outputFileSize = Files.size(currentFile);
|
||||||
if (outputFileSize <= expectedOutputSize || !autoMode) {
|
if (outputFileSize <= expectedOutputSize || !autoMode) {
|
||||||
@ -754,7 +788,7 @@ public class CompressController {
|
|||||||
} else {
|
} else {
|
||||||
// Reset flags for next iteration with higher optimization level
|
// Reset flags for next iteration with higher optimization level
|
||||||
imageCompressionApplied = false;
|
imageCompressionApplied = false;
|
||||||
qpdfCompressionApplied = false;
|
externalCompressionApplied = false;
|
||||||
optimizeLevel = newOptimizeLevel;
|
optimizeLevel = newOptimizeLevel;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -788,6 +822,96 @@ public class CompressController {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Run Ghostscript compression
|
||||||
|
private void applyGhostscriptCompression(
|
||||||
|
OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
long preGsSize = Files.size(currentFile);
|
||||||
|
log.info("Pre-Ghostscript file size: {}", GeneralUtils.formatBytes(preGsSize));
|
||||||
|
|
||||||
|
// Create output file for Ghostscript
|
||||||
|
Path gsOutputFile = Files.createTempFile("gs_output_", ".pdf");
|
||||||
|
tempFiles.add(gsOutputFile);
|
||||||
|
|
||||||
|
// Build Ghostscript command based on optimization level
|
||||||
|
List<String> command = new ArrayList<>();
|
||||||
|
command.add("gs");
|
||||||
|
command.add("-sDEVICE=pdfwrite");
|
||||||
|
command.add("-dCompatibilityLevel=1.5");
|
||||||
|
command.add("-dNOPAUSE");
|
||||||
|
command.add("-dQUIET");
|
||||||
|
command.add("-dBATCH");
|
||||||
|
|
||||||
|
// Map optimization levels to Ghostscript settings
|
||||||
|
switch (optimizeLevel) {
|
||||||
|
case 1:
|
||||||
|
command.add("-dPDFSETTINGS=/prepress");
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
command.add("-dPDFSETTINGS=/printer");
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
command.add("-dPDFSETTINGS=/ebook");
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
case 5:
|
||||||
|
command.add("-dPDFSETTINGS=/screen");
|
||||||
|
break;
|
||||||
|
case 6:
|
||||||
|
case 7:
|
||||||
|
command.add("-dPDFSETTINGS=/screen");
|
||||||
|
command.add("-dColorImageResolution=150");
|
||||||
|
command.add("-dGrayImageResolution=150");
|
||||||
|
command.add("-dMonoImageResolution=300");
|
||||||
|
break;
|
||||||
|
case 8:
|
||||||
|
case 9:
|
||||||
|
command.add("-dPDFSETTINGS=/screen");
|
||||||
|
command.add("-dColorImageResolution=100");
|
||||||
|
command.add("-dGrayImageResolution=100");
|
||||||
|
command.add("-dMonoImageResolution=200");
|
||||||
|
break;
|
||||||
|
case 10:
|
||||||
|
command.add("-dPDFSETTINGS=/screen");
|
||||||
|
command.add("-dColorImageResolution=72");
|
||||||
|
command.add("-dGrayImageResolution=72");
|
||||||
|
command.add("-dMonoImageResolution=150");
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
command.add("-dPDFSETTINGS=/screen");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
command.add("-sOutputFile=" + gsOutputFile.toString());
|
||||||
|
command.add(currentFile.toString());
|
||||||
|
|
||||||
|
ProcessExecutorResult returnCode = null;
|
||||||
|
try {
|
||||||
|
returnCode =
|
||||||
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||||
|
.runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
|
if (returnCode.getRc() == 0) {
|
||||||
|
// Update current file to the Ghostscript output
|
||||||
|
Files.copy(gsOutputFile, currentFile, StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
|
||||||
|
long postGsSize = Files.size(currentFile);
|
||||||
|
double gsReduction = 100.0 - ((postGsSize * 100.0) / preGsSize);
|
||||||
|
log.info(
|
||||||
|
"Post-Ghostscript file size: {} (reduced by {}%)",
|
||||||
|
GeneralUtils.formatBytes(postGsSize), String.format("%.1f", gsReduction));
|
||||||
|
} else {
|
||||||
|
log.warn("Ghostscript compression failed with return code: {}", returnCode.getRc());
|
||||||
|
throw new IOException("Ghostscript compression failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("Ghostscript compression failed, will fallback to other methods", e);
|
||||||
|
throw new IOException("Ghostscript compression failed", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Run QPDF compression
|
// Run QPDF compression
|
||||||
private void applyQpdfCompression(
|
private void applyQpdfCompression(
|
||||||
OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
|
OptimizePdfRequest request, int optimizeLevel, Path currentFile, List<Path> tempFiles)
|
||||||
|
@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.misc;
|
|||||||
|
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.zip.ZipEntry;
|
import java.util.zip.ZipEntry;
|
||||||
@ -26,26 +27,42 @@ import io.github.pixee.security.Filenames;
|
|||||||
import io.swagger.v3.oas.annotations.Operation;
|
import io.swagger.v3.oas.annotations.Operation;
|
||||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
import stirling.software.SPDF.config.EndpointConfiguration;
|
||||||
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
|
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
|
||||||
import stirling.software.common.model.ApplicationProperties;
|
import stirling.software.common.model.ApplicationProperties;
|
||||||
import stirling.software.common.service.CustomPDFDocumentFactory;
|
import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||||
import stirling.software.common.util.ProcessExecutor;
|
import stirling.software.common.util.ProcessExecutor;
|
||||||
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
|
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
|
||||||
|
import stirling.software.common.util.TempDirectory;
|
||||||
|
import stirling.software.common.util.TempFile;
|
||||||
import stirling.software.common.util.TempFileManager;
|
import stirling.software.common.util.TempFileManager;
|
||||||
|
import stirling.software.common.util.WebResponseUtils;
|
||||||
|
|
||||||
@RestController
|
@RestController
|
||||||
@RequestMapping("/api/v1/misc")
|
@RequestMapping("/api/v1/misc")
|
||||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class OCRController {
|
public class OCRController {
|
||||||
|
|
||||||
private final ApplicationProperties applicationProperties;
|
private final ApplicationProperties applicationProperties;
|
||||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||||
private final TempFileManager tempFileManager;
|
private final TempFileManager tempFileManager;
|
||||||
|
private final boolean ocrMyPdfEnabled;
|
||||||
|
private final boolean tesseractEnabled;
|
||||||
|
|
||||||
|
public OCRController(
|
||||||
|
ApplicationProperties applicationProperties,
|
||||||
|
CustomPDFDocumentFactory pdfDocumentFactory,
|
||||||
|
TempFileManager tempFileManager,
|
||||||
|
EndpointConfiguration endpointConfiguration) {
|
||||||
|
this.applicationProperties = applicationProperties;
|
||||||
|
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||||
|
this.tempFileManager = tempFileManager;
|
||||||
|
this.ocrMyPdfEnabled = endpointConfiguration.isGroupEnabled("OCRmyPDF");
|
||||||
|
this.tesseractEnabled = endpointConfiguration.isGroupEnabled("tesseract");
|
||||||
|
}
|
||||||
|
|
||||||
/** Gets the list of available Tesseract languages from the tessdata directory */
|
/** Gets the list of available Tesseract languages from the tessdata directory */
|
||||||
public List<String> getAvailableTesseractLanguages() {
|
public List<String> getAvailableTesseractLanguages() {
|
||||||
@ -63,39 +80,261 @@ public class OCRController {
|
|||||||
|
|
||||||
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
|
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
|
||||||
@Operation(
|
@Operation(
|
||||||
summary = "Process PDF files with OCR using Tesseract",
|
summary = "Process a PDF file with OCR",
|
||||||
description =
|
description =
|
||||||
"Takes a PDF file as input, performs OCR using specified languages and OCR type"
|
"This endpoint processes a PDF file using OCR (Optical Character Recognition). "
|
||||||
+ " (skip-text/force-ocr), and returns the processed PDF. Input:PDF"
|
+ "Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. "
|
||||||
+ " Output:PDF Type:SISO")
|
+ "Uses OCRmyPDF if available, falls back to Tesseract. Input:PDF Output:PDF Type:SI-Conditional")
|
||||||
public ResponseEntity<byte[]> processPdfWithOCR(
|
public ResponseEntity<byte[]> processPdfWithOCR(
|
||||||
@ModelAttribute ProcessPdfWithOcrRequest request)
|
@ModelAttribute ProcessPdfWithOcrRequest request)
|
||||||
throws IOException, InterruptedException {
|
throws IOException, InterruptedException {
|
||||||
MultipartFile inputFile = request.getFileInput();
|
MultipartFile inputFile = request.getFileInput();
|
||||||
List<String> languages = request.getLanguages();
|
List<String> selectedLanguages = request.getLanguages();
|
||||||
|
Boolean sidecar = request.isSidecar();
|
||||||
|
Boolean deskew = request.isDeskew();
|
||||||
|
Boolean clean = request.isClean();
|
||||||
|
Boolean cleanFinal = request.isCleanFinal();
|
||||||
String ocrType = request.getOcrType();
|
String ocrType = request.getOcrType();
|
||||||
|
String ocrRenderType = request.getOcrRenderType();
|
||||||
|
Boolean removeImagesAfter = request.isRemoveImagesAfter();
|
||||||
|
|
||||||
// Create a temp directory using TempFileManager directly
|
if (selectedLanguages == null || selectedLanguages.isEmpty()) {
|
||||||
Path tempDirPath = tempFileManager.createTempDirectory();
|
throw new IOException("Please select at least one language.");
|
||||||
File tempDir = tempDirPath.toFile();
|
}
|
||||||
|
|
||||||
try {
|
if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) {
|
||||||
File tempInputFile = new File(tempDir, "input.pdf");
|
throw new IOException("ocrRenderType wrong");
|
||||||
File tempOutputDir = new File(tempDir, "output");
|
}
|
||||||
File tempImagesDir = new File(tempDir, "images");
|
|
||||||
File finalOutputFile = new File(tempDir, "final_output.pdf");
|
// Get available Tesseract languages
|
||||||
|
List<String> availableLanguages = getAvailableTesseractLanguages();
|
||||||
|
|
||||||
|
// Validate selected languages
|
||||||
|
selectedLanguages =
|
||||||
|
selectedLanguages.stream().filter(availableLanguages::contains).toList();
|
||||||
|
|
||||||
|
if (selectedLanguages.isEmpty()) {
|
||||||
|
throw new IOException("None of the selected languages are valid.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use try-with-resources for proper temp file management
|
||||||
|
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
|
||||||
|
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
|
||||||
|
|
||||||
|
inputFile.transferTo(tempInputFile.getFile());
|
||||||
|
|
||||||
|
TempFile sidecarTextFile = null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Use OCRmyPDF if available (no fallback - error if it fails)
|
||||||
|
if (ocrMyPdfEnabled) {
|
||||||
|
if (sidecar != null && sidecar) {
|
||||||
|
sidecarTextFile = new TempFile(tempFileManager, ".txt");
|
||||||
|
}
|
||||||
|
|
||||||
|
processWithOcrMyPdf(
|
||||||
|
selectedLanguages,
|
||||||
|
sidecar,
|
||||||
|
deskew,
|
||||||
|
clean,
|
||||||
|
cleanFinal,
|
||||||
|
ocrType,
|
||||||
|
ocrRenderType,
|
||||||
|
removeImagesAfter,
|
||||||
|
tempInputFile.getPath(),
|
||||||
|
tempOutputFile.getPath(),
|
||||||
|
sidecarTextFile != null ? sidecarTextFile.getPath() : null);
|
||||||
|
log.info("OCRmyPDF processing completed successfully");
|
||||||
|
}
|
||||||
|
// Use Tesseract only if OCRmyPDF is not available
|
||||||
|
else if (tesseractEnabled) {
|
||||||
|
processWithTesseract(
|
||||||
|
selectedLanguages,
|
||||||
|
ocrType,
|
||||||
|
tempInputFile.getPath(),
|
||||||
|
tempOutputFile.getPath());
|
||||||
|
log.info("Tesseract processing completed successfully");
|
||||||
|
} else {
|
||||||
|
throw new IOException("No OCR tools are available");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read the processed PDF file
|
||||||
|
byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath());
|
||||||
|
|
||||||
|
// Return the OCR processed PDF as a response
|
||||||
|
String outputFilename =
|
||||||
|
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||||
|
.replaceFirst("[.][^.]+$", "")
|
||||||
|
+ "_OCR.pdf";
|
||||||
|
|
||||||
|
if (sidecar != null && sidecar && sidecarTextFile != null) {
|
||||||
|
// Create a zip file containing both the PDF and the text file
|
||||||
|
String outputZipFilename =
|
||||||
|
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||||
|
.replaceFirst("[.][^.]+$", "")
|
||||||
|
+ "_OCR.zip";
|
||||||
|
|
||||||
|
try (TempFile tempZipFile = new TempFile(tempFileManager, ".zip");
|
||||||
|
ZipOutputStream zipOut =
|
||||||
|
new ZipOutputStream(
|
||||||
|
Files.newOutputStream(tempZipFile.getPath()))) {
|
||||||
|
|
||||||
|
// Add PDF file to the zip
|
||||||
|
ZipEntry pdfEntry = new ZipEntry(outputFilename);
|
||||||
|
zipOut.putNextEntry(pdfEntry);
|
||||||
|
zipOut.write(pdfBytes);
|
||||||
|
zipOut.closeEntry();
|
||||||
|
|
||||||
|
// Add text file to the zip
|
||||||
|
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
|
||||||
|
zipOut.putNextEntry(txtEntry);
|
||||||
|
Files.copy(sidecarTextFile.getPath(), zipOut);
|
||||||
|
zipOut.closeEntry();
|
||||||
|
|
||||||
|
zipOut.finish();
|
||||||
|
|
||||||
|
byte[] zipBytes = Files.readAllBytes(tempZipFile.getPath());
|
||||||
|
|
||||||
|
// Return the zip file containing both the PDF and the text file
|
||||||
|
return WebResponseUtils.bytesToWebResponse(
|
||||||
|
zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Return the OCR processed PDF as a response
|
||||||
|
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
||||||
|
}
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
// Clean up sidecar temp file if created
|
||||||
|
if (sidecarTextFile != null) {
|
||||||
|
try {
|
||||||
|
sidecarTextFile.close();
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("Failed to close sidecar temp file", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processWithOcrMyPdf(
|
||||||
|
List<String> selectedLanguages,
|
||||||
|
Boolean sidecar,
|
||||||
|
Boolean deskew,
|
||||||
|
Boolean clean,
|
||||||
|
Boolean cleanFinal,
|
||||||
|
String ocrType,
|
||||||
|
String ocrRenderType,
|
||||||
|
Boolean removeImagesAfter,
|
||||||
|
Path tempInputFile,
|
||||||
|
Path tempOutputFile,
|
||||||
|
Path sidecarTextPath)
|
||||||
|
throws IOException, InterruptedException {
|
||||||
|
|
||||||
|
// Build OCRmyPDF command
|
||||||
|
String languageOption = String.join("+", selectedLanguages);
|
||||||
|
|
||||||
|
List<String> command =
|
||||||
|
new ArrayList<>(
|
||||||
|
Arrays.asList(
|
||||||
|
"ocrmypdf",
|
||||||
|
"--verbose",
|
||||||
|
"2",
|
||||||
|
"--output-type",
|
||||||
|
"pdf",
|
||||||
|
"--pdf-renderer",
|
||||||
|
ocrRenderType));
|
||||||
|
|
||||||
|
if (sidecar != null && sidecar && sidecarTextPath != null) {
|
||||||
|
command.add("--sidecar");
|
||||||
|
command.add(sidecarTextPath.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (deskew != null && deskew) {
|
||||||
|
command.add("--deskew");
|
||||||
|
}
|
||||||
|
if (clean != null && clean) {
|
||||||
|
command.add("--clean");
|
||||||
|
}
|
||||||
|
if (cleanFinal != null && cleanFinal) {
|
||||||
|
command.add("--clean-final");
|
||||||
|
}
|
||||||
|
if (ocrType != null && !"".equals(ocrType)) {
|
||||||
|
if ("skip-text".equals(ocrType)) {
|
||||||
|
command.add("--skip-text");
|
||||||
|
} else if ("force-ocr".equals(ocrType)) {
|
||||||
|
command.add("--force-ocr");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
command.addAll(
|
||||||
|
Arrays.asList(
|
||||||
|
"--language",
|
||||||
|
languageOption,
|
||||||
|
tempInputFile.toString(),
|
||||||
|
tempOutputFile.toString()));
|
||||||
|
|
||||||
|
// Run CLI command
|
||||||
|
ProcessExecutorResult result =
|
||||||
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||||
|
.runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
|
if (result.getRc() != 0
|
||||||
|
&& result.getMessages().contains("multiprocessing/synchronize.py")
|
||||||
|
&& result.getMessages().contains("OSError: [Errno 38] Function not implemented")) {
|
||||||
|
command.add("--jobs");
|
||||||
|
command.add("1");
|
||||||
|
result =
|
||||||
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||||
|
.runCommandWithOutputHandling(command);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result.getRc() != 0) {
|
||||||
|
throw new IOException("OCRmyPDF failed with return code: " + result.getRc());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove images from the OCR processed PDF if the flag is set to true
|
||||||
|
if (removeImagesAfter != null && removeImagesAfter) {
|
||||||
|
try (TempFile tempPdfWithoutImages = new TempFile(tempFileManager, "_no_images.pdf")) {
|
||||||
|
List<String> gsCommand =
|
||||||
|
Arrays.asList(
|
||||||
|
"gs",
|
||||||
|
"-sDEVICE=pdfwrite",
|
||||||
|
"-dFILTERIMAGE",
|
||||||
|
"-o",
|
||||||
|
tempPdfWithoutImages.getPath().toString(),
|
||||||
|
tempOutputFile.toString());
|
||||||
|
|
||||||
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||||
|
.runCommandWithOutputHandling(gsCommand);
|
||||||
|
|
||||||
|
// Replace output file with version without images
|
||||||
|
Files.copy(
|
||||||
|
tempPdfWithoutImages.getPath(),
|
||||||
|
tempOutputFile,
|
||||||
|
java.nio.file.StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processWithTesseract(
|
||||||
|
List<String> selectedLanguages, String ocrType, Path tempInputFile, Path tempOutputFile)
|
||||||
|
throws IOException, InterruptedException {
|
||||||
|
|
||||||
|
// Create temp directory for Tesseract processing
|
||||||
|
try (TempDirectory tempDir = new TempDirectory(tempFileManager)) {
|
||||||
|
File tempOutputDir = new File(tempDir.getPath().toFile(), "output");
|
||||||
|
File tempImagesDir = new File(tempDir.getPath().toFile(), "images");
|
||||||
|
File finalOutputFile = new File(tempDir.getPath().toFile(), "final_output.pdf");
|
||||||
|
|
||||||
// Create directories
|
// Create directories
|
||||||
tempOutputDir.mkdirs();
|
tempOutputDir.mkdirs();
|
||||||
tempImagesDir.mkdirs();
|
tempImagesDir.mkdirs();
|
||||||
|
|
||||||
// Save input file
|
|
||||||
inputFile.transferTo(tempInputFile);
|
|
||||||
|
|
||||||
PDFMergerUtility merger = new PDFMergerUtility();
|
PDFMergerUtility merger = new PDFMergerUtility();
|
||||||
merger.setDestinationFileName(finalOutputFile.toString());
|
merger.setDestinationFileName(finalOutputFile.toString());
|
||||||
|
|
||||||
try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) {
|
try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
|
||||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||||
int pageCount = document.getNumberOfPages();
|
int pageCount = document.getNumberOfPages();
|
||||||
|
|
||||||
@ -135,35 +374,20 @@ public class OCRController {
|
|||||||
new File(tempOutputDir, String.format("page_%d", pageNum))
|
new File(tempOutputDir, String.format("page_%d", pageNum))
|
||||||
.toString());
|
.toString());
|
||||||
command.add("-l");
|
command.add("-l");
|
||||||
command.add(String.join("+", languages));
|
command.add(String.join("+", selectedLanguages));
|
||||||
// Always output PDF
|
command.add("pdf"); // Always output PDF
|
||||||
command.add("pdf");
|
|
||||||
|
|
||||||
// Use ProcessExecutor to run tesseract command
|
ProcessExecutorResult result =
|
||||||
try {
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
|
||||||
ProcessExecutorResult result =
|
.runCommandWithOutputHandling(command);
|
||||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
|
|
||||||
.runCommandWithOutputHandling(command);
|
|
||||||
|
|
||||||
log.debug(
|
if (result.getRc() != 0) {
|
||||||
"Tesseract OCR completed for page {} with exit code {}",
|
throw new RuntimeException(
|
||||||
pageNum,
|
"Tesseract failed with exit code: " + result.getRc());
|
||||||
result.getRc());
|
|
||||||
|
|
||||||
// Add OCR'd PDF to merger
|
|
||||||
merger.addSource(pageOutputPath);
|
|
||||||
} catch (IOException | InterruptedException e) {
|
|
||||||
log.error(
|
|
||||||
"Error processing page {} with tesseract: {}",
|
|
||||||
pageNum,
|
|
||||||
e.getMessage());
|
|
||||||
// If OCR fails, fall back to the original page
|
|
||||||
try (PDDocument pageDoc = new PDDocument()) {
|
|
||||||
pageDoc.addPage(page);
|
|
||||||
pageDoc.save(pageOutputPath);
|
|
||||||
merger.addSource(pageOutputPath);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add OCR'd PDF to merger
|
||||||
|
merger.addSource(pageOutputPath);
|
||||||
} else {
|
} else {
|
||||||
// Save original page without OCR
|
// Save original page without OCR
|
||||||
try (PDDocument pageDoc = new PDDocument()) {
|
try (PDDocument pageDoc = new PDDocument()) {
|
||||||
@ -178,40 +402,11 @@ public class OCRController {
|
|||||||
// Merge all pages into final PDF
|
// Merge all pages into final PDF
|
||||||
merger.mergeDocuments(null);
|
merger.mergeDocuments(null);
|
||||||
|
|
||||||
// Read the final PDF file
|
// Copy final output to the expected location
|
||||||
byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath());
|
Files.copy(
|
||||||
String outputFilename =
|
finalOutputFile.toPath(),
|
||||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
tempOutputFile,
|
||||||
.replaceFirst("[.][^.]+$", "")
|
java.nio.file.StandardCopyOption.REPLACE_EXISTING);
|
||||||
+ "_OCR.pdf";
|
|
||||||
|
|
||||||
return ResponseEntity.ok()
|
|
||||||
.header(
|
|
||||||
"Content-Disposition",
|
|
||||||
"attachment; filename=\"" + outputFilename + "\"")
|
|
||||||
.contentType(MediaType.APPLICATION_PDF)
|
|
||||||
.body(pdfContent);
|
|
||||||
} finally {
|
|
||||||
// Clean up the temp directory and all its contents
|
|
||||||
tempFileManager.deleteTempDirectory(tempDirPath);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
|
|
||||||
throws IOException {
|
|
||||||
if (!file.exists()) {
|
|
||||||
log.warn("File {} does not exist, skipping", file);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
try (FileInputStream fis = new FileInputStream(file)) {
|
|
||||||
ZipEntry zipEntry = new ZipEntry(filename);
|
|
||||||
zipOut.putNextEntry(zipEntry);
|
|
||||||
byte[] buffer = new byte[1024];
|
|
||||||
int length;
|
|
||||||
while ((length = fis.read(buffer)) >= 0) {
|
|
||||||
zipOut.write(buffer, 0, length);
|
|
||||||
}
|
|
||||||
zipOut.closeEntry();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -15,8 +15,7 @@ import io.github.pixee.security.Filenames;
|
|||||||
import io.swagger.v3.oas.annotations.Operation;
|
import io.swagger.v3.oas.annotations.Operation;
|
||||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import stirling.software.SPDF.config.EndpointConfiguration;
|
||||||
|
|
||||||
import stirling.software.common.model.api.PDFFile;
|
import stirling.software.common.model.api.PDFFile;
|
||||||
import stirling.software.common.service.CustomPDFDocumentFactory;
|
import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||||
import stirling.software.common.util.ProcessExecutor;
|
import stirling.software.common.util.ProcessExecutor;
|
||||||
@ -28,17 +27,28 @@ import stirling.software.common.util.WebResponseUtils;
|
|||||||
@RestController
|
@RestController
|
||||||
@RequestMapping("/api/v1/misc")
|
@RequestMapping("/api/v1/misc")
|
||||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class RepairController {
|
public class RepairController {
|
||||||
|
|
||||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||||
private final TempFileManager tempFileManager;
|
private final TempFileManager tempFileManager;
|
||||||
|
private final boolean ghostscriptEnabled;
|
||||||
|
private final boolean qpdfEnabled;
|
||||||
|
|
||||||
|
public RepairController(
|
||||||
|
CustomPDFDocumentFactory pdfDocumentFactory,
|
||||||
|
TempFileManager tempFileManager,
|
||||||
|
EndpointConfiguration endpointConfiguration) {
|
||||||
|
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||||
|
this.tempFileManager = tempFileManager;
|
||||||
|
this.ghostscriptEnabled = endpointConfiguration.isGroupEnabled("Ghostscript");
|
||||||
|
this.qpdfEnabled = endpointConfiguration.isGroupEnabled("qpdf");
|
||||||
|
}
|
||||||
|
|
||||||
@PostMapping(consumes = "multipart/form-data", value = "/repair")
|
@PostMapping(consumes = "multipart/form-data", value = "/repair")
|
||||||
@Operation(
|
@Operation(
|
||||||
summary = "Repair a PDF file",
|
summary = "Repair a PDF file",
|
||||||
description =
|
description =
|
||||||
"This endpoint repairs a given PDF file by running qpdf command. The PDF is"
|
"This endpoint repairs a given PDF file by running Ghostscript (primary), qpdf (fallback), or PDFBox (if no external tools available). The PDF is"
|
||||||
+ " first saved to a temporary location, repaired, read back, and then"
|
+ " first saved to a temporary location, repaired, read back, and then"
|
||||||
+ " returned as a response. Input:PDF Output:PDF Type:SISO")
|
+ " returned as a response. Input:PDF Output:PDF Type:SISO")
|
||||||
public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile file)
|
public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile file)
|
||||||
@ -46,25 +56,72 @@ public class RepairController {
|
|||||||
MultipartFile inputFile = file.getFileInput();
|
MultipartFile inputFile = file.getFileInput();
|
||||||
|
|
||||||
// Use TempFile with try-with-resources for automatic cleanup
|
// Use TempFile with try-with-resources for automatic cleanup
|
||||||
try (TempFile tempFile = new TempFile(tempFileManager, ".pdf")) {
|
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
|
||||||
|
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
|
||||||
|
|
||||||
// Save the uploaded file to the temporary location
|
// Save the uploaded file to the temporary location
|
||||||
inputFile.transferTo(tempFile.getFile());
|
inputFile.transferTo(tempInputFile.getFile());
|
||||||
|
|
||||||
List<String> command = new ArrayList<>();
|
boolean repairSuccess = false;
|
||||||
command.add("qpdf");
|
|
||||||
command.add("--replace-input"); // Automatically fixes problems it can
|
|
||||||
command.add("--qdf"); // Linearizes and normalizes PDF structure
|
|
||||||
command.add("--object-streams=disable"); // Can help with some corruptions
|
|
||||||
command.add(tempFile.getFile().getAbsolutePath());
|
|
||||||
|
|
||||||
ProcessExecutorResult returnCode =
|
// Try Ghostscript first if available
|
||||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
|
if (ghostscriptEnabled) {
|
||||||
.runCommandWithOutputHandling(command);
|
try {
|
||||||
|
List<String> gsCommand = new ArrayList<>();
|
||||||
|
gsCommand.add("gs");
|
||||||
|
gsCommand.add("-o");
|
||||||
|
gsCommand.add(tempOutputFile.getPath().toString());
|
||||||
|
gsCommand.add("-sDEVICE=pdfwrite");
|
||||||
|
gsCommand.add(tempInputFile.getPath().toString());
|
||||||
|
|
||||||
// Read the optimized PDF file
|
ProcessExecutorResult gsResult =
|
||||||
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempFile.getFile());
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||||
|
.runCommandWithOutputHandling(gsCommand);
|
||||||
|
|
||||||
// Return the optimized PDF as a response
|
if (gsResult.getRc() == 0) {
|
||||||
|
repairSuccess = true;
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
// Log and continue to QPDF fallback
|
||||||
|
System.out.println(
|
||||||
|
"Ghostscript repair failed, trying QPDF fallback: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to QPDF if Ghostscript failed or not available
|
||||||
|
if (!repairSuccess && qpdfEnabled) {
|
||||||
|
List<String> qpdfCommand = new ArrayList<>();
|
||||||
|
qpdfCommand.add("qpdf");
|
||||||
|
qpdfCommand.add("--replace-input"); // Automatically fixes problems it can
|
||||||
|
qpdfCommand.add("--qdf"); // Linearizes and normalizes PDF structure
|
||||||
|
qpdfCommand.add("--object-streams=disable"); // Can help with some corruptions
|
||||||
|
qpdfCommand.add(tempInputFile.getPath().toString());
|
||||||
|
qpdfCommand.add(tempOutputFile.getPath().toString());
|
||||||
|
|
||||||
|
ProcessExecutorResult qpdfResult =
|
||||||
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
|
||||||
|
.runCommandWithOutputHandling(qpdfCommand);
|
||||||
|
|
||||||
|
repairSuccess = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use PDFBox as last resort if no external tools are available
|
||||||
|
if (!repairSuccess) {
|
||||||
|
if (!ghostscriptEnabled && !qpdfEnabled) {
|
||||||
|
// Basic PDFBox repair - load and save to fix structural issues
|
||||||
|
try (var document = pdfDocumentFactory.load(tempInputFile.getFile())) {
|
||||||
|
document.save(tempOutputFile.getFile());
|
||||||
|
repairSuccess = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
throw new IOException("PDF repair failed with available tools");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read the repaired PDF file
|
||||||
|
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.getFile());
|
||||||
|
|
||||||
|
// Return the repaired PDF as a response
|
||||||
String outputFilename =
|
String outputFilename =
|
||||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||||
.replaceFirst("[.][^.]+$", "")
|
.replaceFirst("[.][^.]+$", "")
|
||||||
|
@ -19,6 +19,18 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
|
|||||||
defaultValue = "[\"eng\"]")
|
defaultValue = "[\"eng\"]")
|
||||||
private List<String> languages;
|
private List<String> languages;
|
||||||
|
|
||||||
|
@Schema(description = "Include OCR text in a sidecar text file if set to true")
|
||||||
|
private boolean sidecar;
|
||||||
|
|
||||||
|
@Schema(description = "Deskew the input file if set to true")
|
||||||
|
private boolean deskew;
|
||||||
|
|
||||||
|
@Schema(description = "Clean the input file if set to true")
|
||||||
|
private boolean clean;
|
||||||
|
|
||||||
|
@Schema(description = "Clean the final output if set to true")
|
||||||
|
private boolean cleanFinal;
|
||||||
|
|
||||||
@Schema(
|
@Schema(
|
||||||
description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'",
|
description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'",
|
||||||
requiredMode = Schema.RequiredMode.REQUIRED,
|
requiredMode = Schema.RequiredMode.REQUIRED,
|
||||||
@ -31,4 +43,7 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
|
|||||||
allowableValues = {"hocr", "sandwich"},
|
allowableValues = {"hocr", "sandwich"},
|
||||||
defaultValue = "hocr")
|
defaultValue = "hocr")
|
||||||
private String ocrRenderType = "hocr";
|
private String ocrRenderType = "hocr";
|
||||||
|
|
||||||
|
@Schema(description = "Remove images from the output PDF if set to true")
|
||||||
|
private boolean removeImagesAfter;
|
||||||
}
|
}
|
||||||
|
@ -79,6 +79,30 @@
|
|||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
<br>
|
<br>
|
||||||
|
<div class="mb-3" th:if>
|
||||||
|
<label class="form-label">OCR Options</label>
|
||||||
|
<div class="form-check">
|
||||||
|
<input type="checkbox" class="form-check-input" id="sidecar" name="sidecar" value="true">
|
||||||
|
<label class="form-check-label" for="sidecar">Include OCR text in sidecar text file</label>
|
||||||
|
</div>
|
||||||
|
<div class="form-check">
|
||||||
|
<input type="checkbox" class="form-check-input" id="deskew" name="deskew" value="true">
|
||||||
|
<label class="form-check-label" for="deskew">Deskew input file</label>
|
||||||
|
</div>
|
||||||
|
<div class="form-check">
|
||||||
|
<input type="checkbox" class="form-check-input" id="clean" name="clean" value="true">
|
||||||
|
<label class="form-check-label" for="clean">Clean input file</label>
|
||||||
|
</div>
|
||||||
|
<div class="form-check">
|
||||||
|
<input type="checkbox" class="form-check-input" id="cleanFinal" name="cleanFinal" value="true">
|
||||||
|
<label class="form-check-label" for="cleanFinal">Clean final output</label>
|
||||||
|
</div>
|
||||||
|
<div class="form-check">
|
||||||
|
<input type="checkbox" class="form-check-input" id="removeImagesAfter" name="removeImagesAfter" value="true">
|
||||||
|
<label class="form-check-label" for="removeImagesAfter">Remove images from output PDF</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<br>
|
||||||
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{ocr.submit}"></button>
|
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{ocr.submit}"></button>
|
||||||
</form>
|
</form>
|
||||||
<script th:inline="javascript">
|
<script th:inline="javascript">
|
||||||
|
Loading…
x
Reference in New Issue
Block a user