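From 14a4bdfb1b06219d48826484a5ffeff566a13682 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Sat, 5 Jul 2025 17:33:25 +0100 Subject: [PATCH] Exception cleanup, resource change and OCR Defaults (#3876) # Description of Changes - Docker images: Dockerfile, Dockerfile.fat and Dockerfile.dev now install the URW Base 35 fonts for better PDF rendering; the Alpine-based images also link the 69-urw-*.conf fontconfig files into /etc/fonts/conf.d/, and Dockerfile.dev now runs fc-cache. Dockerfile.ultra-lite only loses its trailing newline. - CustomPDFDocumentFactory: after loading a document, the resource cache is disabled when the content is larger than LARGE_FILE_THRESHOLD or when free memory is below MIN_FREE_MEMORY_PERCENTAGE, to prevent OOM errors. - PdfErrorUtils: the message "End-of-File, expected line" is now treated as a corrupted-PDF error. - SplitPDFController: the "Splitting PDF into pages" message is logged at debug instead of info. - ToSinglePageController: a running page index replaces the per-page getPages().indexOf(page) lookups. - FakeScanController: the render resolution is reduced (never below 72 dpi) when a page would exceed 8192x8192 or 16,777,216 pixels, to prevent OutOfMemoryError. - CertSignController: a blank certType is now rejected, not only a null one. - fileInput.js: client-side decryption of encrypted PDFs is skipped on pages whose path contains 'remove-password'. - ocr-pdf.html: one-line change to the OCR form defaults. - TempFileCleanupService, ExceptionUtils, ExtractImagesController and PipelineProcessor: formatting-only changes. Closes #(issue_number) --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details. 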
--------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: a --- Dockerfile | 6 ++- Dockerfile.dev | 3 +- Dockerfile.fat | 4 +- Dockerfile.ultra-lite | 2 +- .../service/CustomPDFDocumentFactory.java | 26 ++++++++++ .../service/TempFileCleanupService.java | 16 ++++--- .../software/common/util/ExceptionUtils.java | 3 +- .../software/common/util/PdfErrorUtils.java | 3 +- .../controller/api/SplitPDFController.java | 2 +- .../api/ToSinglePageController.java | 8 ++-- .../api/misc/ExtractImagesController.java | 11 +++-- .../api/misc/FakeScanController.java | 47 ++++++++++++++++++- .../api/pipeline/PipelineProcessor.java | 40 ++++++++++------ .../api/security/CertSignController.java | 3 +- .../src/main/resources/static/js/fileInput.js | 3 +- .../resources/templates/misc/ocr-pdf.html | 2 +- 16 files changed, 137 insertions(+), 42 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1edf05841..84dddd1dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -75,7 +75,9 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a ocrmypdf \ py3-pip \ py3-pillow@testing \ - py3-pdf2image@testing && \ + py3-pdf2image@testing \ + # URW Base 35 fonts for better PDF rendering + font-urw-base35 && \ python3 -m venv /opt/venv && \ /opt/venv/bin/pip install --upgrade pip setuptools && \ /opt/venv/bin/pip install --no-cache-dir --upgrade unoserver weasyprint && \ @@ -84,6 +86,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a ln -s /usr/lib/libreoffice/program /opt/venv/lib/python3.12/site-packages/LibreOffice && \ mv /usr/share/tessdata /usr/share/tessdata-original && \ mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders /tmp/stirling-pdf && \ + # Configure URW Base 35 fonts + ln -s /usr/share/fontconfig/conf.avail/69-urw-*.conf /etc/fonts/conf.d/ && \ fc-cache -f -v && \ chmod +x /scripts/* && \ chmod +x /scripts/init.sh && \ diff --git a/Dockerfile.dev 
b/Dockerfile.dev index 15de277b9..78460115f 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -19,7 +19,7 @@ RUN apt-get update && apt-get install -y \ # settings.yml | tessdataDir: /usr/share/tesseract-ocr/5/tessdata tesseract-ocr \ tesseract-ocr-eng \ - fonts-terminus fonts-dejavu fonts-font-awesome fonts-noto fonts-noto-core fonts-noto-cjk fonts-noto-extra fonts-liberation fonts-linuxlibertine \ + fonts-terminus fonts-dejavu fonts-font-awesome fonts-noto fonts-noto-core fonts-noto-cjk fonts-noto-extra fonts-liberation fonts-linuxlibertine fonts-urw-base35 \ python3-uno \ python3-venv \ # ss -tln @@ -45,6 +45,7 @@ ENV PATH="/opt/venv/bin:$PATH" COPY . /workspace RUN mkdir -p /tmp/stirling-pdf \ + && fc-cache -f -v \ && adduser --disabled-password --gecos '' devuser \ && chown -R devuser:devuser /home/devuser /workspace /tmp/stirling-pdf RUN echo "devuser ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/devuser \ diff --git a/Dockerfile.fat b/Dockerfile.fat index 976c1ee17..4053cd97f 100644 --- a/Dockerfile.fat +++ b/Dockerfile.fat @@ -82,7 +82,7 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a tesseract-ocr-data-fra \ tesseract-ocr-data-por \ unpaper \ - font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \ + font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine font-urw-base35 \ # CV py3-opencv \ python3 \ @@ -98,6 +98,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a ln -s /usr/lib/libreoffice/program /opt/venv/lib/python3.12/site-packages/LibreOffice && \ mv /usr/share/tessdata /usr/share/tessdata-original && \ mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders /tmp/stirling-pdf && \ + # Configure URW Base 35 fonts + ln -s /usr/share/fontconfig/conf.avail/69-urw-*.conf /etc/fonts/conf.d/ && \ fc-cache -f -v && \ chmod +x 
/scripts/* && \ chmod +x /scripts/init.sh && \ diff --git a/Dockerfile.ultra-lite b/Dockerfile.ultra-lite index c4eb4ba46..283d3c983 100644 --- a/Dockerfile.ultra-lite +++ b/Dockerfile.ultra-lite @@ -52,4 +52,4 @@ EXPOSE 8080/tcp # Run the application ENTRYPOINT ["tini", "--", "/scripts/init-without-ocr.sh"] -CMD ["java", "-Dfile.encoding=UTF-8", "-Djava.io.tmpdir=/tmp/stirling-pdf", "-jar", "/app.jar"] +CMD ["java", "-Dfile.encoding=UTF-8", "-Djava.io.tmpdir=/tmp/stirling-pdf", "-jar", "/app.jar"] \ No newline at end of file diff --git a/common/src/main/java/stirling/software/common/service/CustomPDFDocumentFactory.java b/common/src/main/java/stirling/software/common/service/CustomPDFDocumentFactory.java index d5993346e..d106a2729 100644 --- a/common/src/main/java/stirling/software/common/service/CustomPDFDocumentFactory.java +++ b/common/src/main/java/stirling/software/common/service/CustomPDFDocumentFactory.java @@ -293,9 +293,32 @@ public class CustomPDFDocumentFactory { } else { throw new IllegalArgumentException("Unsupported source type: " + source.getClass()); } + + configureResourceCacheIfNeeded(document, contentSize); + return document; } + /** + * Configure resource cache based on content size and memory constraints. Disables resource + * cache for large files or when memory is low to prevent OOM errors. 
+ */ + private void configureResourceCacheIfNeeded(PDDocument document, long contentSize) { + if (contentSize > LARGE_FILE_THRESHOLD) { + document.setResourceCache(null); + } else { + // Check current memory status for smaller files + long maxMemory = Runtime.getRuntime().maxMemory(); + long usedMemory = + Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); + double freeMemoryPercent = (double) (maxMemory - usedMemory) / maxMemory * 100; + + if (freeMemoryPercent < MIN_FREE_MEMORY_PERCENTAGE) { + document.setResourceCache(null); + } + } + } + /** Load a PDF with password protection using adaptive loading strategies */ private PDDocument loadAdaptivelyWithPassword(Object source, long contentSize, String password) throws IOException { @@ -314,6 +337,9 @@ public class CustomPDFDocumentFactory { } else { throw new IllegalArgumentException("Unsupported source type: " + source.getClass()); } + + configureResourceCacheIfNeeded(document, contentSize); + return document; } diff --git a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java index 53d7920b8..df85a016b 100644 --- a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java +++ b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java @@ -153,13 +153,15 @@ public class TempFileCleanupService { // Clean up unregistered temp files based on our cleanup strategy boolean containerMode = isContainerMode(); int unregisteredDeletedCount = cleanupUnregisteredFiles(containerMode, true, maxAgeMillis); - - if(registeredDeletedCount >0 || unregisteredDeletedCount >0 || directoriesDeletedCount >0) { - log.info( - "Scheduled cleanup complete. 
Deleted {} registered files, {} unregistered files, {} directories", - registeredDeletedCount, - unregisteredDeletedCount, - directoriesDeletedCount); + + if (registeredDeletedCount > 0 + || unregisteredDeletedCount > 0 + || directoriesDeletedCount > 0) { + log.info( + "Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories", + registeredDeletedCount, + unregisteredDeletedCount, + directoriesDeletedCount); } } diff --git a/common/src/main/java/stirling/software/common/util/ExceptionUtils.java b/common/src/main/java/stirling/software/common/util/ExceptionUtils.java index 061cf1450..74f65e713 100644 --- a/common/src/main/java/stirling/software/common/util/ExceptionUtils.java +++ b/common/src/main/java/stirling/software/common/util/ExceptionUtils.java @@ -300,7 +300,8 @@ public class ExceptionUtils { public static void logException(String operation, Exception e) { if (PdfErrorUtils.isCorruptedPdfError(e)) { log.warn("PDF corruption detected during {}: {}", operation, e.getMessage()); - } else if (e instanceof IOException && (isEncryptionError((IOException) e) || isPasswordError((IOException) e))) { + } else if (e instanceof IOException + && (isEncryptionError((IOException) e) || isPasswordError((IOException) e))) { log.info("PDF security issue during {}: {}", operation, e.getMessage()); } else { log.error("Unexpected error during {}", operation, e); diff --git a/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java b/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java index c67e2a4ec..aeda114c9 100644 --- a/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java +++ b/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java @@ -49,6 +49,7 @@ public class PdfErrorUtils { || message.contains("Invalid dictionary, found:") || message.contains("AES initialization vector not fully read") || message.contains("BadPaddingException") - || message.contains("Given final block not 
properly padded"); + || message.contains("Given final block not properly padded") + || message.contains("End-of-File, expected line"); } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java index 3438a8789..f2425ac9a 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java @@ -72,7 +72,7 @@ public class SplitPDFController { pageNumbers.add(totalPages - 1); } - log.info( + log.debug( "Splitting PDF into pages: {}", pageNumbers.stream().map(String::valueOf).collect(Collectors.joining(","))); diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java index 9085f41b1..104a0f351 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java @@ -70,17 +70,17 @@ public class ToSinglePageController { float yOffset = totalHeight; // For each page, copy its content to the new page at the correct offset + int pageIndex = 0; for (PDPage page : sourceDocument.getPages()) { - PDFormXObject form = - layerUtility.importPageAsForm( - sourceDocument, sourceDocument.getPages().indexOf(page)); + PDFormXObject form = layerUtility.importPageAsForm(sourceDocument, pageIndex); AffineTransform af = AffineTransform.getTranslateInstance( 0, yOffset - page.getMediaBox().getHeight()); layerUtility.wrapInSaveRestore(newPage); - String defaultLayerName = "Layer" + sourceDocument.getPages().indexOf(page); + String defaultLayerName = "Layer" + pageIndex; layerUtility.appendFormAsLayer(newPage, form, af, defaultLayerName); yOffset -= 
page.getMediaBox().getHeight(); + pageIndex++; } ByteArrayOutputStream baos = new ByteArrayOutputStream(); diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java index 249e9263c..09486f9e8 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java @@ -95,9 +95,9 @@ public class ExtractImagesController { try { int pageCount = document.getPages().getCount(); log.debug("Document reports {} pages", pageCount); - + int consecutiveFailures = 0; - + for (int pgNum = 0; pgNum < pageCount; pgNum++) { try { PDPage page = document.getPage(pgNum); @@ -118,7 +118,10 @@ public class ExtractImagesController { allowDuplicates); } catch (Exception e) { // Log the error and continue processing other pages - ExceptionUtils.logException("image extraction from page " + currentPageNum, e); + ExceptionUtils.logException( + "image extraction from page " + + currentPageNum, + e); } return null; // Callable requires a return type @@ -129,7 +132,7 @@ public class ExtractImagesController { } catch (Exception e) { consecutiveFailures++; ExceptionUtils.logException("page access for page " + (pgNum + 1), e); - + if (consecutiveFailures >= 3) { log.warn("Stopping page iteration after 3 consecutive failures"); break; diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/FakeScanController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/FakeScanController.java index 7872a4b09..d221ed52c 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/FakeScanController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/FakeScanController.java @@ -47,6 +47,11 @@ public class 
FakeScanController { private final CustomPDFDocumentFactory pdfDocumentFactory; private static final Random RANDOM = new Random(); + // Size limits to prevent OutOfMemoryError + private static final int MAX_IMAGE_WIDTH = 8192; + private static final int MAX_IMAGE_HEIGHT = 8192; + private static final long MAX_IMAGE_PIXELS = 16_777_216; // 4096x4096 + @PostMapping(value = "/fake-scan", consumes = "multipart/form-data") @Operation( summary = "Convert PDF to look like a scanned document", @@ -82,8 +87,46 @@ public class FakeScanController { PDFRenderer pdfRenderer = new PDFRenderer(document); for (int i = 0; i < document.getNumberOfPages(); i++) { - // Render page to image with specified resolution - BufferedImage image = pdfRenderer.renderImageWithDPI(i, resolution); + // Get page dimensions to calculate safe resolution + PDRectangle pageSize = document.getPage(i).getMediaBox(); + float pageWidthPts = pageSize.getWidth(); + float pageHeightPts = pageSize.getHeight(); + + // Calculate what the image dimensions would be at the requested resolution + int projectedWidth = (int) Math.ceil(pageWidthPts * resolution / 72.0); + int projectedHeight = (int) Math.ceil(pageHeightPts * resolution / 72.0); + long projectedPixels = (long) projectedWidth * projectedHeight; + + // Calculate safe resolution that stays within limits + int safeResolution = resolution; + if (projectedWidth > MAX_IMAGE_WIDTH + || projectedHeight > MAX_IMAGE_HEIGHT + || projectedPixels > MAX_IMAGE_PIXELS) { + double widthScale = (double) MAX_IMAGE_WIDTH / projectedWidth; + double heightScale = (double) MAX_IMAGE_HEIGHT / projectedHeight; + double pixelScale = Math.sqrt((double) MAX_IMAGE_PIXELS / projectedPixels); + double minScale = Math.min(Math.min(widthScale, heightScale), pixelScale); + safeResolution = (int) Math.max(72, resolution * minScale); + + log.warn( + "Page {} would be too large at {}dpi ({}x{} pixels). 
Reducing to {}dpi", + i + 1, + resolution, + projectedWidth, + projectedHeight, + safeResolution); + } + + // Render page to image with safe resolution + BufferedImage image = pdfRenderer.renderImageWithDPI(i, safeResolution); + + log.debug( + "Processing page {} with dimensions {}x{} ({} pixels) at {}dpi", + i + 1, + image.getWidth(), + image.getHeight(), + (long) image.getWidth() * image.getHeight(), + safeResolution); // 1. Convert to grayscale or keep color BufferedImage processed; diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineProcessor.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineProcessor.java index 74177c23c..5c1fd5f4a 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineProcessor.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/pipeline/PipelineProcessor.java @@ -153,15 +153,19 @@ public class PipelineProcessor { String filename = file.getFilename(); String providedExtension = "no extension"; if (filename != null && filename.contains(".")) { - providedExtension = filename.substring(filename.lastIndexOf(".")).toLowerCase(); + providedExtension = + filename.substring(filename.lastIndexOf(".")).toLowerCase(); } - + logPrintStream.println( "No files with extension " + String.join(", ", inputFileTypes) + " found for operation " + operation - + ". Provided file '" + filename + "' has extension: " + providedExtension); + + ". 
Provided file '" + + filename + + "' has extension: " + + providedExtension); hasErrors = true; } } @@ -211,17 +215,21 @@ public class PipelineProcessor { } } else { // Get details about what files were actually provided - List providedExtensions = outputFiles.stream() - .map(file -> { - String filename = file.getFilename(); - if (filename != null && filename.contains(".")) { - return filename.substring(filename.lastIndexOf(".")).toLowerCase(); - } - return "no extension"; - }) - .distinct() - .toList(); - + List providedExtensions = + outputFiles.stream() + .map( + file -> { + String filename = file.getFilename(); + if (filename != null && filename.contains(".")) { + return filename.substring( + filename.lastIndexOf(".")) + .toLowerCase(); + } + return "no extension"; + }) + .distinct() + .toList(); + logPrintStream.println( "No files with extension " + String.join(", ", inputFileTypes) @@ -229,7 +237,9 @@ public class PipelineProcessor { + operation + ". Provided files have extensions: " + String.join(", ", providedExtensions) - + " (total files: " + outputFiles.size() + ")"); + + " (total files: " + + outputFiles.size() + + ")"); hasErrors = true; } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/CertSignController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/CertSignController.java index 9a621390b..7675355da 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/CertSignController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/CertSignController.java @@ -65,6 +65,7 @@ import org.springframework.web.bind.annotation.RestController; import org.springframework.web.multipart.MultipartFile; import io.github.pixee.security.Filenames; +import io.micrometer.common.util.StringUtils; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; @@ -166,7 +167,7 @@ public class CertSignController 
{ Integer pageNumber = request.getPageNumber() != null ? (request.getPageNumber() - 1) : null; Boolean showLogo = request.getShowLogo(); - if (certType == null) { + if (StringUtils.isBlank(certType)) { throw ExceptionUtils.createIllegalArgumentException( "error.optionsNotSpecified", "{0} options are not specified", diff --git a/stirling-pdf/src/main/resources/static/js/fileInput.js b/stirling-pdf/src/main/resources/static/js/fileInput.js index 4c78f176e..b728792f1 100644 --- a/stirling-pdf/src/main/resources/static/js/fileInput.js +++ b/stirling-pdf/src/main/resources/static/js/fileInput.js @@ -226,7 +226,8 @@ function setupFileInput(chooser) { try { const { isEncrypted, requiresPassword } = await decryptFile.checkFileEncrypted(file); - if (file.type === 'application/pdf' && isEncrypted) { + if (file.type === 'application/pdf' && isEncrypted && + !window.location.pathname.includes('remove-password')) { decryptedFile = await decryptFile.decryptFile(file, requiresPassword); if (!decryptedFile) throw new Error('File decryption failed.'); } diff --git a/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html b/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html index 161f4181e..e1de37eb8 100644 --- a/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html +++ b/stirling-pdf/src/main/resources/templates/misc/ocr-pdf.html @@ -65,7 +65,7 @@