From d79b6e29e0968906b894528cd946c13ccff17448 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com.> Date: Tue, 1 Jul 2025 21:17:45 +0100 Subject: [PATCH] error tuning --- .../service/CustomPDFDocumentFactory.java | 20 ++++- .../software/common/util/PdfErrorUtils.java | 59 +++++++++++++++ .../software/common/util/PdfUtils.java | 75 +++++++++++++++++-- scripts/split_photos.py | 10 ++- .../SPDF/controller/api/MergeController.java | 12 ++- .../api/RearrangePagesPDFController.java | 2 +- .../api/misc/ExtractImagesController.java | 10 ++- .../controller/api/security/GetInfoOnPDF.java | 62 ++++++++++----- .../api/security/PasswordController.java | 28 +++++-- 9 files changed, 241 insertions(+), 37 deletions(-) create mode 100644 common/src/main/java/stirling/software/common/util/PdfErrorUtils.java diff --git a/common/src/main/java/stirling/software/common/service/CustomPDFDocumentFactory.java b/common/src/main/java/stirling/software/common/service/CustomPDFDocumentFactory.java index 51f52c34d..4da2412a8 100644 --- a/common/src/main/java/stirling/software/common/service/CustomPDFDocumentFactory.java +++ b/common/src/main/java/stirling/software/common/service/CustomPDFDocumentFactory.java @@ -24,6 +24,7 @@ import lombok.extern.slf4j.Slf4j; import stirling.software.common.model.api.PDFFile; import stirling.software.common.util.ApplicationContextProvider; +import stirling.software.common.util.PdfErrorUtils; import stirling.software.common.util.TempFileManager; import stirling.software.common.util.TempFileRegistry; @@ -354,7 +355,14 @@ public class CustomPDFDocumentFactory { private PDDocument loadFromFile(File file, long size, StreamCacheCreateFunction cache) throws IOException { - return Loader.loadPDF(new DeletingRandomAccessFile(file), "", null, null, cache); + try { + return Loader.loadPDF(new DeletingRandomAccessFile(file), "", null, null, cache); + } catch (IOException e) { + if (PdfErrorUtils.isCorruptedPdfError(e)) { + throw new IOException(PdfErrorUtils.getCorruptedPdfMessage(""), e); + } + throw e; + } } private PDDocument loadFromBytes(byte[] bytes, long size, StreamCacheCreateFunction cache) @@ -366,7 +374,15 @@ public class CustomPDFDocumentFactory { Files.write(tempFile, bytes); return loadFromFile(tempFile.toFile(), size, cache); } - return Loader.loadPDF(bytes, "", null, null, cache); + + try { + return Loader.loadPDF(bytes, "", null, null, cache); + } catch (IOException e) { + if (PdfErrorUtils.isCorruptedPdfError(e)) { + throw new IOException(PdfErrorUtils.getCorruptedPdfMessage(""), e); + } + throw e; + } } public PDDocument createNewDocument(MemoryUsageSetting settings) throws IOException { diff --git a/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java b/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java new file mode 100644 index 000000000..31cc2c00a --- /dev/null +++ b/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java @@ -0,0 +1,59 @@ +package stirling.software.common.util; + +import java.io.IOException; + +/** + * Utility class for detecting and handling PDF-related errors. + */ +public class PdfErrorUtils { + + /** + * Checks if an IOException indicates a corrupted PDF file. + * + * @param e the IOException to check + * @return true if the error indicates PDF corruption, false otherwise + */ + public static boolean isCorruptedPdfError(IOException e) { + String message = e.getMessage(); + if (message == null) return false; + + // Check for common corruption indicators + return message.contains("Missing root object specification") || + message.contains("Header doesn't contain versioninfo") || + message.contains("Expected trailer") || + message.contains("Invalid PDF") || + message.contains("Corrupted") || + message.contains("damaged") || + message.contains("Unknown dir object") || + message.contains("Can't dereference COSObject") || + message.contains("AES initialization vector not fully read") || + message.contains("BadPaddingException") || + message.contains("Given final block not properly padded"); + } + + /** + * Creates a user-friendly error message for corrupted PDF files. + * + * @param context additional context about where the error occurred (e.g., "during merge", "during processing") + * @return a user-friendly error message + */ + public static String getCorruptedPdfMessage(String context) { + String baseMessage = "PDF file appears to be corrupted or damaged. " + + "Please try using the 'Repair PDF' feature first to fix the file before proceeding with this operation."; + + if (context != null && !context.isEmpty()) { + return "Error " + context + ": " + baseMessage; + } + return baseMessage; + } + + /** + * Creates a user-friendly error message for multiple corrupted PDF files (e.g., during merge). + * + * @return a user-friendly error message for multiple file operations + */ + public static String getCorruptedPdfMessageForMultipleFiles() { + return "One or more PDF files appear to be corrupted or damaged. " + + "Please try using the 'Repair PDF' feature on each file first before attempting to merge them."; + } +} \ No newline at end of file diff --git a/common/src/main/java/stirling/software/common/util/PdfUtils.java b/common/src/main/java/stirling/software/common/util/PdfUtils.java index 3986110e5..9ea65388c 100644 --- a/common/src/main/java/stirling/software/common/util/PdfUtils.java +++ b/common/src/main/java/stirling/software/common/util/PdfUtils.java @@ -135,6 +135,16 @@ public class PdfUtils { int DPI, String filename) throws IOException, Exception { + + // Validate and limit DPI to prevent excessive memory usage + final int MAX_SAFE_DPI = 300; // Maximum safe DPI to prevent memory issues + if (DPI > MAX_SAFE_DPI) { + throw new IllegalArgumentException(String.format( + "DPI value %d exceeds maximum safe limit of %d. " + + "High DPI values can cause memory issues and crashes. " + + "Please use a lower DPI value.", DPI, MAX_SAFE_DPI)); + } + try (PDDocument document = pdfDocumentFactory.load(inputStream)) { PDFRenderer pdfRenderer = new PDFRenderer(document); pdfRenderer.setSubsamplingAllowed(true); @@ -158,7 +168,18 @@ public class PdfUtils { writer.prepareWriteSequence(null); for (int i = 0; i < pageCount; ++i) { - BufferedImage image = pdfRenderer.renderImageWithDPI(i, DPI, colorType); + BufferedImage image; + try { + image = pdfRenderer.renderImageWithDPI(i, DPI, colorType); + } catch (IllegalArgumentException e) { + if (e.getMessage() != null && e.getMessage().contains("Maximum size of image exceeded")) { + throw new IllegalArgumentException(String.format( + "PDF page %d is too large to render at %d DPI. " + + "Please try a lower DPI value (recommended: 150 or less).", + i + 1, DPI), e); + } + throw e; + } writer.writeToSequence(new IIOImage(image, null, null), param); } @@ -190,7 +211,18 @@ public class PdfUtils { PdfImageDimensionValue dimension = pageSizes.get(settings); if (dimension == null) { // Render the image to get the dimensions - pdfSizeImage = pdfRenderer.renderImageWithDPI(i, DPI, colorType); + try { + pdfSizeImage = pdfRenderer.renderImageWithDPI(i, DPI, colorType); + } catch (IllegalArgumentException e) { + if (e.getMessage() != null && e.getMessage().contains("Maximum size of image exceeded")) { + throw new IllegalArgumentException(String.format( + "PDF page %d is too large to render at %d DPI. " + + "The resulting image would exceed Java's maximum array size. " + + "Please try a lower DPI value (recommended: 150 or less).", + i + 1, DPI), e); + } + throw e; + } pdfSizeImageIndex = i; dimension = new PdfImageDimensionValue( @@ -218,7 +250,17 @@ public class PdfUtils { if (firstImageAlreadyRendered && i == 0) { pageImage = pdfSizeImage; } else { - pageImage = pdfRenderer.renderImageWithDPI(i, DPI, colorType); + try { + pageImage = pdfRenderer.renderImageWithDPI(i, DPI, colorType); + } catch (IllegalArgumentException e) { + if (e.getMessage() != null && e.getMessage().contains("Maximum size of image exceeded")) { + throw new IllegalArgumentException(String.format( + "PDF page %d is too large to render at %d DPI. " + + "Please try a lower DPI value (recommended: 150 or less).", + i + 1, DPI), e); + } + throw e; + } } // Calculate the x-coordinate to center the image @@ -238,7 +280,18 @@ public class PdfUtils { // Zip the images and return as byte array try (ZipOutputStream zos = new ZipOutputStream(baos)) { for (int i = 0; i < pageCount; ++i) { - BufferedImage image = pdfRenderer.renderImageWithDPI(i, DPI, colorType); + BufferedImage image; + try { + image = pdfRenderer.renderImageWithDPI(i, DPI, colorType); + } catch (IllegalArgumentException e) { + if (e.getMessage() != null && e.getMessage().contains("Maximum size of image exceeded")) { + throw new IllegalArgumentException(String.format( + "PDF page %d is too large to render at %d DPI. " + + "Please try a lower DPI value (recommended: 150 or less).", + i + 1, DPI), e); + } + throw e; + } try (ByteArrayOutputStream baosImage = new ByteArrayOutputStream()) { ImageIO.write(image, imageType, baosImage); @@ -276,7 +329,19 @@ public class PdfUtils { PDFRenderer pdfRenderer = new PDFRenderer(document); pdfRenderer.setSubsamplingAllowed(true); for (int page = 0; page < document.getNumberOfPages(); ++page) { - BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB); + BufferedImage bim; + try { + bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB); + } catch (IllegalArgumentException e) { + if (e.getMessage() != null && e.getMessage().contains("Maximum size of image exceeded")) { + throw new IllegalArgumentException(String.format( + "PDF page %d is too large to render at 300 DPI. " + + "The resulting image would exceed Java's maximum array size. " + + "Please use a lower DPI value for PDF-to-image conversion.", + page + 1), e); + } + throw e; + } PDPage originalPage = document.getPage(page); float width = originalPage.getMediaBox().getWidth(); diff --git a/scripts/split_photos.py b/scripts/split_photos.py index a007fdeb4..510ca8409 100644 --- a/scripts/split_photos.py +++ b/scripts/split_photos.py @@ -94,8 +94,14 @@ def split_photos(input_file, output_directory, tolerance=30, min_area=10000, min cropped_image = image[y:y+h, x:x+w] cropped_image = auto_rotate(cropped_image, angle_threshold) - # Remove the added border - cropped_image = cropped_image[border_size:-border_size, border_size:-border_size] + # Remove the added border, but ensure we don't create an empty image + if border_size > 0 and cropped_image.shape[0] > 2 * border_size and cropped_image.shape[1] > 2 * border_size: + cropped_image = cropped_image[border_size:-border_size, border_size:-border_size] + + # Check if the cropped image is valid before saving + if cropped_image.size == 0 or cropped_image.shape[0] == 0 or cropped_image.shape[1] == 0: + print(f"Warning: Skipping empty image for region {idx+1}") + continue output_path = os.path.join(output_directory, f"{input_file_basename}_{idx+1}.png") cv2.imwrite(output_path, cropped_image) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/MergeController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/MergeController.java index ddd988ef9..277828970 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/MergeController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/MergeController.java @@ -36,6 +36,7 @@ import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.api.general.MergePdfsRequest; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.GeneralUtils; +import stirling.software.common.util.PdfErrorUtils; import stirling.software.common.util.WebResponseUtils; @RestController @@ -189,8 +190,15 @@ public class MergeController { mergedTempFile = Files.createTempFile("merged-", ".pdf").toFile(); mergerUtility.setDestinationFileName(mergedTempFile.getAbsolutePath()); - mergerUtility.mergeDocuments( - pdfDocumentFactory.getStreamCacheFunction(totalSize)); // Merge the documents + try { + mergerUtility.mergeDocuments( + pdfDocumentFactory.getStreamCacheFunction(totalSize)); // Merge the documents + } catch (IOException e) { + if (PdfErrorUtils.isCorruptedPdfError(e)) { + throw new IOException(PdfErrorUtils.getCorruptedPdfMessageForMultipleFiles(), e); + } + throw e; + } // Load the merged PDF document mergedDocument = pdfDocumentFactory.load(mergedTempFile); diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java index 3bf2ec802..1f2d3a7c3 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java @@ -289,7 +289,7 @@ public class RearrangePagesPDFController { + "_rearranged.pdf"); } catch (IOException e) { log.error("Failed rearranging documents", e); - return null; + throw e; } } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java index cb06b9f4d..5b21ab1a7 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java @@ -42,6 +42,7 @@ import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.api.PDFExtractImagesRequest; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.ImageProcessingUtils; +import stirling.software.common.util.PdfErrorUtils; import stirling.software.common.util.WebResponseUtils; @RestController @@ -180,7 +181,8 @@ public class ExtractImagesController { } int count = 1; for (COSName name : page.getResources().getXObjectNames()) { - if (page.getResources().isImageXObject(name)) { + try { + if (page.getResources().isImageXObject(name)) { PDImageXObject image = (PDImageXObject) page.getResources().getXObject(name); if (!allowDuplicates) { byte[] data = ImageProcessingUtils.getImageData(image.getImage()); @@ -209,6 +211,12 @@ public class ExtractImagesController { zos.closeEntry(); } } + } catch (IOException e) { + if (PdfErrorUtils.isCorruptedPdfError(e)) { + throw new IOException(PdfErrorUtils.getCorruptedPdfMessage("during image extraction"), e); + } + throw e; + } } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java index c630106e4..d4ffbab89 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java @@ -150,21 +150,37 @@ public class GetInfoOnPDF { PDMetadata pdMetadata = document.getDocumentCatalog().getMetadata(); if (pdMetadata != null) { COSInputStream metaStream = pdMetadata.createInputStream(); - DomXmpParser domXmpParser = new DomXmpParser(); - XMPMetadata xmpMeta = domXmpParser.parse(metaStream); - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - new XmpSerializer().serialize(xmpMeta, baos, true); - String xmpString = new String(baos.toByteArray(), StandardCharsets.UTF_8); - - if (xmpString.contains(standardKeyword)) { + + // First try to read raw metadata as string to check for standard keywords + byte[] metadataBytes = metaStream.readAllBytes(); + String rawMetadata = new String(metadataBytes, StandardCharsets.UTF_8); + + if (rawMetadata.contains(standardKeyword)) { return true; } + + // If raw check doesn't find it, try parsing with XMP parser + // Reset stream for parsing + metaStream.close(); + metaStream = pdMetadata.createInputStream(); + + try { + DomXmpParser domXmpParser = new DomXmpParser(); + XMPMetadata xmpMeta = domXmpParser.parse(metaStream); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + new XmpSerializer().serialize(xmpMeta, baos, true); + String xmpString = new String(baos.toByteArray(), StandardCharsets.UTF_8); + + if (xmpString.contains(standardKeyword)) { + return true; + } + } catch (XmpParsingException e) { + // XMP parsing failed, but we already checked raw metadata above + log.debug("XMP parsing failed for standard check, but raw metadata was already checked: {}", e.getMessage()); + } } - } catch ( - Exception - e) { // Catching general exception for brevity, ideally you'd catch specific - // exceptions. + } catch (Exception e) { log.error("exception", e); } @@ -392,13 +408,23 @@ public class GetInfoOnPDF { if (pdMetadata != null) { try { COSInputStream is = pdMetadata.createInputStream(); - DomXmpParser domXmpParser = new DomXmpParser(); - XMPMetadata xmpMeta = domXmpParser.parse(is); + + try { + DomXmpParser domXmpParser = new DomXmpParser(); + XMPMetadata xmpMeta = domXmpParser.parse(is); - ByteArrayOutputStream os = new ByteArrayOutputStream(); - new XmpSerializer().serialize(xmpMeta, os, true); - xmpString = new String(os.toByteArray(), StandardCharsets.UTF_8); - } catch (XmpParsingException | IOException e) { + ByteArrayOutputStream os = new ByteArrayOutputStream(); + new XmpSerializer().serialize(xmpMeta, os, true); + xmpString = new String(os.toByteArray(), StandardCharsets.UTF_8); + } catch (XmpParsingException e) { + // XMP parsing failed, try to read raw metadata instead + log.debug("XMP parsing failed, reading raw metadata: {}", e.getMessage()); + is.close(); + is = pdMetadata.createInputStream(); + byte[] metadataBytes = is.readAllBytes(); + xmpString = new String(metadataBytes, StandardCharsets.UTF_8); + } + } catch (IOException e) { log.error("exception", e); } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/PasswordController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/PasswordController.java index 4567fcb7e..61808b9d5 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/PasswordController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/PasswordController.java @@ -42,12 +42,28 @@ public class PasswordController { MultipartFile fileInput = request.getFileInput(); String password = request.getPassword(); PDDocument document = pdfDocumentFactory.load(fileInput, password); - document.setAllSecurityToBeRemoved(true); - return WebResponseUtils.pdfDocToWebResponse( - document, - Filenames.toSimpleFileName(fileInput.getOriginalFilename()) - .replaceFirst("[.][^.]+$", "") - + "_password_removed.pdf"); + + try { + document.setAllSecurityToBeRemoved(true); + return WebResponseUtils.pdfDocToWebResponse( + document, + Filenames.toSimpleFileName(fileInput.getOriginalFilename()) + .replaceFirst("[.][^.]+$", "") + + "_password_removed.pdf"); + } catch (IOException e) { + // Check if this is an encryption/decryption error + if (e.getMessage() != null && + (e.getMessage().contains("BadPaddingException") || + e.getMessage().contains("Given final block not properly padded") || + e.getMessage().contains("Failed to decrypt"))) { + + document.close(); + throw new IOException("The PDF appears to have corrupted encryption data. " + + "This can happen when the PDF was created with incompatible encryption methods. " + + "Please try using the 'Repair PDF' feature first, or contact the document creator for a new copy.", e); + } + throw e; + } } @PostMapping(consumes = "multipart/form-data", value = "/add-password")