diff --git a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java index 895aa70de..627268809 100644 --- a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java +++ b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java @@ -153,12 +153,14 @@ public class TempFileCleanupService { // Clean up unregistered temp files based on our cleanup strategy boolean containerMode = isContainerMode(); int unregisteredDeletedCount = cleanupUnregisteredFiles(containerMode, true, maxAgeMillis); - + + if(registeredDeletedCount >0 || unregisteredDeletedCount >0 || directoriesDeletedCount >0) { log.info( "Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories", registeredDeletedCount, unregisteredDeletedCount, directoriesDeletedCount); + } } /** @@ -166,7 +168,6 @@ public class TempFileCleanupService { * important in Docker environments where temp files persist between container restarts. */ private void runStartupCleanup() { - log.info("Running startup temporary file cleanup"); boolean containerMode = isContainerMode(); log.info( @@ -178,7 +179,6 @@ public class TempFileCleanupService { long maxAgeMillis = containerMode ? 0 : 24 * 60 * 60 * 1000; // 0 or 24 hours int totalDeletedCount = cleanupUnregisteredFiles(containerMode, false, maxAgeMillis); - log.info( "Startup cleanup complete. Deleted {} temporary files/directories", totalDeletedCount); @@ -225,7 +225,7 @@ public class TempFileCleanupService { tempDir -> { try { String phase = isScheduled ? 
"scheduled" : "startup"; - log.info( + log.debug( "Scanning directory for {} cleanup: {}", phase, tempDir); diff --git a/common/src/main/java/stirling/software/common/util/ExceptionUtils.java b/common/src/main/java/stirling/software/common/util/ExceptionUtils.java index fb6d501d9..061cf1450 100644 --- a/common/src/main/java/stirling/software/common/util/ExceptionUtils.java +++ b/common/src/main/java/stirling/software/common/util/ExceptionUtils.java @@ -298,9 +298,9 @@ public class ExceptionUtils { * @param e the exception that occurred */ public static void logException(String operation, Exception e) { - if (e instanceof IOException && PdfErrorUtils.isCorruptedPdfError((IOException) e)) { + if (PdfErrorUtils.isCorruptedPdfError(e)) { log.warn("PDF corruption detected during {}: {}", operation, e.getMessage()); - } else if (isEncryptionError((IOException) e) || isPasswordError((IOException) e)) { + } else if (e instanceof IOException && (isEncryptionError((IOException) e) || isPasswordError((IOException) e))) { log.info("PDF security issue during {}: {}", operation, e.getMessage()); } else { log.error("Unexpected error during {}", operation, e); diff --git a/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java b/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java index e18922c10..c67e2a4ec 100644 --- a/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java +++ b/common/src/main/java/stirling/software/common/util/PdfErrorUtils.java @@ -12,7 +12,26 @@ public class PdfErrorUtils { * @return true if the error indicates PDF corruption, false otherwise */ public static boolean isCorruptedPdfError(IOException e) { - String message = e.getMessage(); + return isCorruptedPdfError(e.getMessage()); + } + + /** + * Checks if any Exception indicates a corrupted PDF file. 
+ * + * @param e the Exception to check + * @return true if the error indicates PDF corruption, false otherwise + */ + public static boolean isCorruptedPdfError(Exception e) { + return isCorruptedPdfError(e.getMessage()); + } + + /** + * Checks if an error message indicates a corrupted PDF file. + * + * @param message the error message to check + * @return true if the message indicates PDF corruption, false otherwise + */ + private static boolean isCorruptedPdfError(String message) { if (message == null) return false; // Check for common corruption indicators @@ -24,6 +43,10 @@ public class PdfErrorUtils { || message.contains("damaged") || message.contains("Unknown dir object") || message.contains("Can't dereference COSObject") + || message.contains("parseCOSString string should start with") + || message.contains("ICCBased colorspace array must have a stream") + || message.contains("1-based index not found") + || message.contains("Invalid dictionary, found:") || message.contains("AES initialization vector not fully read") || message.contains("BadPaddingException") || message.contains("Given final block not properly padded"); diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java index dd9eeb2da..249e9263c 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java @@ -91,36 +91,54 @@ public class ExtractImagesController { Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()); Set<Future<Void>> futures = new HashSet<>(); - // Iterate over each page - for (int pgNum = 0; pgNum < document.getPages().getCount(); pgNum++) { - PDPage page = document.getPage(pgNum); - Future<Void> future = - executor.submit( - () -> { - // Use the page number directly from the 
iterator, so no need to - // calculate manually - int pageNum = document.getPages().indexOf(page) + 1; + // Safely iterate over each page, handling corrupt PDFs where page count might be wrong + try { + int pageCount = document.getPages().getCount(); + log.debug("Document reports {} pages", pageCount); + + int consecutiveFailures = 0; + + for (int pgNum = 0; pgNum < pageCount; pgNum++) { + try { + PDPage page = document.getPage(pgNum); + consecutiveFailures = 0; // Reset on success + final int currentPageNum = pgNum + 1; // Convert to 1-based page numbering + Future<Void> future = + executor.submit( + () -> { + try { + // Call the image extraction method for each page + extractImagesFromPage( + page, + format, + filename, + currentPageNum, + processedImages, + zos, + allowDuplicates); + } catch (Exception e) { + // Log the error and continue processing other pages + ExceptionUtils.logException("image extraction from page " + currentPageNum, e); + } - try { - // Call the image extraction method for each page - extractImagesFromPage( - page, - format, - filename, - pageNum, - processedImages, - zos, - allowDuplicates); - } catch (IOException e) { - // Log the error and continue processing other pages - ExceptionUtils.logException("image extraction from page " + pageNum, e); - } + return null; // Callable requires a return type + }); - return null; // Callable requires a return type - }); - - // Add the Future object to the list to track completion - futures.add(future); + // Add the Future object to the list to track completion + futures.add(future); + } catch (Exception e) { + consecutiveFailures++; + ExceptionUtils.logException("page access for page " + (pgNum + 1), e); + + if (consecutiveFailures >= 3) { + log.warn("Stopping page iteration after 3 consecutive failures"); + break; + } + } + } + } catch (Exception e) { + ExceptionUtils.logException("page count determination", e); + throw e; } // Wait for all tasks to complete