diff --git a/DeveloperGuide.md b/DeveloperGuide.md index c7d45804..8a4b53b6 100644 --- a/DeveloperGuide.md +++ b/DeveloperGuide.md @@ -124,7 +124,7 @@ These files provide pre-configured setups for different scenarios. For example, services: stirling-pdf: container_name: Stirling-PDF-Security - image: stirlingtools/stirling-pdf:latest + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest deploy: resources: limits: diff --git a/build.gradle b/build.gradle index 0e452c6d..11d89d07 100644 --- a/build.gradle +++ b/build.gradle @@ -25,7 +25,7 @@ ext { } group = "stirling.software" -version = "0.43.2" +version = "0.44.0" java { // 17 is lowest but we support and recommend 21 diff --git a/exampleYmlFiles/docker-compose-latest-fat-security-postgres.yml b/exampleYmlFiles/docker-compose-latest-fat-security-postgres.yml index 3deddab4..b79631b6 100644 --- a/exampleYmlFiles/docker-compose-latest-fat-security-postgres.yml +++ b/exampleYmlFiles/docker-compose-latest-fat-security-postgres.yml @@ -1,7 +1,7 @@ services: stirling-pdf: container_name: Stirling-PDF-Security-Fat-Postgres - image: stirlingtools/stirling-pdf:latest-fat-postgres + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-fat-postgres deploy: resources: limits: diff --git a/exampleYmlFiles/docker-compose-latest-fat-security.yml b/exampleYmlFiles/docker-compose-latest-fat-security.yml index a8fedcb2..5d01f33f 100644 --- a/exampleYmlFiles/docker-compose-latest-fat-security.yml +++ b/exampleYmlFiles/docker-compose-latest-fat-security.yml @@ -1,7 +1,7 @@ services: stirling-pdf: container_name: Stirling-PDF-Security-Fat - image: stirlingtools/stirling-pdf:latest-fat + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-fat deploy: resources: limits: diff --git a/exampleYmlFiles/docker-compose-latest-security-with-sso.yml b/exampleYmlFiles/docker-compose-latest-security-with-sso.yml index 9d30986c..9b86d4cc 100644 --- a/exampleYmlFiles/docker-compose-latest-security-with-sso.yml +++ b/exampleYmlFiles/docker-compose-latest-security-with-sso.yml @@ -1,7 +1,7 @@ services: stirling-pdf: container_name: Stirling-PDF-Security - image: stirlingtools/stirling-pdf:latest + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest deploy: resources: limits: diff --git a/exampleYmlFiles/docker-compose-latest-security.yml b/exampleYmlFiles/docker-compose-latest-security.yml index 5f8c977d..b79ea801 100644 --- a/exampleYmlFiles/docker-compose-latest-security.yml +++ b/exampleYmlFiles/docker-compose-latest-security.yml @@ -1,7 +1,7 @@ services: stirling-pdf: container_name: Stirling-PDF-Security - image: stirlingtools/stirling-pdf:latest + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest deploy: resources: limits: diff --git a/exampleYmlFiles/docker-compose-latest-ultra-lite-security.yml b/exampleYmlFiles/docker-compose-latest-ultra-lite-security.yml index f357e0b9..8f3932f7 100644 --- a/exampleYmlFiles/docker-compose-latest-ultra-lite-security.yml +++ b/exampleYmlFiles/docker-compose-latest-ultra-lite-security.yml @@ -1,7 +1,7 @@ services: stirling-pdf: container_name: Stirling-PDF-Ultra-Lite-Security - image: stirlingtools/stirling-pdf:latest-ultra-lite + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-ultra-lite deploy: resources: limits: diff --git a/exampleYmlFiles/docker-compose-latest-ultra-lite.yml b/exampleYmlFiles/docker-compose-latest-ultra-lite.yml index 53d7bcb8..d194be21 100644 --- a/exampleYmlFiles/docker-compose-latest-ultra-lite.yml +++ b/exampleYmlFiles/docker-compose-latest-ultra-lite.yml @@ -1,7 +1,7 @@ services: stirling-pdf: container_name: Stirling-PDF-Ultra-Lite - image: stirlingtools/stirling-pdf:latest-ultra-lite + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-ultra-lite deploy: resources: limits: diff --git a/exampleYmlFiles/docker-compose-latest.yml b/exampleYmlFiles/docker-compose-latest.yml index 8419b072..7dec8a95 100644 --- a/exampleYmlFiles/docker-compose-latest.yml +++ b/exampleYmlFiles/docker-compose-latest.yml @@ -1,7 +1,7 @@ services: stirling-pdf: container_name: Stirling-PDF - image: stirlingtools/stirling-pdf:latest + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest deploy: resources: limits: diff --git a/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java b/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java index a1b1ab7a..12bf2c29 100644 --- a/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java +++ b/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java @@ -11,6 +11,7 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import lombok.extern.slf4j.Slf4j; + import stirling.software.SPDF.model.ApplicationProperties; @Service diff --git a/src/main/java/stirling/software/SPDF/controller/api/AnalysisController.java b/src/main/java/stirling/software/SPDF/controller/api/AnalysisController.java index 0e0d0534..8c97605b 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/AnalysisController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/AnalysisController.java @@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api; import java.io.IOException; import java.util.*; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; @@ -12,24 +11,33 @@ import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.encryption.PDEncryption; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.web.bind.annotation.*; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; import stirling.software.SPDF.model.api.PDFFile; +import stirling.software.SPDF.service.CustomPDDocumentFactory; @RestController @RequestMapping("/api/v1/analysis") @Tag(name = "Analysis", description = "Analysis APIs") public class AnalysisController { + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public AnalysisController(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + @PostMapping(value = "/page-count", consumes = "multipart/form-data") @Operation( summary = "Get PDF page count", description = "Returns total number of pages in PDF. Input:PDF Output:JSON Type:SISO") public Map getPageCount(@ModelAttribute PDFFile file) throws IOException { - try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) { return Map.of("pageCount", document.getNumberOfPages()); } } @@ -39,7 +47,7 @@ public class AnalysisController { summary = "Get basic PDF information", description = "Returns page count, version, file size. Input:PDF Output:JSON Type:SISO") public Map getBasicInfo(@ModelAttribute PDFFile file) throws IOException { - try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) { Map info = new HashMap<>(); info.put("pageCount", document.getNumberOfPages()); info.put("pdfVersion", document.getVersion()); @@ -54,7 +62,7 @@ public class AnalysisController { description = "Returns title, author, subject, etc. Input:PDF Output:JSON Type:SISO") public Map getDocumentProperties(@ModelAttribute PDFFile file) throws IOException { - try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) { PDDocumentInformation info = document.getDocumentInformation(); Map properties = new HashMap<>(); properties.put("title", info.getTitle()); @@ -75,7 +83,7 @@ public class AnalysisController { description = "Returns width and height of each page. Input:PDF Output:JSON Type:SISO") public List> getPageDimensions(@ModelAttribute PDFFile file) throws IOException { - try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) { List> dimensions = new ArrayList<>(); PDPageTree pages = document.getPages(); @@ -95,7 +103,7 @@ public class AnalysisController { description = "Returns count and details of form fields. Input:PDF Output:JSON Type:SISO") public Map getFormFields(@ModelAttribute PDFFile file) throws IOException { - try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) { Map formInfo = new HashMap<>(); PDAcroForm form = document.getDocumentCatalog().getAcroForm(); @@ -117,7 +125,7 @@ public class AnalysisController { summary = "Get annotation information", description = "Returns count and types of annotations. Input:PDF Output:JSON Type:SISO") public Map getAnnotationInfo(@ModelAttribute PDFFile file) throws IOException { - try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) { Map annotInfo = new HashMap<>(); int totalAnnotations = 0; Map annotationTypes = new HashMap<>(); @@ -142,7 +150,7 @@ public class AnalysisController { description = "Returns list of fonts used in the document. Input:PDF Output:JSON Type:SISO") public Map getFontInfo(@ModelAttribute PDFFile file) throws IOException { - try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) { Map fontInfo = new HashMap<>(); Set fontNames = new HashSet<>(); @@ -164,7 +172,7 @@ public class AnalysisController { description = "Returns encryption and permission details. Input:PDF Output:JSON Type:SISO") public Map getSecurityInfo(@ModelAttribute PDFFile file) throws IOException { - try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) { Map securityInfo = new HashMap<>(); PDEncryption encryption = document.getEncryption(); diff --git a/src/main/java/stirling/software/SPDF/controller/api/CropController.java b/src/main/java/stirling/software/SPDF/controller/api/CropController.java index 776c85b0..d3e4933f 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/CropController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/CropController.java @@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api; import java.io.ByteArrayOutputStream; import java.io.IOException; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -23,7 +22,6 @@ import io.swagger.v3.oas.annotations.tags.Tag; import stirling.software.SPDF.model.api.general.CropPdfForm; import stirling.software.SPDF.service.CustomPDDocumentFactory; -import stirling.software.SPDF.service.PostHogService; import stirling.software.SPDF.utils.WebResponseUtils; @RestController @@ -33,13 +31,9 @@ public class CropController { private final CustomPDDocumentFactory pdfDocumentFactory; - private final PostHogService postHogService; - @Autowired - public CropController( - CustomPDDocumentFactory pdfDocumentFactory, PostHogService postHogService) { + public CropController(CustomPDDocumentFactory pdfDocumentFactory) { this.pdfDocumentFactory = pdfDocumentFactory; - this.postHogService = postHogService; } @PostMapping(value = "/crop", consumes = "multipart/form-data") @@ -48,7 +42,7 @@ public class CropController { description = "This operation takes an input PDF file and crops it according to the given coordinates. Input:PDF Output:PDF Type:SISO") public ResponseEntity cropPdf(@ModelAttribute CropPdfForm form) throws IOException { - PDDocument sourceDocument = Loader.loadPDF(form.getFileInput().getBytes()); + PDDocument sourceDocument = pdfDocumentFactory.load(form.getFileInput().getBytes()); PDDocument newDocument = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); diff --git a/src/main/java/stirling/software/SPDF/controller/api/MergeController.java b/src/main/java/stirling/software/SPDF/controller/api/MergeController.java index 3a71dec9..416546f4 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/MergeController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/MergeController.java @@ -12,7 +12,6 @@ import java.util.Comparator; import java.util.List; import java.util.stream.Collectors; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.multipdf.PDFMergerUtility; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; @@ -101,8 +100,8 @@ public class MergeController { }; case "byPDFTitle": return (file1, file2) -> { - try (PDDocument doc1 = Loader.loadPDF(file1.getBytes()); - PDDocument doc2 = Loader.loadPDF(file2.getBytes())) { + try (PDDocument doc1 = pdfDocumentFactory.load(file1.getBytes()); + PDDocument doc2 = pdfDocumentFactory.load(file2.getBytes())) { String title1 = doc1.getDocumentInformation().getTitle(); String title2 = doc2.getDocumentInformation().getTitle(); return title1.compareTo(title2); @@ -152,7 +151,7 @@ public class MergeController { byte[] mergedPdfBytes = docOutputstream.toByteArray(); // Get merged document bytes // Load the merged PDF document - mergedDocument = Loader.loadPDF(mergedPdfBytes); + mergedDocument = pdfDocumentFactory.load(mergedPdfBytes); // Remove signatures if removeCertSign is true if (removeCertSign) { diff --git a/src/main/java/stirling/software/SPDF/controller/api/MultiPageLayoutController.java b/src/main/java/stirling/software/SPDF/controller/api/MultiPageLayoutController.java index 1fb4d5d4..76ad5e75 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/MultiPageLayoutController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/MultiPageLayoutController.java @@ -4,7 +4,6 @@ import java.awt.*; import java.io.ByteArrayOutputStream; import java.io.IOException; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -64,7 +63,7 @@ public class MultiPageLayoutController { : (int) Math.sqrt(pagesPerSheet); int rows = pagesPerSheet == 2 || pagesPerSheet == 3 ? 1 : (int) Math.sqrt(pagesPerSheet); - PDDocument sourceDocument = Loader.loadPDF(file.getBytes()); + PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes()); PDDocument newDocument = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); PDPage newPage = new PDPage(PDRectangle.A4); diff --git a/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java b/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java index 6b04ee77..58e69b72 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java @@ -5,7 +5,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.springframework.beans.factory.annotation.Autowired; @@ -251,7 +250,7 @@ public class RearrangePagesPDFController { String sortType = request.getCustomMode(); try { // Load the input PDF - PDDocument document = Loader.loadPDF(pdfFile.getBytes()); + PDDocument document = pdfDocumentFactory.load(pdfFile.getBytes()); // Split the page order string into an array of page numbers or range of numbers String[] pageOrderArr = pageOrder != null ? pageOrder.split(",") : new String[0]; diff --git a/src/main/java/stirling/software/SPDF/controller/api/ScalePagesController.java b/src/main/java/stirling/software/SPDF/controller/api/ScalePagesController.java index c098a005..c1715347 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/ScalePagesController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/ScalePagesController.java @@ -5,7 +5,6 @@ import java.io.IOException; import java.util.HashMap; import java.util.Map; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -52,7 +51,7 @@ public class ScalePagesController { String targetPDRectangle = request.getPageSize(); float scaleFactor = request.getScaleFactor(); - PDDocument sourceDocument = Loader.loadPDF(file.getBytes()); + PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes()); PDDocument outputDocument = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); diff --git a/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java b/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java index 00776bb8..573bfb7d 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java @@ -10,7 +10,6 @@ import java.util.stream.Collectors; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.springframework.beans.factory.annotation.Autowired; @@ -63,7 +62,7 @@ public class SplitPDFController { String pages = request.getPageNumbers(); // open the pdf document - document = Loader.loadPDF(file.getBytes()); + document = pdfDocumentFactory.load(file.getBytes()); // PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document); int totalPages = document.getNumberOfPages(); List pageNumbers = request.getPageNumbersList(document, false); diff --git a/src/main/java/stirling/software/SPDF/controller/api/SplitPdfByChaptersController.java b/src/main/java/stirling/software/SPDF/controller/api/SplitPdfByChaptersController.java index 3d7af365..195dbd0d 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/SplitPdfByChaptersController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/SplitPdfByChaptersController.java @@ -8,7 +8,6 @@ import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; @@ -34,6 +33,7 @@ import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.PdfMetadata; import stirling.software.SPDF.model.api.SplitPdfByChaptersRequest; +import stirling.software.SPDF.service.CustomPDDocumentFactory; import stirling.software.SPDF.service.PdfMetadataService; import stirling.software.SPDF.utils.WebResponseUtils; @@ -45,9 +45,13 @@ public class SplitPdfByChaptersController { private final PdfMetadataService pdfMetadataService; + private final CustomPDDocumentFactory pdfDocumentFactory; + @Autowired - public SplitPdfByChaptersController(PdfMetadataService pdfMetadataService) { + public SplitPdfByChaptersController( + PdfMetadataService pdfMetadataService, CustomPDDocumentFactory pdfDocumentFactory) { this.pdfMetadataService = pdfMetadataService; + this.pdfDocumentFactory = pdfDocumentFactory; } private static List extractOutlineItems( @@ -135,7 +139,7 @@ public class SplitPdfByChaptersController { if (bookmarkLevel < 0) { return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes()); } - sourceDocument = Loader.loadPDF(file.getBytes()); + sourceDocument = pdfDocumentFactory.load(file.getBytes()); PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline(); diff --git a/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySectionsController.java b/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySectionsController.java index 816ea6d5..1a3842e0 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySectionsController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySectionsController.java @@ -9,7 +9,6 @@ import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -57,7 +56,7 @@ public class SplitPdfBySectionsController { List splitDocumentsBoas = new ArrayList<>(); MultipartFile file = request.getFileInput(); - PDDocument sourceDocument = Loader.loadPDF(file.getBytes()); + PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes()); // Process the PDF based on split parameters int horiz = request.getHorizontalDivisions() + 1; diff --git a/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySizeController.java b/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySizeController.java index ee7de379..ee73cbb8 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySizeController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySizeController.java @@ -7,7 +7,6 @@ import java.nio.file.Path; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.springframework.beans.factory.annotation.Autowired; @@ -41,6 +40,9 @@ public class SplitPdfBySizeController { @Autowired public SplitPdfBySizeController(CustomPDDocumentFactory pdfDocumentFactory) { this.pdfDocumentFactory = pdfDocumentFactory; + log.info( + "SplitPdfBySizeController initialized with pdfDocumentFactory: {}", + pdfDocumentFactory); } @PostMapping(value = "/split-by-size-or-count", consumes = "multipart/form-data") @@ -52,38 +54,92 @@ public class SplitPdfBySizeController { public ResponseEntity autoSplitPdf(@ModelAttribute SplitPdfBySizeOrCountRequest request) throws Exception { + log.info("Starting PDF split process with request: {}", request); MultipartFile file = request.getFileInput(); + log.info( + "File received: name={}, size={} bytes", + file.getOriginalFilename(), + file.getSize()); + Path zipFile = Files.createTempFile("split_documents", ".zip"); + log.info("Created temporary zip file: {}", zipFile); + String filename = Filenames.toSimpleFileName(file.getOriginalFilename()) .replaceFirst("[.][^.]+$", ""); + log.info("Base filename for output: {}", filename); + byte[] data = null; - try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile)); - PDDocument sourceDocument = Loader.loadPDF(file.getBytes())) { + try { + log.info("Reading input file bytes"); + byte[] pdfBytes = file.getBytes(); + log.info("Successfully read {} bytes from input file", pdfBytes.length); - int type = request.getSplitType(); - String value = request.getSplitValue(); + log.info("Creating ZIP output stream"); + try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) { + log.info("Loading PDF document"); + try (PDDocument sourceDocument = pdfDocumentFactory.load(pdfBytes)) { + log.info( + "Successfully loaded PDF with {} pages", + sourceDocument.getNumberOfPages()); - if (type == 0) { - long maxBytes = GeneralUtils.convertSizeToBytes(value); - handleSplitBySize(sourceDocument, maxBytes, zipOut, filename); - } else if (type == 1) { - int pageCount = Integer.parseInt(value); - handleSplitByPageCount(sourceDocument, pageCount, zipOut, filename); - } else if (type == 2) { - int documentCount = Integer.parseInt(value); - handleSplitByDocCount(sourceDocument, documentCount, zipOut, filename); - } else { - throw new IllegalArgumentException("Invalid argument for split type"); + int type = request.getSplitType(); + String value = request.getSplitValue(); + log.info("Split type: {}, Split value: {}", type, value); + + if (type == 0) { + log.info("Processing split by size"); + long maxBytes = GeneralUtils.convertSizeToBytes(value); + log.info("Max bytes per document: {}", maxBytes); + handleSplitBySize(sourceDocument, maxBytes, zipOut, filename); + } else if (type == 1) { + log.info("Processing split by page count"); + int pageCount = Integer.parseInt(value); + log.info("Pages per document: {}", pageCount); + handleSplitByPageCount(sourceDocument, pageCount, zipOut, filename); + } else if (type == 2) { + log.info("Processing split by document count"); + int documentCount = Integer.parseInt(value); + log.info("Total number of documents: {}", documentCount); + handleSplitByDocCount(sourceDocument, documentCount, zipOut, filename); + } else { + log.error("Invalid split type: {}", type); + throw new IllegalArgumentException( + "Invalid argument for split type: " + type); + } + + log.info("PDF splitting completed successfully"); + } catch (Exception e) { + log.error("Error loading or processing PDF document", e); + throw e; + } + } catch (IOException e) { + log.error("Error creating or writing to ZIP file", e); + throw e; } } catch (Exception e) { - log.error("exception", e); + log.error("Exception during PDF splitting process", e); + throw e; // Re-throw to ensure proper error response } finally { - data = Files.readAllBytes(zipFile); - Files.deleteIfExists(zipFile); + try { + log.info("Reading ZIP file data"); + data = Files.readAllBytes(zipFile); + log.info("Successfully read {} bytes from ZIP file", data.length); + } catch (IOException e) { + log.error("Error reading ZIP file data", e); + } + + try { + log.info("Deleting temporary ZIP file"); + boolean deleted = Files.deleteIfExists(zipFile); + log.info("Temporary ZIP file deleted: {}", deleted); + } catch (IOException e) { + log.error("Error deleting temporary ZIP file", e); + } } + log.info("Returning response with {} bytes of data", data != null ? data.length : 0); return WebResponseUtils.bytesToWebResponse( data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM); } @@ -91,63 +147,230 @@ public class SplitPdfBySizeController { private void handleSplitBySize( PDDocument sourceDocument, long maxBytes, ZipOutputStream zipOut, String baseFilename) throws IOException { - long currentSize = 0; + log.info("Starting handleSplitBySize with maxBytes={}", maxBytes); + PDDocument currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); int fileIndex = 1; + int totalPages = sourceDocument.getNumberOfPages(); + int pageAdded = 0; - for (int pageIndex = 0; pageIndex < sourceDocument.getNumberOfPages(); pageIndex++) { + // Smart size check frequency - check more often with larger documents + int baseCheckFrequency = 5; + + for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { PDPage page = sourceDocument.getPage(pageIndex); - ByteArrayOutputStream pageOutputStream = new ByteArrayOutputStream(); + log.info("Processing page {} of {}", pageIndex + 1, totalPages); - try (PDDocument tempDoc = new PDDocument()) { - PDPage importedPage = tempDoc.importPage(page); // This creates a new PDPage object - tempDoc.save(pageOutputStream); - } + // Add the page to current document + PDPage newPage = new PDPage(page.getCOSObject()); + currentDoc.addPage(newPage); + pageAdded++; - long pageSize = pageOutputStream.size(); - if (currentSize + pageSize > maxBytes) { - if (currentDoc.getNumberOfPages() > 0) { + // Dynamic size checking based on document size and page count + boolean shouldCheckSize = + (pageAdded % baseCheckFrequency == 0) + || (pageIndex == totalPages - 1) + || (pageAdded >= 20); // Always check after 20 pages + + if (shouldCheckSize) { + log.info("Performing size check after {} pages", pageAdded); + ByteArrayOutputStream checkSizeStream = new ByteArrayOutputStream(); + currentDoc.save(checkSizeStream); + long actualSize = checkSizeStream.size(); + log.info("Current document size: {} bytes (max: {} bytes)", actualSize, maxBytes); + + if (actualSize > maxBytes) { + // We exceeded the limit - remove the last page and save + if (currentDoc.getNumberOfPages() > 1) { + currentDoc.removePage(currentDoc.getNumberOfPages() - 1); + pageIndex--; // Process this page again in the next document + log.info("Size limit exceeded - removed last page"); + } + + log.info( + "Saving document with {} pages as part {}", + currentDoc.getNumberOfPages(), + fileIndex); saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); - currentDoc.close(); // Make sure to close the document currentDoc = new PDDocument(); - currentSize = 0; + pageAdded = 0; + } else if (pageIndex < totalPages - 1) { + // We're under the limit, calculate if we might fit more pages + // Try to predict how many more similar pages might fit + if (actualSize < maxBytes * 0.75 && pageAdded > 0) { + // Rather than using a ratio, look ahead to test actual upcoming pages + int pagesToLookAhead = Math.min(5, totalPages - pageIndex - 1); + + if (pagesToLookAhead > 0) { + log.info( + "Testing {} upcoming pages for potential addition", + pagesToLookAhead); + + // Create a temp document with current pages + look-ahead pages + PDDocument testDoc = new PDDocument(); + // First copy existing pages + for (int i = 0; i < currentDoc.getNumberOfPages(); i++) { + testDoc.addPage(new PDPage(currentDoc.getPage(i).getCOSObject())); + } + + // Try adding look-ahead pages one by one + int extraPagesAdded = 0; + for (int i = 0; i < pagesToLookAhead; i++) { + int testPageIndex = pageIndex + 1 + i; + PDPage testPage = sourceDocument.getPage(testPageIndex); + testDoc.addPage(new PDPage(testPage.getCOSObject())); + + // Check if we're still under size + ByteArrayOutputStream testStream = new ByteArrayOutputStream(); + testDoc.save(testStream); + long testSize = testStream.size(); + + if (testSize <= maxBytes) { + extraPagesAdded++; + log.info( + "Test: Can add page {} (size would be {})", + testPageIndex + 1, + testSize); + } else { + log.info( + "Test: Cannot add page {} (size would be {})", + testPageIndex + 1, + testSize); + break; + } + } + + testDoc.close(); + + // Add the pages we verified would fit + if (extraPagesAdded > 0) { + log.info("Adding {} verified pages ahead", extraPagesAdded); + for (int i = 0; i < extraPagesAdded; i++) { + int extraPageIndex = pageIndex + 1 + i; + PDPage extraPage = sourceDocument.getPage(extraPageIndex); + currentDoc.addPage(new PDPage(extraPage.getCOSObject())); + } + pageIndex += extraPagesAdded; + pageAdded += extraPagesAdded; + } + } + } } } - - PDPage newPage = new PDPage(page.getCOSObject()); // Re-create the page - currentDoc.addPage(newPage); - currentSize += pageSize; } - if (currentDoc.getNumberOfPages() != 0) { + // Save final document if it has any pages + if (currentDoc.getNumberOfPages() > 0) { + log.info( + "Saving final document with {} pages as part {}", + currentDoc.getNumberOfPages(), + fileIndex); saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); - currentDoc.close(); } + + log.info("Completed handleSplitBySize with {} document parts created", fileIndex - 1); } private void handleSplitByPageCount( PDDocument sourceDocument, int pageCount, ZipOutputStream zipOut, String baseFilename) throws IOException { + log.info("Starting handleSplitByPageCount with pageCount={}", pageCount); int currentPageCount = 0; - PDDocument currentDoc = - pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); - int fileIndex = 1; - for (PDPage page : sourceDocument.getPages()) { - currentDoc.addPage(page); - currentPageCount++; + log.info("Creating initial output document"); + PDDocument currentDoc = null; + try { + currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); + log.info("Successfully created initial output document"); + } catch (Exception e) { + log.error("Error creating initial output document", e); + throw new IOException("Failed to create initial output document", e); + } - if (currentPageCount == pageCount) { - // Save and reset current document - saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); - currentDoc = new PDDocument(); - currentPageCount = 0; + int fileIndex = 1; + int pageIndex = 0; + int totalPages = sourceDocument.getNumberOfPages(); + log.info("Processing {} pages", totalPages); + + try { + for (PDPage page : sourceDocument.getPages()) { + pageIndex++; + log.info("Processing page {} of {}", pageIndex, totalPages); + + try { + log.info("Adding page {} to current document", pageIndex); + currentDoc.addPage(page); + log.info("Successfully added page {} to current document", pageIndex); + } catch (Exception e) { + log.error("Error adding page {} to current document", pageIndex, e); + throw new IOException("Failed to add page to document", e); + } + + currentPageCount++; + log.info("Current page count: {}/{}", currentPageCount, pageCount); + + if (currentPageCount == pageCount) { + log.info( + "Reached target page count ({}), saving current document as part {}", + pageCount, + fileIndex); + try { + saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); + log.info("Successfully saved document part {}", fileIndex - 1); + } catch (Exception e) { + log.error("Error saving document part {}", fileIndex - 1, e); + throw e; + } + + try { + log.info("Creating new document for next part"); + currentDoc = new PDDocument(); + log.info("Successfully created new document"); + } catch (Exception e) { + log.error("Error creating new document for next part", e); + throw new IOException("Failed to create new document", e); + } + + currentPageCount = 0; + log.info("Reset current page count to 0"); + } + } + } catch (Exception e) { + log.error("Error iterating through pages", e); + throw new IOException("Failed to iterate through pages", e); + } + + // Add the last document if it contains any pages + try { + if (currentDoc.getPages().getCount() != 0) { + log.info( + "Saving final document with {} pages as part {}", + currentDoc.getPages().getCount(), + fileIndex); + try { + saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); + log.info("Successfully saved final document part {}", fileIndex - 1); + } catch (Exception e) { + log.error("Error saving final document part {}", fileIndex - 1, e); + throw e; + } + } else { + log.info("Final document has no pages, skipping"); + } + } catch (Exception e) { + log.error("Error checking or saving final document", e); + throw new IOException("Failed to process final document", e); + } finally { + try { + log.info("Closing final document"); + currentDoc.close(); + log.info("Successfully closed final document"); + } catch (Exception e) { + log.error("Error closing final document", e); } } - // Add the last document if it contains any pages - if (currentDoc.getPages().getCount() != 0) { - saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); - } + + log.info("Completed handleSplitByPageCount with {} document parts created", fileIndex - 1); } private void handleSplitByDocCount( @@ -156,35 +379,101 @@ public class SplitPdfBySizeController { ZipOutputStream zipOut, String baseFilename) throws IOException { + log.info("Starting handleSplitByDocCount with documentCount={}", documentCount); int totalPageCount = sourceDocument.getNumberOfPages(); + log.info("Total pages in source document: {}", totalPageCount); + int pagesPerDocument = totalPageCount / documentCount; int extraPages = totalPageCount % documentCount; + log.info("Pages per document: {}, Extra pages: {}", pagesPerDocument, extraPages); + int currentPageIndex = 0; int fileIndex = 1; - for (int i = 0; i < documentCount; i++) { - PDDocument currentDoc = - pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); - int pagesToAdd = pagesPerDocument + (i < extraPages ? 1 : 0); - for (int j = 0; j < pagesToAdd; j++) { - currentDoc.addPage(sourceDocument.getPage(currentPageIndex++)); + for (int i = 0; i < documentCount; i++) { + log.info("Creating document {} of {}", i + 1, documentCount); + PDDocument currentDoc = null; + try { + currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); + log.info("Successfully created document {} of {}", i + 1, documentCount); + } catch (Exception e) { + log.error("Error creating document {} of {}", i + 1, documentCount, e); + throw new IOException("Failed to create document", e); } - saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); + int pagesToAdd = pagesPerDocument + (i < extraPages ? 1 : 0); + log.info("Adding {} pages to document {}", pagesToAdd, i + 1); + + for (int j = 0; j < pagesToAdd; j++) { + try { + log.info( + "Adding page {} (index {}) to document {}", + j + 1, + currentPageIndex, + i + 1); + currentDoc.addPage(sourceDocument.getPage(currentPageIndex)); + log.info("Successfully added page {} to document {}", j + 1, i + 1); + currentPageIndex++; + } catch (Exception e) { + log.error("Error adding page {} to document {}", j + 1, i + 1, e); + throw new IOException("Failed to add page to document", e); + } + } + + try { + log.info("Saving document {} with {} pages", i + 1, pagesToAdd); + saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); + log.info("Successfully saved document {}", i + 1); + } catch (Exception e) { + log.error("Error saving document {}", i + 1, e); + throw e; + } } + + log.info("Completed handleSplitByDocCount with {} documents created", documentCount); } private void saveDocumentToZip( PDDocument document, ZipOutputStream zipOut, String baseFilename, int index) throws IOException { + log.info("Starting saveDocumentToZip for document part {}", index); ByteArrayOutputStream outStream = new ByteArrayOutputStream(); - document.save(outStream); - document.close(); // Close the document to free resources - // Create a new zip entry - ZipEntry zipEntry = new ZipEntry(baseFilename + "_" + index + ".pdf"); - zipOut.putNextEntry(zipEntry); - zipOut.write(outStream.toByteArray()); - zipOut.closeEntry(); + try { + log.info("Saving document part {} to byte array", index); + document.save(outStream); + log.info("Successfully saved document part {} ({} bytes)", index, outStream.size()); + } catch (Exception e) { + log.error("Error saving document part {} to byte array", index, e); + throw new IOException("Failed to save document to byte array", e); + } + + try { + log.info("Closing document part {}", index); + document.close(); + log.info("Successfully closed document part {}", index); + } catch (Exception e) { + log.error("Error closing document part {}", index, e); + // Continue despite close error + } + + try { + // Create a new zip entry + String entryName = baseFilename + "_" + index + ".pdf"; + log.info("Creating ZIP entry: {}", entryName); + ZipEntry zipEntry = new ZipEntry(entryName); + zipOut.putNextEntry(zipEntry); + + byte[] bytes = outStream.toByteArray(); + log.info("Writing {} bytes to ZIP entry", bytes.length); + zipOut.write(bytes); + + log.info("Closing ZIP entry"); + zipOut.closeEntry(); + log.info("Successfully added document part {} to ZIP", index); + } catch (Exception e) { + log.error("Error adding document part {} to ZIP", index, e); + throw new IOException("Failed to add document to ZIP file", e); + } } } diff --git a/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java b/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java index 07b2f4d4..4e851a48 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java @@ -4,7 +4,6 @@ import java.awt.geom.AffineTransform; import java.io.ByteArrayOutputStream; import java.io.IOException; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -46,7 +45,7 @@ public class ToSinglePageController { throws IOException { // Load the source document - PDDocument sourceDocument = Loader.loadPDF(request.getFileInput().getBytes()); + PDDocument sourceDocument = pdfDocumentFactory.load(request.getFileInput().getBytes()); // Calculate total height and max width float totalHeight = 0; diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java index 104ed0fa..3e277ab1 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java @@ -13,7 +13,6 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; import org.apache.commons.io.FileUtils; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.rendering.ImageType; @@ -91,6 +90,7 @@ public class ConvertImgPDFController { result = PdfUtils.convertFromPdf( + pdfDocumentFactory, newPdfBytes, "webp".equalsIgnoreCase(imageFormat) ? "png" @@ -245,7 +245,7 @@ public class ConvertImgPDFController { */ private byte[] rearrangePdfPages(byte[] pdfBytes, String[] pageOrderArr) throws IOException { // Load the input PDF - PDDocument document = Loader.loadPDF(pdfBytes); + PDDocument document = pdfDocumentFactory.load(pdfBytes); int totalPages = document.getNumberOfPages(); List newPageOrder = GeneralUtils.parsePageList(pageOrderArr, totalPages, false); diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java index 030ede95..58b6fd7f 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java +++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java @@ -2,9 +2,9 @@ package stirling.software.SPDF.controller.api.converters; import java.io.IOException; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; @@ -21,6 +21,7 @@ import stirling.software.SPDF.model.api.PDFFile; import stirling.software.SPDF.model.api.converters.PdfToPresentationRequest; import stirling.software.SPDF.model.api.converters.PdfToTextOrRTFRequest; import stirling.software.SPDF.model.api.converters.PdfToWordRequest; +import stirling.software.SPDF.service.CustomPDDocumentFactory; import stirling.software.SPDF.utils.PDFToFile; import stirling.software.SPDF.utils.WebResponseUtils; @@ -29,6 +30,13 @@ import stirling.software.SPDF.utils.WebResponseUtils; @Tag(name = "Convert", description = "Convert APIs") public class ConvertPDFToOffice { + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public ConvertPDFToOffice(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + @PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation") @Operation( summary = "Convert PDF to Presentation format", @@ -54,7 +62,7 @@ public class ConvertPDFToOffice { MultipartFile inputFile = request.getFileInput(); String outputFormat = request.getOutputFormat(); if ("txt".equals(request.getOutputFormat())) { - try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) { PDFTextStripper stripper = new PDFTextStripper(); String text = stripper.getText(document); return WebResponseUtils.bytesToWebResponse( diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java index d88bb02d..54620113 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java @@ -12,8 +12,8 @@ import java.util.zip.ZipOutputStream; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.QuoteMode; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.ContentDisposition; import org.springframework.http.HttpHeaders; import org.springframework.http.MediaType; @@ -30,6 +30,7 @@ import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.api.PDFWithPageNums; import stirling.software.SPDF.pdf.FlexibleCSVWriter; +import stirling.software.SPDF.service.CustomPDDocumentFactory; import technology.tabula.ObjectExtractor; import technology.tabula.Page; @@ -42,6 +43,13 @@ import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; @Slf4j public class ExtractCSVController { + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public ExtractCSVController(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + @PostMapping(value = "/pdf/csv", consumes = "multipart/form-data") @Operation( summary = "Extracts a CSV document from a PDF", @@ -51,7 +59,7 @@ public class ExtractCSVController { String baseName = getBaseName(form.getFileInput().getOriginalFilename()); List csvEntries = new ArrayList<>(); - try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(form.getFileInput().getBytes())) { List pages = form.getPageNumbersList(document, true); SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); CSVFormat format = diff --git a/src/main/java/stirling/software/SPDF/controller/api/filters/FilterController.java b/src/main/java/stirling/software/SPDF/controller/api/filters/FilterController.java index 9beaf413..7ddded2a 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/filters/FilterController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/filters/FilterController.java @@ -2,10 +2,10 @@ package stirling.software.SPDF.controller.api.filters; import java.io.IOException; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.PostMapping; @@ -23,6 +23,7 @@ import stirling.software.SPDF.model.api.filter.ContainsTextRequest; import stirling.software.SPDF.model.api.filter.FileSizeRequest; import stirling.software.SPDF.model.api.filter.PageRotationRequest; import stirling.software.SPDF.model.api.filter.PageSizeRequest; +import stirling.software.SPDF.service.CustomPDDocumentFactory; import stirling.software.SPDF.utils.PdfUtils; import stirling.software.SPDF.utils.WebResponseUtils; @@ -31,6 +32,13 @@ import stirling.software.SPDF.utils.WebResponseUtils; @Tag(name = "Filter", description = "Filter APIs") public class FilterController { + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public FilterController(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + @PostMapping(consumes = "multipart/form-data", value = "/filter-contains-text") @Operation( summary = "Checks if a PDF contains set text, returns true if does", @@ -41,7 +49,7 @@ public class FilterController { String text = request.getText(); String pageNumber = request.getPageNumbers(); - PDDocument pdfDocument = Loader.loadPDF(inputFile.getBytes()); + PDDocument pdfDocument = pdfDocumentFactory.load(inputFile.getBytes()); if (PdfUtils.hasText(pdfDocument, pageNumber, text)) return WebResponseUtils.pdfDocToWebResponse( pdfDocument, Filenames.toSimpleFileName(inputFile.getOriginalFilename())); @@ -58,7 +66,7 @@ public class FilterController { MultipartFile inputFile = request.getFileInput(); String pageNumber = request.getPageNumbers(); - PDDocument pdfDocument = Loader.loadPDF(inputFile.getBytes()); + PDDocument pdfDocument = pdfDocumentFactory.load(inputFile.getBytes()); if (PdfUtils.hasImages(pdfDocument, pageNumber)) return WebResponseUtils.pdfDocToWebResponse( pdfDocument, Filenames.toSimpleFileName(inputFile.getOriginalFilename())); @@ -75,7 +83,7 @@ public class FilterController { String pageCount = request.getPageCount(); String comparator = request.getComparator(); // Load the PDF - PDDocument document = Loader.loadPDF(inputFile.getBytes()); + PDDocument document = pdfDocumentFactory.load(inputFile.getBytes()); int actualPageCount = document.getNumberOfPages(); boolean valid = false; @@ -109,7 +117,7 @@ public class FilterController { String comparator = request.getComparator(); // Load the PDF - PDDocument document = Loader.loadPDF(inputFile.getBytes()); + PDDocument document = pdfDocumentFactory.load(inputFile.getBytes()); PDPage firstPage = document.getPage(0); PDRectangle actualPageSize = firstPage.getMediaBox(); @@ -185,7 +193,7 @@ public class FilterController { String comparator = request.getComparator(); // Load the PDF - PDDocument document = Loader.loadPDF(inputFile.getBytes()); + PDDocument document = pdfDocumentFactory.load(inputFile.getBytes()); // Get the rotation of the first page PDPage firstPage = document.getPage(0); diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/AutoRenameController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/AutoRenameController.java index b85b7fdf..0bbe7e6b 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/AutoRenameController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/AutoRenameController.java @@ -5,10 +5,10 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.PostMapping; @@ -23,6 +23,7 @@ import io.swagger.v3.oas.annotations.tags.Tag; import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.api.misc.ExtractHeaderRequest; +import stirling.software.SPDF.service.CustomPDDocumentFactory; import stirling.software.SPDF.utils.WebResponseUtils; @RestController @@ -34,6 +35,13 @@ public class AutoRenameController { private static final float TITLE_FONT_SIZE_THRESHOLD = 20.0f; private static final int LINE_LIMIT = 200; + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public AutoRenameController(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + @PostMapping(consumes = "multipart/form-data", value = "/auto-rename") @Operation( summary = "Extract header from PDF file", @@ -44,7 +52,7 @@ public class AutoRenameController { MultipartFile file = request.getFileInput(); Boolean useFirstTextAsFallback = request.isUseFirstTextAsFallback(); - PDDocument document = Loader.loadPDF(file.getBytes()); + PDDocument document = pdfDocumentFactory.load(file.getBytes()); PDFTextStripper reader = new PDFTextStripper() { List lineInfos = new ArrayList<>(); diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/AutoSplitPdfController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/AutoSplitPdfController.java index 50e7032c..16b02114 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/AutoSplitPdfController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/AutoSplitPdfController.java @@ -111,9 +111,9 @@ public class AutoSplitPdfController { summary = "Auto split PDF pages into separate documents", description = "This endpoint accepts a PDF file, scans each page for a specific QR code, and" - + " splits the document at the QR code boundaries. The output is a zip file" - + " containing each separate PDF document. Input:PDF Output:ZIP-PDF" - + " Type:SISO") + + " splits the document at the QR code boundaries. The output is a zip file" + + " containing each separate PDF document. Input:PDF Output:ZIP-PDF" + + " Type:SISO") public ResponseEntity autoSplitPdf(@ModelAttribute AutoSplitPdfRequest request) throws IOException { MultipartFile file = request.getFileInput(); diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/BlankPageController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/BlankPageController.java index 9c84a8e9..7fee8e2a 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/BlankPageController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/BlankPageController.java @@ -8,7 +8,6 @@ import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageTree; @@ -85,7 +84,7 @@ public class BlankPageController { int threshold = request.getThreshold(); float whitePercent = request.getWhitePercent(); - try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) { PDPageTree pages = document.getDocumentCatalog().getPages(); PDFTextStripper textStripper = new PDFTextStripper(); diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java index 759dc72f..deb4e7aa 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java @@ -18,7 +18,6 @@ import javax.imageio.ImageWriter; import javax.imageio.plugins.jpeg.JPEGImageWriteParam; import javax.imageio.stream.ImageOutputStream; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -59,7 +58,8 @@ public class CompressController { this.pdfDocumentFactory = pdfDocumentFactory; } - private void compressImagesInPDF(Path pdfFile, double scaleFactor, float jpegQuality) throws Exception { + private void compressImagesInPDF(Path pdfFile, double scaleFactor, float jpegQuality) + throws Exception { byte[] fileBytes = Files.readAllBytes(pdfFile); long originalFileSize = fileBytes.length; log.info( @@ -71,7 +71,7 @@ public class CompressController { // Track processed images to avoid recompression Set processedImages = new HashSet<>(); - try (PDDocument doc = Loader.loadPDF(fileBytes)) { + try (PDDocument doc = pdfDocumentFactory.load(fileBytes)) { int totalImages = 0; int compressedImages = 0; int skippedImages = 0; @@ -204,10 +204,12 @@ public class CompressController { // Choose appropriate format and compression String format = bufferedImage.getColorModel().hasAlpha() ? "png" : "jpeg"; - // First get the actual size of the original image by encoding it to the chosen format + // First get the actual size of the original image by encoding it to the chosen + // format ByteArrayOutputStream originalImageStream = new ByteArrayOutputStream(); if (format.equals("jpeg")) { - // Get the best available JPEG writer (prioritizes TwelveMonkeys if available) + // Get the best available JPEG writer (prioritizes TwelveMonkeys if + // available) Iterator writers = ImageIO.getImageWritersByFormatName("jpeg"); ImageWriter writer = null; @@ -430,8 +432,8 @@ public class CompressController { // All levels (1-9): Apply QPDF compression if (!qpdfCompressionApplied) { - long preQpdfSize = Files.size(tempInputFile); - log.info("Pre-QPDF file size: {}", GeneralUtils.formatBytes(preQpdfSize)); + long preQpdfSize = Files.size(tempInputFile); + log.info("Pre-QPDF file size: {}", GeneralUtils.formatBytes(preQpdfSize)); // For levels 1-3, map to qpdf compression levels 1-9 int qpdfCompressionLevel = optimizeLevel; @@ -472,8 +474,7 @@ public class CompressController { double qpdfReduction = 100.0 - ((postQpdfSize * 100.0) / preQpdfSize); log.info( "Post-QPDF file size: {} (reduced by {:.1f}%)", - GeneralUtils.formatBytes(postQpdfSize), - qpdfReduction); + GeneralUtils.formatBytes(postQpdfSize), qpdfReduction); } else { tempOutputFile = tempInputFile; diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/DecompressPdfController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/DecompressPdfController.java new file mode 100644 index 00000000..626f3568 --- /dev/null +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/DecompressPdfController.java @@ -0,0 +1,145 @@ +package stirling.software.SPDF.controller.api.misc; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.HashSet; +import java.util.Set; + +import org.apache.pdfbox.cos.*; +import org.apache.pdfbox.io.IOUtils; +import org.apache.pdfbox.pdfwriter.compress.CompressParameters; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.ModelAttribute; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.multipart.MultipartFile; + +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; + +import lombok.extern.slf4j.Slf4j; + +import stirling.software.SPDF.model.api.PDFFile; +import stirling.software.SPDF.service.CustomPDDocumentFactory; +import stirling.software.SPDF.utils.WebResponseUtils; + +@RestController +@RequestMapping("/api/v1/misc") +@Slf4j +@Tag(name = "Misc", description = "Miscellaneous APIs") +public class DecompressPdfController { + + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public DecompressPdfController(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + + @PostMapping(value = "/decompress-pdf", consumes = "multipart/form-data") + @Operation( + summary = "Decompress PDF streams", + description = "Fully decompresses all PDF streams including text content") + public ResponseEntity decompressPdf(@ModelAttribute PDFFile request) + throws IOException { + + MultipartFile file = request.getFileInput(); + + try (PDDocument document = pdfDocumentFactory.load(file.getBytes())) { + // Process all objects in document + processAllObjects(document); + + // Save with explicit no compression + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + document.save(baos, CompressParameters.NO_COMPRESSION); + + String outputFilename = + file.getOriginalFilename().replaceFirst("\\.(?=[^.]+$)", "_decompressed."); + return WebResponseUtils.bytesToWebResponse( + baos.toByteArray(), outputFilename, MediaType.APPLICATION_PDF); + } + } + + private void processAllObjects(PDDocument document) { + Set processed = new HashSet<>(); + COSDocument cosDoc = document.getDocument(); + + // Process all objects in the document + for (COSObjectKey key : cosDoc.getXrefTable().keySet()) { + COSObject obj = cosDoc.getObjectFromPool(key); + processObject(obj, processed); + } + } + + private void processObject(COSBase obj, Set processed) { + // Skip null objects or already processed objects to avoid infinite recursion + if (obj == null || processed.contains(obj)) return; + processed.add(obj); + + if (obj instanceof COSObject cosObj) { + processObject(cosObj.getObject(), processed); + } else if (obj instanceof COSDictionary dict) { + processDictionary(dict, processed); + } else if (obj instanceof COSArray array) { + processArray(array, processed); + } + } + + private void processDictionary(COSDictionary dict, Set processed) { + // Process all dictionary entries + for (COSName key : dict.keySet()) { + processObject(dict.getDictionaryObject(key), processed); + } + + // If this is a stream, decompress it + if (dict instanceof COSStream stream) { + decompressStream(stream); + } + } + + private void processArray(COSArray array, Set processed) { + // Process all array elements + for (int i = 0; i < array.size(); i++) { + processObject(array.get(i), processed); + } + } + + private void decompressStream(COSStream stream) { + try { + log.debug("Processing stream: {}", stream); + + // Only remove filter information if it exists + if (stream.containsKey(COSName.FILTER) + || stream.containsKey(COSName.DECODE_PARMS) + || stream.containsKey(COSName.D)) { + + // Read the decompressed content first + byte[] decompressedBytes; + try (COSInputStream is = stream.createInputStream()) { + decompressedBytes = IOUtils.toByteArray(is); + } + + // Now remove filter information + stream.removeItem(COSName.FILTER); + stream.removeItem(COSName.DECODE_PARMS); + stream.removeItem(COSName.D); + + // Write the raw content back + try (OutputStream out = stream.createRawOutputStream()) { + out.write(decompressedBytes); + } + + // Set the Length to reflect the new stream size + stream.setInt(COSName.LENGTH, decompressedBytes.length); + } + } catch (IOException e) { + log.error("Error decompressing stream", e); + // Continue processing other streams even if this one fails + } + } +} diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImageScansController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImageScansController.java index 92c4ed42..3769bc23 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImageScansController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImageScansController.java @@ -14,9 +14,9 @@ import java.util.zip.ZipOutputStream; import javax.imageio.ImageIO; import org.apache.commons.io.FileUtils; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.PDFRenderer; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.PostMapping; @@ -32,6 +32,7 @@ import io.swagger.v3.oas.annotations.tags.Tag; import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.api.misc.ExtractImageScansRequest; +import stirling.software.SPDF.service.CustomPDDocumentFactory; import stirling.software.SPDF.utils.CheckProgramInstall; import stirling.software.SPDF.utils.ProcessExecutor; import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult; @@ -45,6 +46,13 @@ public class ExtractImageScansController { private static final String REPLACEFIRST = "[.][^.]+$"; + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public ExtractImageScansController(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + @PostMapping(consumes = "multipart/form-data", value = "/extract-image-scans") @Operation( summary = "Extract image scans from an input file", @@ -87,7 +95,8 @@ public class ExtractImageScansController { // Check if input file is a PDF if ("pdf".equalsIgnoreCase(extension)) { // Load PDF document - try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) { + try (PDDocument document = + pdfDocumentFactory.load(form.getFileInput().getBytes())) { PDFRenderer pdfRenderer = new PDFRenderer(document); pdfRenderer.setSubsamplingAllowed(true); int pageCount = document.getNumberOfPages(); diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java index 189b1a7e..4c5c60e3 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java @@ -20,11 +20,11 @@ import java.util.zip.ZipOutputStream; import javax.imageio.ImageIO; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; @@ -40,6 +40,7 @@ import io.swagger.v3.oas.annotations.tags.Tag; import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.api.PDFExtractImagesRequest; +import stirling.software.SPDF.service.CustomPDDocumentFactory; import stirling.software.SPDF.utils.ImageProcessingUtils; import stirling.software.SPDF.utils.WebResponseUtils; @@ -49,6 +50,13 @@ import stirling.software.SPDF.utils.WebResponseUtils; @Tag(name = "Misc", description = "Miscellaneous APIs") public class ExtractImagesController { + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public ExtractImagesController(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + @PostMapping(consumes = "multipart/form-data", value = "/extract-images") @Operation( summary = "Extract images from a PDF file", @@ -59,7 +67,7 @@ public class ExtractImagesController { MultipartFile file = request.getFileInput(); String format = request.getFormat(); boolean allowDuplicates = request.isAllowDuplicates(); - PDDocument document = Loader.loadPDF(file.getBytes()); + PDDocument document = pdfDocumentFactory.load(file.getBytes()); // Determine if multithreading should be used based on PDF size or number of pages boolean useMultithreading = shouldUseMultithreading(file, document); diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/FlattenController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/FlattenController.java index 9361e715..9b9126e1 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/FlattenController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/FlattenController.java @@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api.misc; import java.awt.image.BufferedImage; import java.io.IOException; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; @@ -51,7 +50,7 @@ public class FlattenController { public ResponseEntity flatten(@ModelAttribute FlattenRequest request) throws Exception { MultipartFile file = request.getFileInput(); - PDDocument document = Loader.loadPDF(file.getBytes()); + PDDocument document = pdfDocumentFactory.load(file.getBytes()); Boolean flattenOnlyForms = request.getFlattenOnlyForms(); if (Boolean.TRUE.equals(flattenOnlyForms)) { diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/MetadataController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/MetadataController.java index 4ebce319..69553e42 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/MetadataController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/MetadataController.java @@ -7,10 +7,10 @@ import java.util.Calendar; import java.util.Map; import java.util.Map.Entry; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.WebDataBinder; import org.springframework.web.bind.annotation.*; @@ -23,6 +23,7 @@ import io.swagger.v3.oas.annotations.tags.Tag; import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.api.misc.MetadataRequest; +import stirling.software.SPDF.service.CustomPDDocumentFactory; import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.propertyeditor.StringToMapPropertyEditor; @@ -32,6 +33,13 @@ import stirling.software.SPDF.utils.propertyeditor.StringToMapPropertyEditor; @Tag(name = "Misc", description = "Miscellaneous APIs") public class MetadataController { + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public MetadataController(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + private String checkUndefined(String entry) { // Check if the string is "undefined" if ("undefined".equals(entry)) { @@ -76,7 +84,7 @@ public class MetadataController { allRequestParams = new java.util.HashMap(); } // Load the PDF file into a PDDocument - PDDocument document = Loader.loadPDF(pdfFile.getBytes()); + PDDocument document = pdfDocumentFactory.load(pdfFile.getBytes()); // Get the document information from the PDF PDDocumentInformation info = document.getDocumentInformation(); diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/PageNumbersController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/PageNumbersController.java index 040d5e96..c3bbb721 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/PageNumbersController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/PageNumbersController.java @@ -73,17 +73,16 @@ public class PageNumbersController { case "x-large": marginFactor = 0.075f; break; - default: marginFactor = 0.035f; break; } float fontSize = font_size; - if (pagesToNumber == null || pagesToNumber.length() == 0) { + if (pagesToNumber == null || pagesToNumber.isEmpty()) { pagesToNumber = "all"; } - if (customText == null || customText.length() == 0) { + if (customText == null || customText.isEmpty()) { customText = "{n}"; } List pagesToNumberList = @@ -94,63 +93,69 @@ public class PageNumbersController { PDRectangle pageSize = page.getMediaBox(); String text = - customText != null - ? customText - .replace("{n}", String.valueOf(pageNumber)) - .replace("{total}", String.valueOf(document.getNumberOfPages())) - .replace( - "{filename}", - Filenames.toSimpleFileName(file.getOriginalFilename()) - .replaceFirst("[.][^.]+$", "")) - : String.valueOf(pageNumber); + customText + .replace("{n}", String.valueOf(pageNumber)) + .replace("{total}", String.valueOf(document.getNumberOfPages())) + .replace( + "{filename}", + Filenames.toSimpleFileName(file.getOriginalFilename()) + .replaceFirst("[.][^.]+$", "")); + + PDType1Font currentFont = + switch (font_type.toLowerCase()) { + case "courier" -> new PDType1Font(Standard14Fonts.FontName.COURIER); + case "times" -> new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN); + default -> new PDType1Font(Standard14Fonts.FontName.HELVETICA); + }; float x, y; - int xGroup = (position - 1) % 3; - int yGroup = 2 - (position - 1) / 3; + if (position == 5) { + // Calculate text width and font metrics + float textWidth = currentFont.getStringWidth(text) / 1000 * fontSize; - switch (xGroup) { - case 0: // left - x = pageSize.getLowerLeftX() + marginFactor * pageSize.getWidth(); - break; - case 1: // center - x = pageSize.getLowerLeftX() + (pageSize.getWidth() / 2); - break; - default: // right - x = pageSize.getUpperRightX() - marginFactor * pageSize.getWidth(); - break; - } + float ascent = currentFont.getFontDescriptor().getAscent() / 1000 * fontSize; + float descent = currentFont.getFontDescriptor().getDescent() / 1000 * fontSize; - switch (yGroup) { - case 0: // bottom - y = pageSize.getLowerLeftY() + marginFactor * pageSize.getHeight(); - break; - case 1: // middle - y = pageSize.getLowerLeftY() + (pageSize.getHeight() / 2); - break; - default: // top - y = pageSize.getUpperRightY() - marginFactor * pageSize.getHeight(); - break; + float centerX = pageSize.getLowerLeftX() + (pageSize.getWidth() / 2); + float centerY = pageSize.getLowerLeftY() + (pageSize.getHeight() / 2); + + x = centerX - (textWidth / 2); + y = centerY - (ascent + descent) / 2; + } else { + int xGroup = (position - 1) % 3; + int yGroup = 2 - (position - 1) / 3; + + x = + switch (xGroup) { + case 0 -> + pageSize.getLowerLeftX() + + marginFactor * pageSize.getWidth(); // left + case 1 -> + pageSize.getLowerLeftX() + (pageSize.getWidth() / 2); // center + default -> + pageSize.getUpperRightX() + - marginFactor * pageSize.getWidth(); // right + }; + + y = + switch (yGroup) { + case 0 -> + pageSize.getLowerLeftY() + + marginFactor * pageSize.getHeight(); // bottom + case 1 -> + pageSize.getLowerLeftY() + (pageSize.getHeight() / 2); // middle + default -> + pageSize.getUpperRightY() + - marginFactor * pageSize.getHeight(); // top + }; } PDPageContentStream contentStream = new PDPageContentStream( document, page, PDPageContentStream.AppendMode.APPEND, true, true); contentStream.beginText(); - switch (font_type.toLowerCase()) { - case "helvetica": - contentStream.setFont( - new PDType1Font(Standard14Fonts.FontName.HELVETICA), fontSize); - break; - case "courier": - contentStream.setFont( - new PDType1Font(Standard14Fonts.FontName.COURIER), fontSize); - break; - case "times": - contentStream.setFont( - new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN), fontSize); - break; - } + contentStream.setFont(currentFont, fontSize); contentStream.newLineAtOffset(x, y); contentStream.showText(text); contentStream.endText(); diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/ShowJavascript.java b/src/main/java/stirling/software/SPDF/controller/api/misc/ShowJavascript.java index d252800e..a3b9dbdc 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/ShowJavascript.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/ShowJavascript.java @@ -3,10 +3,10 @@ package stirling.software.SPDF.controller.api.misc; import java.nio.charset.StandardCharsets; import java.util.Map; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; @@ -20,6 +20,7 @@ import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; import stirling.software.SPDF.model.api.PDFFile; +import stirling.software.SPDF.service.CustomPDDocumentFactory; import stirling.software.SPDF.utils.WebResponseUtils; @RestController @@ -27,6 +28,13 @@ import stirling.software.SPDF.utils.WebResponseUtils; @Tag(name = "Misc", description = "Miscellaneous APIs") public class ShowJavascript { + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public ShowJavascript(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + @PostMapping(consumes = "multipart/form-data", value = "/show-javascript") @Operation( summary = "Grabs all JS from a PDF and returns a single JS file with all code", @@ -35,7 +43,7 @@ public class ShowJavascript { MultipartFile inputFile = request.getFileInput(); String script = ""; - try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) { + try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) { if (document.getDocumentCatalog() != null && document.getDocumentCatalog().getNames() != null) { diff --git a/src/main/java/stirling/software/SPDF/controller/api/security/CertSignController.java b/src/main/java/stirling/software/SPDF/controller/api/security/CertSignController.java index 3a19e9b6..3dc19098 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/security/CertSignController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/security/CertSignController.java @@ -130,8 +130,8 @@ public class CertSignController { summary = "Sign PDF with a Digital Certificate", description = "This endpoint accepts a PDF file, a digital certificate and related" - + " information to sign the PDF. It then returns the digitally signed PDF" - + " file. Input:PDF Output:PDF Type:SISO") + + " information to sign the PDF. It then returns the digitally signed PDF" + + " file. Input:PDF Output:PDF Type:SISO") public ResponseEntity signPDFWithCert(@ModelAttribute SignPDFWithCertRequest request) throws Exception { MultipartFile pdf = request.getFileInput(); diff --git a/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java b/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java index 1f30bccf..a6387adb 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java +++ b/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java @@ -6,7 +6,6 @@ import java.nio.charset.StandardCharsets; import java.text.SimpleDateFormat; import java.util.*; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSInputStream; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSString; @@ -44,6 +43,7 @@ import org.apache.xmpbox.XMPMetadata; import org.apache.xmpbox.xml.DomXmpParser; import org.apache.xmpbox.xml.XmpParsingException; import org.apache.xmpbox.xml.XmpSerializer; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; @@ -62,6 +62,7 @@ import io.swagger.v3.oas.annotations.tags.Tag; import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.api.PDFFile; +import stirling.software.SPDF.service.CustomPDDocumentFactory; import stirling.software.SPDF.utils.WebResponseUtils; @RestController @@ -72,6 +73,13 @@ public class GetInfoOnPDF { static ObjectMapper objectMapper = new ObjectMapper(); + private final CustomPDDocumentFactory pdfDocumentFactory; + + @Autowired + public GetInfoOnPDF(CustomPDDocumentFactory pdfDocumentFactory) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + private static void addOutlinesToArray(PDOutlineItem outline, ArrayNode arrayNode) { if (outline == null) return; @@ -118,7 +126,7 @@ public class GetInfoOnPDF { @Operation(summary = "Summary here", description = "desc. Input:PDF Output:JSON Type:SISO") public ResponseEntity getPdfInfo(@ModelAttribute PDFFile request) throws IOException { MultipartFile inputFile = request.getFileInput(); - try (PDDocument pdfBoxDoc = Loader.loadPDF(inputFile.getBytes()); ) { + try (PDDocument pdfBoxDoc = pdfDocumentFactory.load(inputFile.getBytes()); ) { ObjectMapper objectMapper = new ObjectMapper(); ObjectNode jsonOutput = objectMapper.createObjectNode(); diff --git a/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 71e90c24..5bffe68e 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -3,8 +3,11 @@ package stirling.software.SPDF.controller.api.security; import java.awt.*; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -94,7 +97,10 @@ public class RedactController { private void redactAreas( List redactionAreas, PDDocument document, PDPageTree allPages) throws IOException { - Color redactColor = null; + // Group redaction areas by page + Map> redactionsByPage = new HashMap<>(); + + // Process and validate each redaction area for (RedactionArea redactionArea : redactionAreas) { if (redactionArea.getPage() == null || redactionArea.getPage() <= 0 @@ -102,23 +108,44 @@ public class RedactController { || redactionArea.getHeight() <= 0.0D || redactionArea.getWidth() == null || redactionArea.getWidth() <= 0.0D) continue; - PDPage page = allPages.get(redactionArea.getPage() - 1); + // Group by page number + redactionsByPage + .computeIfAbsent(redactionArea.getPage(), k -> new ArrayList<>()) + .add(redactionArea); + } + + // Process each page only once + for (Map.Entry> entry : redactionsByPage.entrySet()) { + Integer pageNumber = entry.getKey(); + List areasForPage = entry.getValue(); + + if (pageNumber > allPages.getCount()) { + continue; // Skip if page number is out of bounds + } + + PDPage page = allPages.get(pageNumber - 1); + PDRectangle box = page.getBBox(); + + // Create only one content stream per page PDPageContentStream contentStream = new PDPageContentStream( document, page, PDPageContentStream.AppendMode.APPEND, true, true); - redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK); - contentStream.setNonStrokingColor(redactColor); - float x = redactionArea.getX().floatValue(); - float y = redactionArea.getY().floatValue(); - float width = redactionArea.getWidth().floatValue(); - float height = redactionArea.getHeight().floatValue(); + // Process all redactions for this page + for (RedactionArea redactionArea : areasForPage) { + Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK); + contentStream.setNonStrokingColor(redactColor); - PDRectangle box = page.getBBox(); + float x = redactionArea.getX().floatValue(); + float y = redactionArea.getY().floatValue(); + float width = redactionArea.getWidth().floatValue(); + float height = redactionArea.getHeight().floatValue(); + + contentStream.addRect(x, box.getHeight() - y - height, width, height); + contentStream.fill(); + } - contentStream.addRect(x, box.getHeight() - y - height, width, height); - contentStream.fill(); contentStream.close(); } } diff --git a/src/main/java/stirling/software/SPDF/controller/api/security/SanitizeController.java b/src/main/java/stirling/software/SPDF/controller/api/security/SanitizeController.java index 9b42e23b..e075559e 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/security/SanitizeController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/security/SanitizeController.java @@ -45,7 +45,7 @@ public class SanitizeController { summary = "Sanitize a PDF file", description = "This endpoint processes a PDF file and removes specific elements based on the" - + " provided options. Input:PDF Output:PDF Type:SISO") + + " provided options. Input:PDF Output:PDF Type:SISO") public ResponseEntity sanitizePDF(@ModelAttribute SanitizePdfRequest request) throws IOException { MultipartFile inputFile = request.getFileInput(); diff --git a/src/main/java/stirling/software/SPDF/model/PDFText.java b/src/main/java/stirling/software/SPDF/model/PDFText.java index 9c460f3c..cb92c668 100644 --- a/src/main/java/stirling/software/SPDF/model/PDFText.java +++ b/src/main/java/stirling/software/SPDF/model/PDFText.java @@ -1,5 +1,8 @@ package stirling.software.SPDF.model; +import lombok.Data; + +@Data public class PDFText { private final int pageIndex; private final float x1; @@ -7,37 +10,4 @@ public class PDFText { private final float x2; private final float y2; private final String text; - - public PDFText(int pageIndex, float x1, float y1, float x2, float y2, String text) { - this.pageIndex = pageIndex; - this.x1 = x1; - this.y1 = y1; - this.x2 = x2; - this.y2 = y2; - this.text = text; - } - - public int getPageIndex() { - return pageIndex; - } - - public float getX1() { - return x1; - } - - public float getY1() { - return y1; - } - - public float getX2() { - return x2; - } - - public float getY2() { - return y2; - } - - public String getText() { - return text; - } } diff --git a/src/main/java/stirling/software/SPDF/service/CustomPDDocumentFactory.java b/src/main/java/stirling/software/SPDF/service/CustomPDDocumentFactory.java index 138420dd..e9bc3b1a 100644 --- a/src/main/java/stirling/software/SPDF/service/CustomPDDocumentFactory.java +++ b/src/main/java/stirling/software/SPDF/service/CustomPDDocumentFactory.java @@ -4,142 +4,355 @@ import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.concurrent.atomic.AtomicLong; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.IOUtils; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.io.RandomAccessReadBufferedFile; +import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction; +import org.apache.pdfbox.io.ScratchFile; import org.apache.pdfbox.pdmodel.PDDocument; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import org.springframework.web.multipart.MultipartFile; import lombok.extern.slf4j.Slf4j; -import stirling.software.SPDF.model.PdfMetadata; import stirling.software.SPDF.model.api.PDFFile; +/** + * Adaptive PDF document factory that optimizes memory usage based on file size and available system + * resources. + */ @Component @Slf4j public class CustomPDDocumentFactory { private final PdfMetadataService pdfMetadataService; - @Autowired + // Memory thresholds and limits + + private static final long SMALL_FILE_THRESHOLD = 10 * 1024 * 1024; // 10 MB + // Files smaller than this threshold are loaded entirely in memory for better performance. + // These files use IOUtils.createMemoryOnlyStreamCache() which keeps all document data in RAM. + // No temp files are created for document data, reducing I/O operations but consuming more + // memory. + + private static final long LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50 MB + // Files between SMALL and LARGE thresholds use file-based caching with ScratchFile, + // but are loaded directly from byte arrays if provided that way. + // When loading from byte arrays, once size exceeds this threshold, bytes are first + // written to temp files before loading to reduce memory pressure. + + private static final long LARGE_FILE_USAGE = 10 * 1024 * 1024; + + private static final long EXTREMELY_LARGE_THRESHOLD = 100 * 1024 * 1024; // 100 MB + // Files exceeding this threshold use specialized loading with RandomAccessReadBufferedFile + // which provides buffered access to the file without loading the entire content at once. + // These files are always processed using file-based caching with minimal memory footprint, + // trading some performance for significantly reduced memory usage. + // For extremely large PDFs, this prevents OutOfMemoryErrors at the cost of being more I/O + // bound. + + private static final double MIN_FREE_MEMORY_PERCENTAGE = 30.0; // 30% + private static final long MIN_FREE_MEMORY_BYTES = 4L * 1024 * 1024 * 1024; // 4 GB + + // Counter for tracking temporary resources + private static final AtomicLong tempCounter = new AtomicLong(0); + public CustomPDDocumentFactory(PdfMetadataService pdfMetadataService) { this.pdfMetadataService = pdfMetadataService; } - public PDDocument createNewDocument() throws IOException { - PDDocument document = new PDDocument(); - pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true); + /** + * Main entry point for loading a PDF document from a file. Automatically selects the most + * appropriate loading strategy. + */ + public PDDocument load(File file) throws IOException { + if (file == null) { + throw new IllegalArgumentException("File cannot be null"); + } + + long fileSize = file.length(); + log.info("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024)); + + return loadAdaptively(file, fileSize); + } + + /** Load a PDF from byte array with automatic optimization. */ + public PDDocument load(byte[] input) throws IOException { + if (input == null) { + throw new IllegalArgumentException("Input bytes cannot be null"); + } + + long dataSize = input.length; + log.info("Loading PDF from byte array, size: {}MB", dataSize / (1024 * 1024)); + + return loadAdaptively(input, dataSize); + } + + /** Load a PDF from InputStream with automatic optimization. */ + public PDDocument load(InputStream input) throws IOException { + if (input == null) { + throw new IllegalArgumentException("InputStream cannot be null"); + } + + // Since we don't know the size upfront, buffer to a temp file + Path tempFile = createTempFile("pdf-stream-"); + try { + Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING); + return loadAdaptively(tempFile.toFile(), Files.size(tempFile)); + } catch (IOException e) { + cleanupFile(tempFile); + throw e; + } + } + + private PDDocument loadAdaptively(Object source, long contentSize) throws IOException { + long maxMemory = Runtime.getRuntime().maxMemory(); + long freeMemory = Runtime.getRuntime().freeMemory(); + long totalMemory = Runtime.getRuntime().totalMemory(); + long usedMemory = totalMemory - freeMemory; + + // Calculate percentage of free memory + double freeMemoryPercent = (double) (maxMemory - usedMemory) / maxMemory * 100; + long actualFreeMemory = maxMemory - usedMemory; + + // Log memory status + log.info( + "Memory status - Free: {}MB ({}%), Used: {}MB, Max: {}MB", + actualFreeMemory / (1024 * 1024), + String.format("%.2f", freeMemoryPercent), + usedMemory / (1024 * 1024), + maxMemory / (1024 * 1024)); + + // Determine caching strategy based on both file size and available memory + StreamCacheCreateFunction cacheFunction; + + // If free memory is critically low, always use file-based caching + // In loadAdaptively method, replace current caching strategy decision with: + if (freeMemoryPercent < MIN_FREE_MEMORY_PERCENTAGE + || actualFreeMemory < MIN_FREE_MEMORY_BYTES) { + log.info( + "Low memory detected ({}%), forcing file-based cache", + String.format("%.2f", freeMemoryPercent)); + cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly()); + } else if (contentSize < SMALL_FILE_THRESHOLD) { + log.info("Using memory-only cache for small document ({}KB)", contentSize / 1024); + cacheFunction = IOUtils.createMemoryOnlyStreamCache(); + } else if (contentSize < LARGE_FILE_THRESHOLD) { + // For medium files (10-50MB), use a mixed approach + log.info( + "Using mixed memory/file cache for medium document ({}MB)", + contentSize / (1024 * 1024)); + cacheFunction = + createScratchFileCacheFunction(MemoryUsageSetting.setupMixed(LARGE_FILE_USAGE)); + } else { + log.info("Using file-based cache for large document"); + cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly()); + } + + PDDocument document; + if (source instanceof File file) { + document = loadFromFile(file, contentSize, cacheFunction); + } else if (source instanceof byte[] bytes) { + document = loadFromBytes(bytes, contentSize, cacheFunction); + } else { + throw new IllegalArgumentException("Unsupported source type: " + source.getClass()); + } + + postProcessDocument(document); return document; } + private StreamCacheCreateFunction createScratchFileCacheFunction(MemoryUsageSetting settings) { + return () -> { + try { + return new ScratchFile(settings); + } catch (IOException e) { + throw new RuntimeException("ScratchFile initialization failed", e); + } + }; + } + + private void postProcessDocument(PDDocument doc) throws IOException { + pdfMetadataService.setDefaultMetadata(doc); + removePassword(doc); + } + + private PDDocument loadFromFile(File file, long size, StreamCacheCreateFunction cache) + throws IOException { + if (size >= EXTREMELY_LARGE_THRESHOLD) { + log.info("Loading extremely large file via buffered access"); + return Loader.loadPDF(new RandomAccessReadBufferedFile(file), "", null, null, cache); + } + return Loader.loadPDF(file, "", null, null, cache); + } + + private PDDocument loadFromBytes(byte[] bytes, long size, StreamCacheCreateFunction cache) + throws IOException { + if (size >= SMALL_FILE_THRESHOLD) { + log.info("Writing large byte array to temp file"); + Path tempFile = createTempFile("pdf-bytes-"); + try { + Files.write(tempFile, bytes); + return Loader.loadPDF(tempFile.toFile(), "", null, null, cache); + } finally { + cleanupFile(tempFile); + } + } + return Loader.loadPDF(bytes, "", null, null, cache); + } + + public PDDocument createNewDocument(MemoryUsageSetting settings) throws IOException { + PDDocument doc = new PDDocument(createScratchFileCacheFunction(settings)); + pdfMetadataService.setDefaultMetadata(doc); + return doc; + } + + public PDDocument createNewDocument() throws IOException { + return createNewDocument(MemoryUsageSetting.setupTempFileOnly()); + } + + public byte[] saveToBytes(PDDocument document) throws IOException { + if (document.getNumberOfPages() < 10) { // Simple heuristic + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + document.save(baos); + return baos.toByteArray(); + } + } else { + Path tempFile = createTempFile("pdf-save-"); + try { + document.save(tempFile.toFile()); + return Files.readAllBytes(tempFile); + } finally { + cleanupFile(tempFile); + } + } + } + + // Improved password handling + private void removePassword(PDDocument document) throws IOException { + if (document.isEncrypted()) { + try { + document.setAllSecurityToBeRemoved(true); + } catch (Exception e) { + log.error("Decryption failed", e); + throw new IOException("PDF decryption failed", e); + } + } + } + + // Temp file handling with enhanced logging + private Path createTempFile(String prefix) throws IOException { + Path file = Files.createTempFile(prefix + tempCounter.incrementAndGet() + "-", ".tmp"); + log.info("Created temp file: {}", file); + return file; + } + + /** Create a uniquely named temporary directory */ + private Path createTempDirectory(String prefix) throws IOException { + return Files.createTempDirectory(prefix + tempCounter.incrementAndGet() + "-"); + } + + /** Clean up a temporary file */ + private void cleanupFile(Path file) { + try { + if (Files.deleteIfExists(file)) { + log.info("Deleted temp file: {}", file); + } + } catch (IOException e) { + log.info("Error deleting temp file {}", file, e); + } + } + + /** Create new document bytes based on an existing document */ public byte[] createNewBytesBasedOnOldDocument(byte[] oldDocument) throws IOException { - PDDocument document = Loader.loadPDF(oldDocument); - return createNewBytesBasedOnOldDocument(document); + try (PDDocument document = load(oldDocument)) { + return saveToBytes(document); + } } + /** Create new document bytes based on an existing document file */ public byte[] createNewBytesBasedOnOldDocument(File oldDocument) throws IOException { - PDDocument document = Loader.loadPDF(oldDocument); - return createNewBytesBasedOnOldDocument(document); + try (PDDocument document = load(oldDocument)) { + return saveToBytes(document); + } } + /** Create new document bytes based on an existing PDDocument */ public byte[] createNewBytesBasedOnOldDocument(PDDocument oldDocument) throws IOException { pdfMetadataService.setMetadataToPdf( oldDocument, pdfMetadataService.extractMetadataFromPdf(oldDocument), true); - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - oldDocument.save(baos); - oldDocument.close(); - return baos.toByteArray(); + return saveToBytes(oldDocument); } + /** Create a new document based on an existing document bytes */ public PDDocument createNewDocumentBasedOnOldDocument(byte[] oldDocument) throws IOException { - PDDocument document = Loader.loadPDF(oldDocument); - return createNewDocumentBasedOnOldDocument(document); + try (PDDocument document = load(oldDocument)) { + return createNewDocumentBasedOnOldDocument(document); + } } + /** Create a new document based on an existing document file */ public PDDocument createNewDocumentBasedOnOldDocument(File oldDocument) throws IOException { - PDDocument document = Loader.loadPDF(oldDocument); - return createNewDocumentBasedOnOldDocument(document); + try (PDDocument document = load(oldDocument)) { + return createNewDocumentBasedOnOldDocument(document); + } } + /** Create a new document based on an existing PDDocument */ public PDDocument createNewDocumentBasedOnOldDocument(PDDocument oldDocument) throws IOException { - PDDocument document = new PDDocument(); + PDDocument document = createNewDocument(); pdfMetadataService.setMetadataToPdf( document, pdfMetadataService.extractMetadataFromPdf(oldDocument), true); return document; } + /** Load document from a file and convert it to bytes */ public byte[] loadToBytes(File file) throws IOException { - PDDocument document = load(file); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - document.save(baos); - // Close the document - document.close(); - return baos.toByteArray(); + try (PDDocument document = load(file)) { + return saveToBytes(document); + } } + /** Load document from bytes and convert it back to bytes */ public byte[] loadToBytes(byte[] bytes) throws IOException { - PDDocument document = load(bytes); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - document.save(baos); - // Close the document - document.close(); - return baos.toByteArray(); - } - - // if loading from a file, assume the file has been made with Stirling-PDF - public PDDocument load(File file) throws IOException { - PDDocument document = Loader.loadPDF(file); - pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true); - return document; - } - - public PDDocument load(InputStream input) throws IOException { - return load(input.readAllBytes()); - } - - public PDDocument load(byte[] input) throws IOException { - PDDocument document = Loader.loadPDF(input); - pdfMetadataService.setDefaultMetadata(document); - removezeropassword(document); - return document; - } - - public PDDocument load(PDFFile pdfFile) throws IOException { - return load(pdfFile.getFileInput()); - } - - public PDDocument load(MultipartFile pdfFile) throws IOException { - return load(pdfFile.getBytes()); + try (PDDocument document = load(bytes)) { + return saveToBytes(document); + } } + /** Load from a file path string */ public PDDocument load(String path) throws IOException { return load(new File(path)); } + /** Load from a PDFFile object */ + public PDDocument load(PDFFile pdfFile) throws IOException { + return load(pdfFile.getFileInput()); + } + + /** Load from a MultipartFile */ + public PDDocument load(MultipartFile pdfFile) throws IOException { + return load(pdfFile.getBytes()); + } + + /** Load with password from MultipartFile */ public PDDocument load(MultipartFile fileInput, String password) throws IOException { return load(fileInput.getBytes(), password); } + /** Load with password from byte array */ private PDDocument load(byte[] bytes, String password) throws IOException { + // Since we don't have direct password support in the adaptive loader, + // we'll need to use PDFBox's Loader directly PDDocument document = Loader.loadPDF(bytes, password); pdfMetadataService.setDefaultMetadata(document); return document; } - - private PDDocument removezeropassword(PDDocument document) throws IOException { - if (document.isEncrypted()) { - try { - log.info("Removing security from the source document"); - document.setAllSecurityToBeRemoved(true); - } catch (Exception e) { - log.warn("Cannot decrypt the pdf"); - } - } - return document; - } - - // Add other load methods as needed, following the same pattern } diff --git a/src/main/java/stirling/software/SPDF/utils/PdfUtils.java b/src/main/java/stirling/software/SPDF/utils/PdfUtils.java index de161550..ad404660 100644 --- a/src/main/java/stirling/software/SPDF/utils/PdfUtils.java +++ b/src/main/java/stirling/software/SPDF/utils/PdfUtils.java @@ -14,7 +14,6 @@ import java.util.zip.ZipOutputStream; import javax.imageio.*; import javax.imageio.stream.ImageOutputStream; -import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -128,6 +127,7 @@ public class PdfUtils { } public static byte[] convertFromPdf( + CustomPDDocumentFactory pdfDocumentFactory, byte[] inputStream, String imageType, ImageType colorType, @@ -135,7 +135,7 @@ public class PdfUtils { int DPI, String filename) throws IOException, Exception { - try (PDDocument document = Loader.loadPDF(inputStream)) { + try (PDDocument document = pdfDocumentFactory.load(inputStream)) { PDFRenderer pdfRenderer = new PDFRenderer(document); pdfRenderer.setSubsamplingAllowed(true); int pageCount = document.getNumberOfPages();