Memory enhancements and PDF decompress API (#3129)

# Description of Changes

- PDF split by size to check size of PDF as it splits, avoids issue were
a PDFs size is different viewed vs saved due to compression caused by
repeated data etc.
- Additionally memory enhancements for PDF load to dynamically load in
memory vs scratch
- PDF Decompress API for PDF testing


## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing)
for more details.
This commit is contained in:
Anthony Stirling 2025-03-08 00:03:27 +00:00 committed by GitHub
parent 33eb3fd034
commit ed2ef01690
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
43 changed files with 1042 additions and 321 deletions

View File

@ -124,7 +124,7 @@ These files provide pre-configured setups for different scenarios. For example,
services: services:
stirling-pdf: stirling-pdf:
container_name: Stirling-PDF-Security container_name: Stirling-PDF-Security
image: stirlingtools/stirling-pdf:latest image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
deploy: deploy:
resources: resources:
limits: limits:

View File

@ -25,7 +25,7 @@ ext {
} }
group = "stirling.software" group = "stirling.software"
version = "0.43.2" version = "0.44.0"
java { java {
// 17 is lowest but we support and recommend 21 // 17 is lowest but we support and recommend 21

View File

@ -1,7 +1,7 @@
services: services:
stirling-pdf: stirling-pdf:
container_name: Stirling-PDF-Security-Fat-Postgres container_name: Stirling-PDF-Security-Fat-Postgres
image: stirlingtools/stirling-pdf:latest-fat-postgres image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-fat-postgres
deploy: deploy:
resources: resources:
limits: limits:

View File

@ -1,7 +1,7 @@
services: services:
stirling-pdf: stirling-pdf:
container_name: Stirling-PDF-Security-Fat container_name: Stirling-PDF-Security-Fat
image: stirlingtools/stirling-pdf:latest-fat image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-fat
deploy: deploy:
resources: resources:
limits: limits:

View File

@ -1,7 +1,7 @@
services: services:
stirling-pdf: stirling-pdf:
container_name: Stirling-PDF-Security container_name: Stirling-PDF-Security
image: stirlingtools/stirling-pdf:latest image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
deploy: deploy:
resources: resources:
limits: limits:

View File

@ -1,7 +1,7 @@
services: services:
stirling-pdf: stirling-pdf:
container_name: Stirling-PDF-Security container_name: Stirling-PDF-Security
image: stirlingtools/stirling-pdf:latest image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
deploy: deploy:
resources: resources:
limits: limits:

View File

@ -1,7 +1,7 @@
services: services:
stirling-pdf: stirling-pdf:
container_name: Stirling-PDF-Ultra-Lite-Security container_name: Stirling-PDF-Ultra-Lite-Security
image: stirlingtools/stirling-pdf:latest-ultra-lite image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-ultra-lite
deploy: deploy:
resources: resources:
limits: limits:

View File

@ -1,7 +1,7 @@
services: services:
stirling-pdf: stirling-pdf:
container_name: Stirling-PDF-Ultra-Lite container_name: Stirling-PDF-Ultra-Lite
image: stirlingtools/stirling-pdf:latest-ultra-lite image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-ultra-lite
deploy: deploy:
resources: resources:
limits: limits:

View File

@ -1,7 +1,7 @@
services: services:
stirling-pdf: stirling-pdf:
container_name: Stirling-PDF container_name: Stirling-PDF
image: stirlingtools/stirling-pdf:latest image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
deploy: deploy:
resources: resources:
limits: limits:

View File

@ -11,6 +11,7 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.ApplicationProperties; import stirling.software.SPDF.model.ApplicationProperties;
@Service @Service

View File

@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.PDDocumentInformation;
@ -12,24 +11,33 @@ import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.encryption.PDEncryption; import org.apache.pdfbox.pdmodel.encryption.PDEncryption;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.*; import org.springframework.web.bind.annotation.*;
import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag; import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.PDFFile; import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
@RestController @RestController
@RequestMapping("/api/v1/analysis") @RequestMapping("/api/v1/analysis")
@Tag(name = "Analysis", description = "Analysis APIs") @Tag(name = "Analysis", description = "Analysis APIs")
public class AnalysisController { public class AnalysisController {
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public AnalysisController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(value = "/page-count", consumes = "multipart/form-data") @PostMapping(value = "/page-count", consumes = "multipart/form-data")
@Operation( @Operation(
summary = "Get PDF page count", summary = "Get PDF page count",
description = "Returns total number of pages in PDF. Input:PDF Output:JSON Type:SISO") description = "Returns total number of pages in PDF. Input:PDF Output:JSON Type:SISO")
public Map<String, Integer> getPageCount(@ModelAttribute PDFFile file) throws IOException { public Map<String, Integer> getPageCount(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
return Map.of("pageCount", document.getNumberOfPages()); return Map.of("pageCount", document.getNumberOfPages());
} }
} }
@ -39,7 +47,7 @@ public class AnalysisController {
summary = "Get basic PDF information", summary = "Get basic PDF information",
description = "Returns page count, version, file size. Input:PDF Output:JSON Type:SISO") description = "Returns page count, version, file size. Input:PDF Output:JSON Type:SISO")
public Map<String, Object> getBasicInfo(@ModelAttribute PDFFile file) throws IOException { public Map<String, Object> getBasicInfo(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
Map<String, Object> info = new HashMap<>(); Map<String, Object> info = new HashMap<>();
info.put("pageCount", document.getNumberOfPages()); info.put("pageCount", document.getNumberOfPages());
info.put("pdfVersion", document.getVersion()); info.put("pdfVersion", document.getVersion());
@ -54,7 +62,7 @@ public class AnalysisController {
description = "Returns title, author, subject, etc. Input:PDF Output:JSON Type:SISO") description = "Returns title, author, subject, etc. Input:PDF Output:JSON Type:SISO")
public Map<String, String> getDocumentProperties(@ModelAttribute PDFFile file) public Map<String, String> getDocumentProperties(@ModelAttribute PDFFile file)
throws IOException { throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
PDDocumentInformation info = document.getDocumentInformation(); PDDocumentInformation info = document.getDocumentInformation();
Map<String, String> properties = new HashMap<>(); Map<String, String> properties = new HashMap<>();
properties.put("title", info.getTitle()); properties.put("title", info.getTitle());
@ -75,7 +83,7 @@ public class AnalysisController {
description = "Returns width and height of each page. Input:PDF Output:JSON Type:SISO") description = "Returns width and height of each page. Input:PDF Output:JSON Type:SISO")
public List<Map<String, Float>> getPageDimensions(@ModelAttribute PDFFile file) public List<Map<String, Float>> getPageDimensions(@ModelAttribute PDFFile file)
throws IOException { throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
List<Map<String, Float>> dimensions = new ArrayList<>(); List<Map<String, Float>> dimensions = new ArrayList<>();
PDPageTree pages = document.getPages(); PDPageTree pages = document.getPages();
@ -95,7 +103,7 @@ public class AnalysisController {
description = description =
"Returns count and details of form fields. Input:PDF Output:JSON Type:SISO") "Returns count and details of form fields. Input:PDF Output:JSON Type:SISO")
public Map<String, Object> getFormFields(@ModelAttribute PDFFile file) throws IOException { public Map<String, Object> getFormFields(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
Map<String, Object> formInfo = new HashMap<>(); Map<String, Object> formInfo = new HashMap<>();
PDAcroForm form = document.getDocumentCatalog().getAcroForm(); PDAcroForm form = document.getDocumentCatalog().getAcroForm();
@ -117,7 +125,7 @@ public class AnalysisController {
summary = "Get annotation information", summary = "Get annotation information",
description = "Returns count and types of annotations. Input:PDF Output:JSON Type:SISO") description = "Returns count and types of annotations. Input:PDF Output:JSON Type:SISO")
public Map<String, Object> getAnnotationInfo(@ModelAttribute PDFFile file) throws IOException { public Map<String, Object> getAnnotationInfo(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
Map<String, Object> annotInfo = new HashMap<>(); Map<String, Object> annotInfo = new HashMap<>();
int totalAnnotations = 0; int totalAnnotations = 0;
Map<String, Integer> annotationTypes = new HashMap<>(); Map<String, Integer> annotationTypes = new HashMap<>();
@ -142,7 +150,7 @@ public class AnalysisController {
description = description =
"Returns list of fonts used in the document. Input:PDF Output:JSON Type:SISO") "Returns list of fonts used in the document. Input:PDF Output:JSON Type:SISO")
public Map<String, Object> getFontInfo(@ModelAttribute PDFFile file) throws IOException { public Map<String, Object> getFontInfo(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
Map<String, Object> fontInfo = new HashMap<>(); Map<String, Object> fontInfo = new HashMap<>();
Set<String> fontNames = new HashSet<>(); Set<String> fontNames = new HashSet<>();
@ -164,7 +172,7 @@ public class AnalysisController {
description = description =
"Returns encryption and permission details. Input:PDF Output:JSON Type:SISO") "Returns encryption and permission details. Input:PDF Output:JSON Type:SISO")
public Map<String, Object> getSecurityInfo(@ModelAttribute PDFFile file) throws IOException { public Map<String, Object> getSecurityInfo(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) { try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
Map<String, Object> securityInfo = new HashMap<>(); Map<String, Object> securityInfo = new HashMap<>();
PDEncryption encryption = document.getEncryption(); PDEncryption encryption = document.getEncryption();

View File

@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
@ -23,7 +22,6 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.general.CropPdfForm; import stirling.software.SPDF.model.api.general.CropPdfForm;
import stirling.software.SPDF.service.CustomPDDocumentFactory; import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.service.PostHogService;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
@RestController @RestController
@ -33,13 +31,9 @@ public class CropController {
private final CustomPDDocumentFactory pdfDocumentFactory; private final CustomPDDocumentFactory pdfDocumentFactory;
private final PostHogService postHogService;
@Autowired @Autowired
public CropController( public CropController(CustomPDDocumentFactory pdfDocumentFactory) {
CustomPDDocumentFactory pdfDocumentFactory, PostHogService postHogService) {
this.pdfDocumentFactory = pdfDocumentFactory; this.pdfDocumentFactory = pdfDocumentFactory;
this.postHogService = postHogService;
} }
@PostMapping(value = "/crop", consumes = "multipart/form-data") @PostMapping(value = "/crop", consumes = "multipart/form-data")
@ -48,7 +42,7 @@ public class CropController {
description = description =
"This operation takes an input PDF file and crops it according to the given coordinates. Input:PDF Output:PDF Type:SISO") "This operation takes an input PDF file and crops it according to the given coordinates. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> cropPdf(@ModelAttribute CropPdfForm form) throws IOException { public ResponseEntity<byte[]> cropPdf(@ModelAttribute CropPdfForm form) throws IOException {
PDDocument sourceDocument = Loader.loadPDF(form.getFileInput().getBytes()); PDDocument sourceDocument = pdfDocumentFactory.load(form.getFileInput().getBytes());
PDDocument newDocument = PDDocument newDocument =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);

View File

@ -12,7 +12,6 @@ import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.PDFMergerUtility; import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog; import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
@ -101,8 +100,8 @@ public class MergeController {
}; };
case "byPDFTitle": case "byPDFTitle":
return (file1, file2) -> { return (file1, file2) -> {
try (PDDocument doc1 = Loader.loadPDF(file1.getBytes()); try (PDDocument doc1 = pdfDocumentFactory.load(file1.getBytes());
PDDocument doc2 = Loader.loadPDF(file2.getBytes())) { PDDocument doc2 = pdfDocumentFactory.load(file2.getBytes())) {
String title1 = doc1.getDocumentInformation().getTitle(); String title1 = doc1.getDocumentInformation().getTitle();
String title2 = doc2.getDocumentInformation().getTitle(); String title2 = doc2.getDocumentInformation().getTitle();
return title1.compareTo(title2); return title1.compareTo(title2);
@ -152,7 +151,7 @@ public class MergeController {
byte[] mergedPdfBytes = docOutputstream.toByteArray(); // Get merged document bytes byte[] mergedPdfBytes = docOutputstream.toByteArray(); // Get merged document bytes
// Load the merged PDF document // Load the merged PDF document
mergedDocument = Loader.loadPDF(mergedPdfBytes); mergedDocument = pdfDocumentFactory.load(mergedPdfBytes);
// Remove signatures if removeCertSign is true // Remove signatures if removeCertSign is true
if (removeCertSign) { if (removeCertSign) {

View File

@ -4,7 +4,6 @@ import java.awt.*;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
@ -64,7 +63,7 @@ public class MultiPageLayoutController {
: (int) Math.sqrt(pagesPerSheet); : (int) Math.sqrt(pagesPerSheet);
int rows = pagesPerSheet == 2 || pagesPerSheet == 3 ? 1 : (int) Math.sqrt(pagesPerSheet); int rows = pagesPerSheet == 2 || pagesPerSheet == 3 ? 1 : (int) Math.sqrt(pagesPerSheet);
PDDocument sourceDocument = Loader.loadPDF(file.getBytes()); PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes());
PDDocument newDocument = PDDocument newDocument =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
PDPage newPage = new PDPage(PDRectangle.A4); PDPage newPage = new PDPage(PDRectangle.A4);

View File

@ -5,7 +5,6 @@ import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
@ -251,7 +250,7 @@ public class RearrangePagesPDFController {
String sortType = request.getCustomMode(); String sortType = request.getCustomMode();
try { try {
// Load the input PDF // Load the input PDF
PDDocument document = Loader.loadPDF(pdfFile.getBytes()); PDDocument document = pdfDocumentFactory.load(pdfFile.getBytes());
// Split the page order string into an array of page numbers or range of numbers // Split the page order string into an array of page numbers or range of numbers
String[] pageOrderArr = pageOrder != null ? pageOrder.split(",") : new String[0]; String[] pageOrderArr = pageOrder != null ? pageOrder.split(",") : new String[0];

View File

@ -5,7 +5,6 @@ import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
@ -52,7 +51,7 @@ public class ScalePagesController {
String targetPDRectangle = request.getPageSize(); String targetPDRectangle = request.getPageSize();
float scaleFactor = request.getScaleFactor(); float scaleFactor = request.getScaleFactor();
PDDocument sourceDocument = Loader.loadPDF(file.getBytes()); PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes());
PDDocument outputDocument = PDDocument outputDocument =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);

View File

@ -10,7 +10,6 @@ import java.util.stream.Collectors;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream; import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
@ -63,7 +62,7 @@ public class SplitPDFController {
String pages = request.getPageNumbers(); String pages = request.getPageNumbers();
// open the pdf document // open the pdf document
document = Loader.loadPDF(file.getBytes()); document = pdfDocumentFactory.load(file.getBytes());
// PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document); // PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document);
int totalPages = document.getNumberOfPages(); int totalPages = document.getNumberOfPages();
List<Integer> pageNumbers = request.getPageNumbersList(document, false); List<Integer> pageNumbers = request.getPageNumbersList(document, false);

View File

@ -8,7 +8,6 @@ import java.util.List;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream; import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
@ -34,6 +33,7 @@ import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.PdfMetadata; import stirling.software.SPDF.model.PdfMetadata;
import stirling.software.SPDF.model.api.SplitPdfByChaptersRequest; import stirling.software.SPDF.model.api.SplitPdfByChaptersRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.service.PdfMetadataService; import stirling.software.SPDF.service.PdfMetadataService;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
@ -45,9 +45,13 @@ public class SplitPdfByChaptersController {
private final PdfMetadataService pdfMetadataService; private final PdfMetadataService pdfMetadataService;
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired @Autowired
public SplitPdfByChaptersController(PdfMetadataService pdfMetadataService) { public SplitPdfByChaptersController(
PdfMetadataService pdfMetadataService, CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfMetadataService = pdfMetadataService; this.pdfMetadataService = pdfMetadataService;
this.pdfDocumentFactory = pdfDocumentFactory;
} }
private static List<Bookmark> extractOutlineItems( private static List<Bookmark> extractOutlineItems(
@ -135,7 +139,7 @@ public class SplitPdfByChaptersController {
if (bookmarkLevel < 0) { if (bookmarkLevel < 0) {
return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes()); return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes());
} }
sourceDocument = Loader.loadPDF(file.getBytes()); sourceDocument = pdfDocumentFactory.load(file.getBytes());
PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline(); PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline();

View File

@ -9,7 +9,6 @@ import java.util.List;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream; import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
@ -57,7 +56,7 @@ public class SplitPdfBySectionsController {
List<ByteArrayOutputStream> splitDocumentsBoas = new ArrayList<>(); List<ByteArrayOutputStream> splitDocumentsBoas = new ArrayList<>();
MultipartFile file = request.getFileInput(); MultipartFile file = request.getFileInput();
PDDocument sourceDocument = Loader.loadPDF(file.getBytes()); PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes());
// Process the PDF based on split parameters // Process the PDF based on split parameters
int horiz = request.getHorizontalDivisions() + 1; int horiz = request.getHorizontalDivisions() + 1;

View File

@ -7,7 +7,6 @@ import java.nio.file.Path;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream; import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
@ -41,6 +40,9 @@ public class SplitPdfBySizeController {
@Autowired @Autowired
public SplitPdfBySizeController(CustomPDDocumentFactory pdfDocumentFactory) { public SplitPdfBySizeController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory; this.pdfDocumentFactory = pdfDocumentFactory;
log.info(
"SplitPdfBySizeController initialized with pdfDocumentFactory: {}",
pdfDocumentFactory);
} }
@PostMapping(value = "/split-by-size-or-count", consumes = "multipart/form-data") @PostMapping(value = "/split-by-size-or-count", consumes = "multipart/form-data")
@ -52,38 +54,92 @@ public class SplitPdfBySizeController {
public ResponseEntity<byte[]> autoSplitPdf(@ModelAttribute SplitPdfBySizeOrCountRequest request) public ResponseEntity<byte[]> autoSplitPdf(@ModelAttribute SplitPdfBySizeOrCountRequest request)
throws Exception { throws Exception {
log.info("Starting PDF split process with request: {}", request);
MultipartFile file = request.getFileInput(); MultipartFile file = request.getFileInput();
log.info(
"File received: name={}, size={} bytes",
file.getOriginalFilename(),
file.getSize());
Path zipFile = Files.createTempFile("split_documents", ".zip"); Path zipFile = Files.createTempFile("split_documents", ".zip");
log.info("Created temporary zip file: {}", zipFile);
String filename = String filename =
Filenames.toSimpleFileName(file.getOriginalFilename()) Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", ""); .replaceFirst("[.][^.]+$", "");
log.info("Base filename for output: {}", filename);
byte[] data = null; byte[] data = null;
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile)); try {
PDDocument sourceDocument = Loader.loadPDF(file.getBytes())) { log.info("Reading input file bytes");
byte[] pdfBytes = file.getBytes();
log.info("Successfully read {} bytes from input file", pdfBytes.length);
int type = request.getSplitType(); log.info("Creating ZIP output stream");
String value = request.getSplitValue(); try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) {
log.info("Loading PDF document");
try (PDDocument sourceDocument = pdfDocumentFactory.load(pdfBytes)) {
log.info(
"Successfully loaded PDF with {} pages",
sourceDocument.getNumberOfPages());
if (type == 0) { int type = request.getSplitType();
long maxBytes = GeneralUtils.convertSizeToBytes(value); String value = request.getSplitValue();
handleSplitBySize(sourceDocument, maxBytes, zipOut, filename); log.info("Split type: {}, Split value: {}", type, value);
} else if (type == 1) {
int pageCount = Integer.parseInt(value); if (type == 0) {
handleSplitByPageCount(sourceDocument, pageCount, zipOut, filename); log.info("Processing split by size");
} else if (type == 2) { long maxBytes = GeneralUtils.convertSizeToBytes(value);
int documentCount = Integer.parseInt(value); log.info("Max bytes per document: {}", maxBytes);
handleSplitByDocCount(sourceDocument, documentCount, zipOut, filename); handleSplitBySize(sourceDocument, maxBytes, zipOut, filename);
} else { } else if (type == 1) {
throw new IllegalArgumentException("Invalid argument for split type"); log.info("Processing split by page count");
int pageCount = Integer.parseInt(value);
log.info("Pages per document: {}", pageCount);
handleSplitByPageCount(sourceDocument, pageCount, zipOut, filename);
} else if (type == 2) {
log.info("Processing split by document count");
int documentCount = Integer.parseInt(value);
log.info("Total number of documents: {}", documentCount);
handleSplitByDocCount(sourceDocument, documentCount, zipOut, filename);
} else {
log.error("Invalid split type: {}", type);
throw new IllegalArgumentException(
"Invalid argument for split type: " + type);
}
log.info("PDF splitting completed successfully");
} catch (Exception e) {
log.error("Error loading or processing PDF document", e);
throw e;
}
} catch (IOException e) {
log.error("Error creating or writing to ZIP file", e);
throw e;
} }
} catch (Exception e) { } catch (Exception e) {
log.error("exception", e); log.error("Exception during PDF splitting process", e);
throw e; // Re-throw to ensure proper error response
} finally { } finally {
data = Files.readAllBytes(zipFile); try {
Files.deleteIfExists(zipFile); log.info("Reading ZIP file data");
data = Files.readAllBytes(zipFile);
log.info("Successfully read {} bytes from ZIP file", data.length);
} catch (IOException e) {
log.error("Error reading ZIP file data", e);
}
try {
log.info("Deleting temporary ZIP file");
boolean deleted = Files.deleteIfExists(zipFile);
log.info("Temporary ZIP file deleted: {}", deleted);
} catch (IOException e) {
log.error("Error deleting temporary ZIP file", e);
}
} }
log.info("Returning response with {} bytes of data", data != null ? data.length : 0);
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM); data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
} }
@ -91,63 +147,230 @@ public class SplitPdfBySizeController {
private void handleSplitBySize( private void handleSplitBySize(
PDDocument sourceDocument, long maxBytes, ZipOutputStream zipOut, String baseFilename) PDDocument sourceDocument, long maxBytes, ZipOutputStream zipOut, String baseFilename)
throws IOException { throws IOException {
long currentSize = 0; log.info("Starting handleSplitBySize with maxBytes={}", maxBytes);
PDDocument currentDoc = PDDocument currentDoc =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
int fileIndex = 1; int fileIndex = 1;
int totalPages = sourceDocument.getNumberOfPages();
int pageAdded = 0;
for (int pageIndex = 0; pageIndex < sourceDocument.getNumberOfPages(); pageIndex++) { // Smart size check frequency - check more often with larger documents
int baseCheckFrequency = 5;
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
PDPage page = sourceDocument.getPage(pageIndex); PDPage page = sourceDocument.getPage(pageIndex);
ByteArrayOutputStream pageOutputStream = new ByteArrayOutputStream(); log.info("Processing page {} of {}", pageIndex + 1, totalPages);
try (PDDocument tempDoc = new PDDocument()) { // Add the page to current document
PDPage importedPage = tempDoc.importPage(page); // This creates a new PDPage object PDPage newPage = new PDPage(page.getCOSObject());
tempDoc.save(pageOutputStream); currentDoc.addPage(newPage);
} pageAdded++;
long pageSize = pageOutputStream.size(); // Dynamic size checking based on document size and page count
if (currentSize + pageSize > maxBytes) { boolean shouldCheckSize =
if (currentDoc.getNumberOfPages() > 0) { (pageAdded % baseCheckFrequency == 0)
|| (pageIndex == totalPages - 1)
|| (pageAdded >= 20); // Always check after 20 pages
if (shouldCheckSize) {
log.info("Performing size check after {} pages", pageAdded);
ByteArrayOutputStream checkSizeStream = new ByteArrayOutputStream();
currentDoc.save(checkSizeStream);
long actualSize = checkSizeStream.size();
log.info("Current document size: {} bytes (max: {} bytes)", actualSize, maxBytes);
if (actualSize > maxBytes) {
// We exceeded the limit - remove the last page and save
if (currentDoc.getNumberOfPages() > 1) {
currentDoc.removePage(currentDoc.getNumberOfPages() - 1);
pageIndex--; // Process this page again in the next document
log.info("Size limit exceeded - removed last page");
}
log.info(
"Saving document with {} pages as part {}",
currentDoc.getNumberOfPages(),
fileIndex);
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
currentDoc.close(); // Make sure to close the document
currentDoc = new PDDocument(); currentDoc = new PDDocument();
currentSize = 0; pageAdded = 0;
} else if (pageIndex < totalPages - 1) {
// We're under the limit, calculate if we might fit more pages
// Try to predict how many more similar pages might fit
if (actualSize < maxBytes * 0.75 && pageAdded > 0) {
// Rather than using a ratio, look ahead to test actual upcoming pages
int pagesToLookAhead = Math.min(5, totalPages - pageIndex - 1);
if (pagesToLookAhead > 0) {
log.info(
"Testing {} upcoming pages for potential addition",
pagesToLookAhead);
// Create a temp document with current pages + look-ahead pages
PDDocument testDoc = new PDDocument();
// First copy existing pages
for (int i = 0; i < currentDoc.getNumberOfPages(); i++) {
testDoc.addPage(new PDPage(currentDoc.getPage(i).getCOSObject()));
}
// Try adding look-ahead pages one by one
int extraPagesAdded = 0;
for (int i = 0; i < pagesToLookAhead; i++) {
int testPageIndex = pageIndex + 1 + i;
PDPage testPage = sourceDocument.getPage(testPageIndex);
testDoc.addPage(new PDPage(testPage.getCOSObject()));
// Check if we're still under size
ByteArrayOutputStream testStream = new ByteArrayOutputStream();
testDoc.save(testStream);
long testSize = testStream.size();
if (testSize <= maxBytes) {
extraPagesAdded++;
log.info(
"Test: Can add page {} (size would be {})",
testPageIndex + 1,
testSize);
} else {
log.info(
"Test: Cannot add page {} (size would be {})",
testPageIndex + 1,
testSize);
break;
}
}
testDoc.close();
// Add the pages we verified would fit
if (extraPagesAdded > 0) {
log.info("Adding {} verified pages ahead", extraPagesAdded);
for (int i = 0; i < extraPagesAdded; i++) {
int extraPageIndex = pageIndex + 1 + i;
PDPage extraPage = sourceDocument.getPage(extraPageIndex);
currentDoc.addPage(new PDPage(extraPage.getCOSObject()));
}
pageIndex += extraPagesAdded;
pageAdded += extraPagesAdded;
}
}
}
} }
} }
PDPage newPage = new PDPage(page.getCOSObject()); // Re-create the page
currentDoc.addPage(newPage);
currentSize += pageSize;
} }
if (currentDoc.getNumberOfPages() != 0) { // Save final document if it has any pages
if (currentDoc.getNumberOfPages() > 0) {
log.info(
"Saving final document with {} pages as part {}",
currentDoc.getNumberOfPages(),
fileIndex);
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
currentDoc.close();
} }
log.info("Completed handleSplitBySize with {} document parts created", fileIndex - 1);
} }
private void handleSplitByPageCount( private void handleSplitByPageCount(
PDDocument sourceDocument, int pageCount, ZipOutputStream zipOut, String baseFilename) PDDocument sourceDocument, int pageCount, ZipOutputStream zipOut, String baseFilename)
throws IOException { throws IOException {
log.info("Starting handleSplitByPageCount with pageCount={}", pageCount);
int currentPageCount = 0; int currentPageCount = 0;
PDDocument currentDoc = log.info("Creating initial output document");
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); PDDocument currentDoc = null;
int fileIndex = 1; try {
for (PDPage page : sourceDocument.getPages()) { currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
currentDoc.addPage(page); log.info("Successfully created initial output document");
currentPageCount++; } catch (Exception e) {
log.error("Error creating initial output document", e);
throw new IOException("Failed to create initial output document", e);
}
if (currentPageCount == pageCount) { int fileIndex = 1;
// Save and reset current document int pageIndex = 0;
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); int totalPages = sourceDocument.getNumberOfPages();
currentDoc = new PDDocument(); log.info("Processing {} pages", totalPages);
currentPageCount = 0;
try {
for (PDPage page : sourceDocument.getPages()) {
pageIndex++;
log.info("Processing page {} of {}", pageIndex, totalPages);
try {
log.info("Adding page {} to current document", pageIndex);
currentDoc.addPage(page);
log.info("Successfully added page {} to current document", pageIndex);
} catch (Exception e) {
log.error("Error adding page {} to current document", pageIndex, e);
throw new IOException("Failed to add page to document", e);
}
currentPageCount++;
log.info("Current page count: {}/{}", currentPageCount, pageCount);
if (currentPageCount == pageCount) {
log.info(
"Reached target page count ({}), saving current document as part {}",
pageCount,
fileIndex);
try {
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
log.info("Successfully saved document part {}", fileIndex - 1);
} catch (Exception e) {
log.error("Error saving document part {}", fileIndex - 1, e);
throw e;
}
try {
log.info("Creating new document for next part");
currentDoc = new PDDocument();
log.info("Successfully created new document");
} catch (Exception e) {
log.error("Error creating new document for next part", e);
throw new IOException("Failed to create new document", e);
}
currentPageCount = 0;
log.info("Reset current page count to 0");
}
}
} catch (Exception e) {
log.error("Error iterating through pages", e);
throw new IOException("Failed to iterate through pages", e);
}
// Add the last document if it contains any pages
try {
if (currentDoc.getPages().getCount() != 0) {
log.info(
"Saving final document with {} pages as part {}",
currentDoc.getPages().getCount(),
fileIndex);
try {
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
log.info("Successfully saved final document part {}", fileIndex - 1);
} catch (Exception e) {
log.error("Error saving final document part {}", fileIndex - 1, e);
throw e;
}
} else {
log.info("Final document has no pages, skipping");
}
} catch (Exception e) {
log.error("Error checking or saving final document", e);
throw new IOException("Failed to process final document", e);
} finally {
try {
log.info("Closing final document");
currentDoc.close();
log.info("Successfully closed final document");
} catch (Exception e) {
log.error("Error closing final document", e);
} }
} }
// Add the last document if it contains any pages
if (currentDoc.getPages().getCount() != 0) { log.info("Completed handleSplitByPageCount with {} document parts created", fileIndex - 1);
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
}
} }
private void handleSplitByDocCount( private void handleSplitByDocCount(
@ -156,35 +379,101 @@ public class SplitPdfBySizeController {
ZipOutputStream zipOut, ZipOutputStream zipOut,
String baseFilename) String baseFilename)
throws IOException { throws IOException {
log.info("Starting handleSplitByDocCount with documentCount={}", documentCount);
int totalPageCount = sourceDocument.getNumberOfPages(); int totalPageCount = sourceDocument.getNumberOfPages();
log.info("Total pages in source document: {}", totalPageCount);
int pagesPerDocument = totalPageCount / documentCount; int pagesPerDocument = totalPageCount / documentCount;
int extraPages = totalPageCount % documentCount; int extraPages = totalPageCount % documentCount;
log.info("Pages per document: {}, Extra pages: {}", pagesPerDocument, extraPages);
int currentPageIndex = 0; int currentPageIndex = 0;
int fileIndex = 1; int fileIndex = 1;
for (int i = 0; i < documentCount; i++) {
PDDocument currentDoc =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
int pagesToAdd = pagesPerDocument + (i < extraPages ? 1 : 0);
for (int j = 0; j < pagesToAdd; j++) { for (int i = 0; i < documentCount; i++) {
currentDoc.addPage(sourceDocument.getPage(currentPageIndex++)); log.info("Creating document {} of {}", i + 1, documentCount);
PDDocument currentDoc = null;
try {
currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
log.info("Successfully created document {} of {}", i + 1, documentCount);
} catch (Exception e) {
log.error("Error creating document {} of {}", i + 1, documentCount, e);
throw new IOException("Failed to create document", e);
} }
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); int pagesToAdd = pagesPerDocument + (i < extraPages ? 1 : 0);
log.info("Adding {} pages to document {}", pagesToAdd, i + 1);
for (int j = 0; j < pagesToAdd; j++) {
try {
log.info(
"Adding page {} (index {}) to document {}",
j + 1,
currentPageIndex,
i + 1);
currentDoc.addPage(sourceDocument.getPage(currentPageIndex));
log.info("Successfully added page {} to document {}", j + 1, i + 1);
currentPageIndex++;
} catch (Exception e) {
log.error("Error adding page {} to document {}", j + 1, i + 1, e);
throw new IOException("Failed to add page to document", e);
}
}
try {
log.info("Saving document {} with {} pages", i + 1, pagesToAdd);
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
log.info("Successfully saved document {}", i + 1);
} catch (Exception e) {
log.error("Error saving document {}", i + 1, e);
throw e;
}
} }
log.info("Completed handleSplitByDocCount with {} documents created", documentCount);
} }
private void saveDocumentToZip( private void saveDocumentToZip(
PDDocument document, ZipOutputStream zipOut, String baseFilename, int index) PDDocument document, ZipOutputStream zipOut, String baseFilename, int index)
throws IOException { throws IOException {
log.info("Starting saveDocumentToZip for document part {}", index);
ByteArrayOutputStream outStream = new ByteArrayOutputStream(); ByteArrayOutputStream outStream = new ByteArrayOutputStream();
document.save(outStream);
document.close(); // Close the document to free resources
// Create a new zip entry try {
ZipEntry zipEntry = new ZipEntry(baseFilename + "_" + index + ".pdf"); log.info("Saving document part {} to byte array", index);
zipOut.putNextEntry(zipEntry); document.save(outStream);
zipOut.write(outStream.toByteArray()); log.info("Successfully saved document part {} ({} bytes)", index, outStream.size());
zipOut.closeEntry(); } catch (Exception e) {
log.error("Error saving document part {} to byte array", index, e);
throw new IOException("Failed to save document to byte array", e);
}
try {
log.info("Closing document part {}", index);
document.close();
log.info("Successfully closed document part {}", index);
} catch (Exception e) {
log.error("Error closing document part {}", index, e);
// Continue despite close error
}
try {
// Create a new zip entry
String entryName = baseFilename + "_" + index + ".pdf";
log.info("Creating ZIP entry: {}", entryName);
ZipEntry zipEntry = new ZipEntry(entryName);
zipOut.putNextEntry(zipEntry);
byte[] bytes = outStream.toByteArray();
log.info("Writing {} bytes to ZIP entry", bytes.length);
zipOut.write(bytes);
log.info("Closing ZIP entry");
zipOut.closeEntry();
log.info("Successfully added document part {} to ZIP", index);
} catch (Exception e) {
log.error("Error adding document part {} to ZIP", index, e);
throw new IOException("Failed to add document to ZIP file", e);
}
} }
} }

View File

@ -4,7 +4,6 @@ import java.awt.geom.AffineTransform;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
@ -46,7 +45,7 @@ public class ToSinglePageController {
throws IOException { throws IOException {
// Load the source document // Load the source document
PDDocument sourceDocument = Loader.loadPDF(request.getFileInput().getBytes()); PDDocument sourceDocument = pdfDocumentFactory.load(request.getFileInput().getBytes());
// Calculate total height and max width // Calculate total height and max width
float totalHeight = 0; float totalHeight = 0;

View File

@ -13,7 +13,6 @@ import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream; import java.util.zip.ZipOutputStream;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.ImageType;
@ -91,6 +90,7 @@ public class ConvertImgPDFController {
result = result =
PdfUtils.convertFromPdf( PdfUtils.convertFromPdf(
pdfDocumentFactory,
newPdfBytes, newPdfBytes,
"webp".equalsIgnoreCase(imageFormat) "webp".equalsIgnoreCase(imageFormat)
? "png" ? "png"
@ -245,7 +245,7 @@ public class ConvertImgPDFController {
*/ */
private byte[] rearrangePdfPages(byte[] pdfBytes, String[] pageOrderArr) throws IOException { private byte[] rearrangePdfPages(byte[] pdfBytes, String[] pageOrderArr) throws IOException {
// Load the input PDF // Load the input PDF
PDDocument document = Loader.loadPDF(pdfBytes); PDDocument document = pdfDocumentFactory.load(pdfBytes);
int totalPages = document.getNumberOfPages(); int totalPages = document.getNumberOfPages();
List<Integer> newPageOrder = GeneralUtils.parsePageList(pageOrderArr, totalPages, false); List<Integer> newPageOrder = GeneralUtils.parsePageList(pageOrderArr, totalPages, false);

View File

@ -2,9 +2,9 @@ package stirling.software.SPDF.controller.api.converters;
import java.io.IOException; import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
@ -21,6 +21,7 @@ import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.model.api.converters.PdfToPresentationRequest; import stirling.software.SPDF.model.api.converters.PdfToPresentationRequest;
import stirling.software.SPDF.model.api.converters.PdfToTextOrRTFRequest; import stirling.software.SPDF.model.api.converters.PdfToTextOrRTFRequest;
import stirling.software.SPDF.model.api.converters.PdfToWordRequest; import stirling.software.SPDF.model.api.converters.PdfToWordRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.PDFToFile; import stirling.software.SPDF.utils.PDFToFile;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
@ -29,6 +30,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Convert", description = "Convert APIs") @Tag(name = "Convert", description = "Convert APIs")
public class ConvertPDFToOffice { public class ConvertPDFToOffice {
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public ConvertPDFToOffice(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation") @PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation")
@Operation( @Operation(
summary = "Convert PDF to Presentation format", summary = "Convert PDF to Presentation format",
@ -54,7 +62,7 @@ public class ConvertPDFToOffice {
MultipartFile inputFile = request.getFileInput(); MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat(); String outputFormat = request.getOutputFormat();
if ("txt".equals(request.getOutputFormat())) { if ("txt".equals(request.getOutputFormat())) {
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) { try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) {
PDFTextStripper stripper = new PDFTextStripper(); PDFTextStripper stripper = new PDFTextStripper();
String text = stripper.getText(document); String text = stripper.getText(document);
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(

View File

@ -12,8 +12,8 @@ import java.util.zip.ZipOutputStream;
import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.QuoteMode; import org.apache.commons.csv.QuoteMode;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ContentDisposition; import org.springframework.http.ContentDisposition;
import org.springframework.http.HttpHeaders; import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
@ -30,6 +30,7 @@ import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PDFWithPageNums; import stirling.software.SPDF.model.api.PDFWithPageNums;
import stirling.software.SPDF.pdf.FlexibleCSVWriter; import stirling.software.SPDF.pdf.FlexibleCSVWriter;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import technology.tabula.ObjectExtractor; import technology.tabula.ObjectExtractor;
import technology.tabula.Page; import technology.tabula.Page;
@ -42,6 +43,13 @@ import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
@Slf4j @Slf4j
public class ExtractCSVController { public class ExtractCSVController {
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public ExtractCSVController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data") @PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
@Operation( @Operation(
summary = "Extracts a CSV document from a PDF", summary = "Extracts a CSV document from a PDF",
@ -51,7 +59,7 @@ public class ExtractCSVController {
String baseName = getBaseName(form.getFileInput().getOriginalFilename()); String baseName = getBaseName(form.getFileInput().getOriginalFilename());
List<CsvEntry> csvEntries = new ArrayList<>(); List<CsvEntry> csvEntries = new ArrayList<>();
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) { try (PDDocument document = pdfDocumentFactory.load(form.getFileInput().getBytes())) {
List<Integer> pages = form.getPageNumbersList(document, true); List<Integer> pages = form.getPageNumbersList(document, true);
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
CSVFormat format = CSVFormat format =

View File

@ -2,10 +2,10 @@ package stirling.software.SPDF.controller.api.filters;
import java.io.IOException; import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.PostMapping;
@ -23,6 +23,7 @@ import stirling.software.SPDF.model.api.filter.ContainsTextRequest;
import stirling.software.SPDF.model.api.filter.FileSizeRequest; import stirling.software.SPDF.model.api.filter.FileSizeRequest;
import stirling.software.SPDF.model.api.filter.PageRotationRequest; import stirling.software.SPDF.model.api.filter.PageRotationRequest;
import stirling.software.SPDF.model.api.filter.PageSizeRequest; import stirling.software.SPDF.model.api.filter.PageSizeRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.PdfUtils; import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
@ -31,6 +32,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Filter", description = "Filter APIs") @Tag(name = "Filter", description = "Filter APIs")
public class FilterController { public class FilterController {
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public FilterController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/filter-contains-text") @PostMapping(consumes = "multipart/form-data", value = "/filter-contains-text")
@Operation( @Operation(
summary = "Checks if a PDF contains set text, returns true if does", summary = "Checks if a PDF contains set text, returns true if does",
@ -41,7 +49,7 @@ public class FilterController {
String text = request.getText(); String text = request.getText();
String pageNumber = request.getPageNumbers(); String pageNumber = request.getPageNumbers();
PDDocument pdfDocument = Loader.loadPDF(inputFile.getBytes()); PDDocument pdfDocument = pdfDocumentFactory.load(inputFile.getBytes());
if (PdfUtils.hasText(pdfDocument, pageNumber, text)) if (PdfUtils.hasText(pdfDocument, pageNumber, text))
return WebResponseUtils.pdfDocToWebResponse( return WebResponseUtils.pdfDocToWebResponse(
pdfDocument, Filenames.toSimpleFileName(inputFile.getOriginalFilename())); pdfDocument, Filenames.toSimpleFileName(inputFile.getOriginalFilename()));
@ -58,7 +66,7 @@ public class FilterController {
MultipartFile inputFile = request.getFileInput(); MultipartFile inputFile = request.getFileInput();
String pageNumber = request.getPageNumbers(); String pageNumber = request.getPageNumbers();
PDDocument pdfDocument = Loader.loadPDF(inputFile.getBytes()); PDDocument pdfDocument = pdfDocumentFactory.load(inputFile.getBytes());
if (PdfUtils.hasImages(pdfDocument, pageNumber)) if (PdfUtils.hasImages(pdfDocument, pageNumber))
return WebResponseUtils.pdfDocToWebResponse( return WebResponseUtils.pdfDocToWebResponse(
pdfDocument, Filenames.toSimpleFileName(inputFile.getOriginalFilename())); pdfDocument, Filenames.toSimpleFileName(inputFile.getOriginalFilename()));
@ -75,7 +83,7 @@ public class FilterController {
String pageCount = request.getPageCount(); String pageCount = request.getPageCount();
String comparator = request.getComparator(); String comparator = request.getComparator();
// Load the PDF // Load the PDF
PDDocument document = Loader.loadPDF(inputFile.getBytes()); PDDocument document = pdfDocumentFactory.load(inputFile.getBytes());
int actualPageCount = document.getNumberOfPages(); int actualPageCount = document.getNumberOfPages();
boolean valid = false; boolean valid = false;
@ -109,7 +117,7 @@ public class FilterController {
String comparator = request.getComparator(); String comparator = request.getComparator();
// Load the PDF // Load the PDF
PDDocument document = Loader.loadPDF(inputFile.getBytes()); PDDocument document = pdfDocumentFactory.load(inputFile.getBytes());
PDPage firstPage = document.getPage(0); PDPage firstPage = document.getPage(0);
PDRectangle actualPageSize = firstPage.getMediaBox(); PDRectangle actualPageSize = firstPage.getMediaBox();
@ -185,7 +193,7 @@ public class FilterController {
String comparator = request.getComparator(); String comparator = request.getComparator();
// Load the PDF // Load the PDF
PDDocument document = Loader.loadPDF(inputFile.getBytes()); PDDocument document = pdfDocumentFactory.load(inputFile.getBytes());
// Get the rotation of the first page // Get the rotation of the first page
PDPage firstPage = document.getPage(0); PDPage firstPage = document.getPage(0);

View File

@ -5,10 +5,10 @@ import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.PostMapping;
@ -23,6 +23,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.misc.ExtractHeaderRequest; import stirling.software.SPDF.model.api.misc.ExtractHeaderRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
@RestController @RestController
@ -34,6 +35,13 @@ public class AutoRenameController {
private static final float TITLE_FONT_SIZE_THRESHOLD = 20.0f; private static final float TITLE_FONT_SIZE_THRESHOLD = 20.0f;
private static final int LINE_LIMIT = 200; private static final int LINE_LIMIT = 200;
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public AutoRenameController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/auto-rename") @PostMapping(consumes = "multipart/form-data", value = "/auto-rename")
@Operation( @Operation(
summary = "Extract header from PDF file", summary = "Extract header from PDF file",
@ -44,7 +52,7 @@ public class AutoRenameController {
MultipartFile file = request.getFileInput(); MultipartFile file = request.getFileInput();
Boolean useFirstTextAsFallback = request.isUseFirstTextAsFallback(); Boolean useFirstTextAsFallback = request.isUseFirstTextAsFallback();
PDDocument document = Loader.loadPDF(file.getBytes()); PDDocument document = pdfDocumentFactory.load(file.getBytes());
PDFTextStripper reader = PDFTextStripper reader =
new PDFTextStripper() { new PDFTextStripper() {
List<LineInfo> lineInfos = new ArrayList<>(); List<LineInfo> lineInfos = new ArrayList<>();

View File

@ -111,9 +111,9 @@ public class AutoSplitPdfController {
summary = "Auto split PDF pages into separate documents", summary = "Auto split PDF pages into separate documents",
description = description =
"This endpoint accepts a PDF file, scans each page for a specific QR code, and" "This endpoint accepts a PDF file, scans each page for a specific QR code, and"
+ " splits the document at the QR code boundaries. The output is a zip file" + " splits the document at the QR code boundaries. The output is a zip file"
+ " containing each separate PDF document. Input:PDF Output:ZIP-PDF" + " containing each separate PDF document. Input:PDF Output:ZIP-PDF"
+ " Type:SISO") + " Type:SISO")
public ResponseEntity<byte[]> autoSplitPdf(@ModelAttribute AutoSplitPdfRequest request) public ResponseEntity<byte[]> autoSplitPdf(@ModelAttribute AutoSplitPdfRequest request)
throws IOException { throws IOException {
MultipartFile file = request.getFileInput(); MultipartFile file = request.getFileInput();

View File

@ -8,7 +8,6 @@ import java.util.List;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream; import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.PDPageTree;
@ -85,7 +84,7 @@ public class BlankPageController {
int threshold = request.getThreshold(); int threshold = request.getThreshold();
float whitePercent = request.getWhitePercent(); float whitePercent = request.getWhitePercent();
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) { try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) {
PDPageTree pages = document.getDocumentCatalog().getPages(); PDPageTree pages = document.getDocumentCatalog().getPages();
PDFTextStripper textStripper = new PDFTextStripper(); PDFTextStripper textStripper = new PDFTextStripper();

View File

@ -18,7 +18,6 @@ import javax.imageio.ImageWriter;
import javax.imageio.plugins.jpeg.JPEGImageWriteParam; import javax.imageio.plugins.jpeg.JPEGImageWriteParam;
import javax.imageio.stream.ImageOutputStream; import javax.imageio.stream.ImageOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
@ -59,7 +58,8 @@ public class CompressController {
this.pdfDocumentFactory = pdfDocumentFactory; this.pdfDocumentFactory = pdfDocumentFactory;
} }
private void compressImagesInPDF(Path pdfFile, double scaleFactor, float jpegQuality) throws Exception { private void compressImagesInPDF(Path pdfFile, double scaleFactor, float jpegQuality)
throws Exception {
byte[] fileBytes = Files.readAllBytes(pdfFile); byte[] fileBytes = Files.readAllBytes(pdfFile);
long originalFileSize = fileBytes.length; long originalFileSize = fileBytes.length;
log.info( log.info(
@ -71,7 +71,7 @@ public class CompressController {
// Track processed images to avoid recompression // Track processed images to avoid recompression
Set<String> processedImages = new HashSet<>(); Set<String> processedImages = new HashSet<>();
try (PDDocument doc = Loader.loadPDF(fileBytes)) { try (PDDocument doc = pdfDocumentFactory.load(fileBytes)) {
int totalImages = 0; int totalImages = 0;
int compressedImages = 0; int compressedImages = 0;
int skippedImages = 0; int skippedImages = 0;
@ -204,10 +204,12 @@ public class CompressController {
// Choose appropriate format and compression // Choose appropriate format and compression
String format = bufferedImage.getColorModel().hasAlpha() ? "png" : "jpeg"; String format = bufferedImage.getColorModel().hasAlpha() ? "png" : "jpeg";
// First get the actual size of the original image by encoding it to the chosen format // First get the actual size of the original image by encoding it to the chosen
// format
ByteArrayOutputStream originalImageStream = new ByteArrayOutputStream(); ByteArrayOutputStream originalImageStream = new ByteArrayOutputStream();
if (format.equals("jpeg")) { if (format.equals("jpeg")) {
// Get the best available JPEG writer (prioritizes TwelveMonkeys if available) // Get the best available JPEG writer (prioritizes TwelveMonkeys if
// available)
Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("jpeg"); Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("jpeg");
ImageWriter writer = null; ImageWriter writer = null;
@ -430,8 +432,8 @@ public class CompressController {
// All levels (1-9): Apply QPDF compression // All levels (1-9): Apply QPDF compression
if (!qpdfCompressionApplied) { if (!qpdfCompressionApplied) {
long preQpdfSize = Files.size(tempInputFile); long preQpdfSize = Files.size(tempInputFile);
log.info("Pre-QPDF file size: {}", GeneralUtils.formatBytes(preQpdfSize)); log.info("Pre-QPDF file size: {}", GeneralUtils.formatBytes(preQpdfSize));
// For levels 1-3, map to qpdf compression levels 1-9 // For levels 1-3, map to qpdf compression levels 1-9
int qpdfCompressionLevel = optimizeLevel; int qpdfCompressionLevel = optimizeLevel;
@ -472,8 +474,7 @@ public class CompressController {
double qpdfReduction = 100.0 - ((postQpdfSize * 100.0) / preQpdfSize); double qpdfReduction = 100.0 - ((postQpdfSize * 100.0) / preQpdfSize);
log.info( log.info(
"Post-QPDF file size: {} (reduced by {:.1f}%)", "Post-QPDF file size: {} (reduced by {:.1f}%)",
GeneralUtils.formatBytes(postQpdfSize), GeneralUtils.formatBytes(postQpdfSize), qpdfReduction);
qpdfReduction);
} else { } else {
tempOutputFile = tempInputFile; tempOutputFile = tempInputFile;

View File

@ -0,0 +1,145 @@
package stirling.software.SPDF.controller.api.misc;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.HashSet;
import java.util.Set;
import org.apache.pdfbox.cos.*;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@RequestMapping("/api/v1/misc")
@Slf4j
@Tag(name = "Misc", description = "Miscellaneous APIs")
public class DecompressPdfController {
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public DecompressPdfController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(value = "/decompress-pdf", consumes = "multipart/form-data")
@Operation(
summary = "Decompress PDF streams",
description = "Fully decompresses all PDF streams including text content")
public ResponseEntity<byte[]> decompressPdf(@ModelAttribute PDFFile request)
throws IOException {
MultipartFile file = request.getFileInput();
try (PDDocument document = pdfDocumentFactory.load(file.getBytes())) {
// Process all objects in document
processAllObjects(document);
// Save with explicit no compression
ByteArrayOutputStream baos = new ByteArrayOutputStream();
document.save(baos, CompressParameters.NO_COMPRESSION);
String outputFilename =
file.getOriginalFilename().replaceFirst("\\.(?=[^.]+$)", "_decompressed.");
return WebResponseUtils.bytesToWebResponse(
baos.toByteArray(), outputFilename, MediaType.APPLICATION_PDF);
}
}
private void processAllObjects(PDDocument document) {
Set<COSBase> processed = new HashSet<>();
COSDocument cosDoc = document.getDocument();
// Process all objects in the document
for (COSObjectKey key : cosDoc.getXrefTable().keySet()) {
COSObject obj = cosDoc.getObjectFromPool(key);
processObject(obj, processed);
}
}
private void processObject(COSBase obj, Set<COSBase> processed) {
// Skip null objects or already processed objects to avoid infinite recursion
if (obj == null || processed.contains(obj)) return;
processed.add(obj);
if (obj instanceof COSObject cosObj) {
processObject(cosObj.getObject(), processed);
} else if (obj instanceof COSDictionary dict) {
processDictionary(dict, processed);
} else if (obj instanceof COSArray array) {
processArray(array, processed);
}
}
private void processDictionary(COSDictionary dict, Set<COSBase> processed) {
// Process all dictionary entries
for (COSName key : dict.keySet()) {
processObject(dict.getDictionaryObject(key), processed);
}
// If this is a stream, decompress it
if (dict instanceof COSStream stream) {
decompressStream(stream);
}
}
private void processArray(COSArray array, Set<COSBase> processed) {
// Process all array elements
for (int i = 0; i < array.size(); i++) {
processObject(array.get(i), processed);
}
}
private void decompressStream(COSStream stream) {
try {
log.debug("Processing stream: {}", stream);
// Only remove filter information if it exists
if (stream.containsKey(COSName.FILTER)
|| stream.containsKey(COSName.DECODE_PARMS)
|| stream.containsKey(COSName.D)) {
// Read the decompressed content first
byte[] decompressedBytes;
try (COSInputStream is = stream.createInputStream()) {
decompressedBytes = IOUtils.toByteArray(is);
}
// Now remove filter information
stream.removeItem(COSName.FILTER);
stream.removeItem(COSName.DECODE_PARMS);
stream.removeItem(COSName.D);
// Write the raw content back
try (OutputStream out = stream.createRawOutputStream()) {
out.write(decompressedBytes);
}
// Set the Length to reflect the new stream size
stream.setInt(COSName.LENGTH, decompressedBytes.length);
}
} catch (IOException e) {
log.error("Error decompressing stream", e);
// Continue processing other streams even if this one fails
}
}
}

View File

@ -14,9 +14,9 @@ import java.util.zip.ZipOutputStream;
import javax.imageio.ImageIO; import javax.imageio.ImageIO;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.PostMapping;
@ -32,6 +32,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.misc.ExtractImageScansRequest; import stirling.software.SPDF.model.api.misc.ExtractImageScansRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.CheckProgramInstall; import stirling.software.SPDF.utils.CheckProgramInstall;
import stirling.software.SPDF.utils.ProcessExecutor; import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult; import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
@ -45,6 +46,13 @@ public class ExtractImageScansController {
private static final String REPLACEFIRST = "[.][^.]+$"; private static final String REPLACEFIRST = "[.][^.]+$";
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public ExtractImageScansController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/extract-image-scans") @PostMapping(consumes = "multipart/form-data", value = "/extract-image-scans")
@Operation( @Operation(
summary = "Extract image scans from an input file", summary = "Extract image scans from an input file",
@ -87,7 +95,8 @@ public class ExtractImageScansController {
// Check if input file is a PDF // Check if input file is a PDF
if ("pdf".equalsIgnoreCase(extension)) { if ("pdf".equalsIgnoreCase(extension)) {
// Load PDF document // Load PDF document
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) { try (PDDocument document =
pdfDocumentFactory.load(form.getFileInput().getBytes())) {
PDFRenderer pdfRenderer = new PDFRenderer(document); PDFRenderer pdfRenderer = new PDFRenderer(document);
pdfRenderer.setSubsamplingAllowed(true); pdfRenderer.setSubsamplingAllowed(true);
int pageCount = document.getNumberOfPages(); int pageCount = document.getNumberOfPages();

View File

@ -20,11 +20,11 @@ import java.util.zip.ZipOutputStream;
import javax.imageio.ImageIO; import javax.imageio.ImageIO;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
@ -40,6 +40,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PDFExtractImagesRequest; import stirling.software.SPDF.model.api.PDFExtractImagesRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.ImageProcessingUtils; import stirling.software.SPDF.utils.ImageProcessingUtils;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
@ -49,6 +50,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Misc", description = "Miscellaneous APIs") @Tag(name = "Misc", description = "Miscellaneous APIs")
public class ExtractImagesController { public class ExtractImagesController {
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public ExtractImagesController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/extract-images") @PostMapping(consumes = "multipart/form-data", value = "/extract-images")
@Operation( @Operation(
summary = "Extract images from a PDF file", summary = "Extract images from a PDF file",
@ -59,7 +67,7 @@ public class ExtractImagesController {
MultipartFile file = request.getFileInput(); MultipartFile file = request.getFileInput();
String format = request.getFormat(); String format = request.getFormat();
boolean allowDuplicates = request.isAllowDuplicates(); boolean allowDuplicates = request.isAllowDuplicates();
PDDocument document = Loader.loadPDF(file.getBytes()); PDDocument document = pdfDocumentFactory.load(file.getBytes());
// Determine if multithreading should be used based on PDF size or number of pages // Determine if multithreading should be used based on PDF size or number of pages
boolean useMultithreading = shouldUseMultithreading(file, document); boolean useMultithreading = shouldUseMultithreading(file, document);

View File

@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api.misc;
import java.awt.image.BufferedImage; import java.awt.image.BufferedImage;
import java.io.IOException; import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.PDPageContentStream;
@ -51,7 +50,7 @@ public class FlattenController {
public ResponseEntity<byte[]> flatten(@ModelAttribute FlattenRequest request) throws Exception { public ResponseEntity<byte[]> flatten(@ModelAttribute FlattenRequest request) throws Exception {
MultipartFile file = request.getFileInput(); MultipartFile file = request.getFileInput();
PDDocument document = Loader.loadPDF(file.getBytes()); PDDocument document = pdfDocumentFactory.load(file.getBytes());
Boolean flattenOnlyForms = request.getFlattenOnlyForms(); Boolean flattenOnlyForms = request.getFlattenOnlyForms();
if (Boolean.TRUE.equals(flattenOnlyForms)) { if (Boolean.TRUE.equals(flattenOnlyForms)) {

View File

@ -7,10 +7,10 @@ import java.util.Calendar;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.WebDataBinder; import org.springframework.web.bind.WebDataBinder;
import org.springframework.web.bind.annotation.*; import org.springframework.web.bind.annotation.*;
@ -23,6 +23,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.misc.MetadataRequest; import stirling.software.SPDF.model.api.misc.MetadataRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
import stirling.software.SPDF.utils.propertyeditor.StringToMapPropertyEditor; import stirling.software.SPDF.utils.propertyeditor.StringToMapPropertyEditor;
@ -32,6 +33,13 @@ import stirling.software.SPDF.utils.propertyeditor.StringToMapPropertyEditor;
@Tag(name = "Misc", description = "Miscellaneous APIs") @Tag(name = "Misc", description = "Miscellaneous APIs")
public class MetadataController { public class MetadataController {
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public MetadataController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
private String checkUndefined(String entry) { private String checkUndefined(String entry) {
// Check if the string is "undefined" // Check if the string is "undefined"
if ("undefined".equals(entry)) { if ("undefined".equals(entry)) {
@ -76,7 +84,7 @@ public class MetadataController {
allRequestParams = new java.util.HashMap<String, String>(); allRequestParams = new java.util.HashMap<String, String>();
} }
// Load the PDF file into a PDDocument // Load the PDF file into a PDDocument
PDDocument document = Loader.loadPDF(pdfFile.getBytes()); PDDocument document = pdfDocumentFactory.load(pdfFile.getBytes());
// Get the document information from the PDF // Get the document information from the PDF
PDDocumentInformation info = document.getDocumentInformation(); PDDocumentInformation info = document.getDocumentInformation();

View File

@ -73,17 +73,16 @@ public class PageNumbersController {
case "x-large": case "x-large":
marginFactor = 0.075f; marginFactor = 0.075f;
break; break;
default: default:
marginFactor = 0.035f; marginFactor = 0.035f;
break; break;
} }
float fontSize = font_size; float fontSize = font_size;
if (pagesToNumber == null || pagesToNumber.length() == 0) { if (pagesToNumber == null || pagesToNumber.isEmpty()) {
pagesToNumber = "all"; pagesToNumber = "all";
} }
if (customText == null || customText.length() == 0) { if (customText == null || customText.isEmpty()) {
customText = "{n}"; customText = "{n}";
} }
List<Integer> pagesToNumberList = List<Integer> pagesToNumberList =
@ -94,63 +93,69 @@ public class PageNumbersController {
PDRectangle pageSize = page.getMediaBox(); PDRectangle pageSize = page.getMediaBox();
String text = String text =
customText != null customText
? customText .replace("{n}", String.valueOf(pageNumber))
.replace("{n}", String.valueOf(pageNumber)) .replace("{total}", String.valueOf(document.getNumberOfPages()))
.replace("{total}", String.valueOf(document.getNumberOfPages())) .replace(
.replace( "{filename}",
"{filename}", Filenames.toSimpleFileName(file.getOriginalFilename())
Filenames.toSimpleFileName(file.getOriginalFilename()) .replaceFirst("[.][^.]+$", ""));
.replaceFirst("[.][^.]+$", ""))
: String.valueOf(pageNumber); PDType1Font currentFont =
switch (font_type.toLowerCase()) {
case "courier" -> new PDType1Font(Standard14Fonts.FontName.COURIER);
case "times" -> new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN);
default -> new PDType1Font(Standard14Fonts.FontName.HELVETICA);
};
float x, y; float x, y;
int xGroup = (position - 1) % 3; if (position == 5) {
int yGroup = 2 - (position - 1) / 3; // Calculate text width and font metrics
float textWidth = currentFont.getStringWidth(text) / 1000 * fontSize;
switch (xGroup) { float ascent = currentFont.getFontDescriptor().getAscent() / 1000 * fontSize;
case 0: // left float descent = currentFont.getFontDescriptor().getDescent() / 1000 * fontSize;
x = pageSize.getLowerLeftX() + marginFactor * pageSize.getWidth();
break;
case 1: // center
x = pageSize.getLowerLeftX() + (pageSize.getWidth() / 2);
break;
default: // right
x = pageSize.getUpperRightX() - marginFactor * pageSize.getWidth();
break;
}
switch (yGroup) { float centerX = pageSize.getLowerLeftX() + (pageSize.getWidth() / 2);
case 0: // bottom float centerY = pageSize.getLowerLeftY() + (pageSize.getHeight() / 2);
y = pageSize.getLowerLeftY() + marginFactor * pageSize.getHeight();
break; x = centerX - (textWidth / 2);
case 1: // middle y = centerY - (ascent + descent) / 2;
y = pageSize.getLowerLeftY() + (pageSize.getHeight() / 2); } else {
break; int xGroup = (position - 1) % 3;
default: // top int yGroup = 2 - (position - 1) / 3;
y = pageSize.getUpperRightY() - marginFactor * pageSize.getHeight();
break; x =
switch (xGroup) {
case 0 ->
pageSize.getLowerLeftX()
+ marginFactor * pageSize.getWidth(); // left
case 1 ->
pageSize.getLowerLeftX() + (pageSize.getWidth() / 2); // center
default ->
pageSize.getUpperRightX()
- marginFactor * pageSize.getWidth(); // right
};
y =
switch (yGroup) {
case 0 ->
pageSize.getLowerLeftY()
+ marginFactor * pageSize.getHeight(); // bottom
case 1 ->
pageSize.getLowerLeftY() + (pageSize.getHeight() / 2); // middle
default ->
pageSize.getUpperRightY()
- marginFactor * pageSize.getHeight(); // top
};
} }
PDPageContentStream contentStream = PDPageContentStream contentStream =
new PDPageContentStream( new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true); document, page, PDPageContentStream.AppendMode.APPEND, true, true);
contentStream.beginText(); contentStream.beginText();
switch (font_type.toLowerCase()) { contentStream.setFont(currentFont, fontSize);
case "helvetica":
contentStream.setFont(
new PDType1Font(Standard14Fonts.FontName.HELVETICA), fontSize);
break;
case "courier":
contentStream.setFont(
new PDType1Font(Standard14Fonts.FontName.COURIER), fontSize);
break;
case "times":
contentStream.setFont(
new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN), fontSize);
break;
}
contentStream.newLineAtOffset(x, y); contentStream.newLineAtOffset(x, y);
contentStream.showText(text); contentStream.showText(text);
contentStream.endText(); contentStream.endText();

View File

@ -3,10 +3,10 @@ package stirling.software.SPDF.controller.api.misc;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Map; import java.util.Map;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript; import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
@ -20,6 +20,7 @@ import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag; import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.PDFFile; import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
@RestController @RestController
@ -27,6 +28,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Misc", description = "Miscellaneous APIs") @Tag(name = "Misc", description = "Miscellaneous APIs")
public class ShowJavascript { public class ShowJavascript {
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public ShowJavascript(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/show-javascript") @PostMapping(consumes = "multipart/form-data", value = "/show-javascript")
@Operation( @Operation(
summary = "Grabs all JS from a PDF and returns a single JS file with all code", summary = "Grabs all JS from a PDF and returns a single JS file with all code",
@ -35,7 +43,7 @@ public class ShowJavascript {
MultipartFile inputFile = request.getFileInput(); MultipartFile inputFile = request.getFileInput();
String script = ""; String script = "";
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) { try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) {
if (document.getDocumentCatalog() != null if (document.getDocumentCatalog() != null
&& document.getDocumentCatalog().getNames() != null) { && document.getDocumentCatalog().getNames() != null) {

View File

@ -130,8 +130,8 @@ public class CertSignController {
summary = "Sign PDF with a Digital Certificate", summary = "Sign PDF with a Digital Certificate",
description = description =
"This endpoint accepts a PDF file, a digital certificate and related" "This endpoint accepts a PDF file, a digital certificate and related"
+ " information to sign the PDF. It then returns the digitally signed PDF" + " information to sign the PDF. It then returns the digitally signed PDF"
+ " file. Input:PDF Output:PDF Type:SISO") + " file. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> signPDFWithCert(@ModelAttribute SignPDFWithCertRequest request) public ResponseEntity<byte[]> signPDFWithCert(@ModelAttribute SignPDFWithCertRequest request)
throws Exception { throws Exception {
MultipartFile pdf = request.getFileInput(); MultipartFile pdf = request.getFileInput();

View File

@ -6,7 +6,6 @@ import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.*; import java.util.*;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSInputStream; import org.apache.pdfbox.cos.COSInputStream;
import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.cos.COSString;
@ -44,6 +43,7 @@ import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.xml.DomXmpParser; import org.apache.xmpbox.xml.DomXmpParser;
import org.apache.xmpbox.xml.XmpParsingException; import org.apache.xmpbox.xml.XmpParsingException;
import org.apache.xmpbox.xml.XmpSerializer; import org.apache.xmpbox.xml.XmpSerializer;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
@ -62,6 +62,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PDFFile; import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
@RestController @RestController
@ -72,6 +73,13 @@ public class GetInfoOnPDF {
static ObjectMapper objectMapper = new ObjectMapper(); static ObjectMapper objectMapper = new ObjectMapper();
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public GetInfoOnPDF(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
private static void addOutlinesToArray(PDOutlineItem outline, ArrayNode arrayNode) { private static void addOutlinesToArray(PDOutlineItem outline, ArrayNode arrayNode) {
if (outline == null) return; if (outline == null) return;
@ -118,7 +126,7 @@ public class GetInfoOnPDF {
@Operation(summary = "Summary here", description = "desc. Input:PDF Output:JSON Type:SISO") @Operation(summary = "Summary here", description = "desc. Input:PDF Output:JSON Type:SISO")
public ResponseEntity<byte[]> getPdfInfo(@ModelAttribute PDFFile request) throws IOException { public ResponseEntity<byte[]> getPdfInfo(@ModelAttribute PDFFile request) throws IOException {
MultipartFile inputFile = request.getFileInput(); MultipartFile inputFile = request.getFileInput();
try (PDDocument pdfBoxDoc = Loader.loadPDF(inputFile.getBytes()); ) { try (PDDocument pdfBoxDoc = pdfDocumentFactory.load(inputFile.getBytes()); ) {
ObjectMapper objectMapper = new ObjectMapper(); ObjectMapper objectMapper = new ObjectMapper();
ObjectNode jsonOutput = objectMapper.createObjectNode(); ObjectNode jsonOutput = objectMapper.createObjectNode();

View File

@ -3,8 +3,11 @@ package stirling.software.SPDF.controller.api.security;
import java.awt.*; import java.awt.*;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
@ -94,7 +97,10 @@ public class RedactController {
private void redactAreas( private void redactAreas(
List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages) List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
throws IOException { throws IOException {
Color redactColor = null; // Group redaction areas by page
Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>();
// Process and validate each redaction area
for (RedactionArea redactionArea : redactionAreas) { for (RedactionArea redactionArea : redactionAreas) {
if (redactionArea.getPage() == null if (redactionArea.getPage() == null
|| redactionArea.getPage() <= 0 || redactionArea.getPage() <= 0
@ -102,23 +108,44 @@ public class RedactController {
|| redactionArea.getHeight() <= 0.0D || redactionArea.getHeight() <= 0.0D
|| redactionArea.getWidth() == null || redactionArea.getWidth() == null
|| redactionArea.getWidth() <= 0.0D) continue; || redactionArea.getWidth() <= 0.0D) continue;
PDPage page = allPages.get(redactionArea.getPage() - 1);
// Group by page number
redactionsByPage
.computeIfAbsent(redactionArea.getPage(), k -> new ArrayList<>())
.add(redactionArea);
}
// Process each page only once
for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) {
Integer pageNumber = entry.getKey();
List<RedactionArea> areasForPage = entry.getValue();
if (pageNumber > allPages.getCount()) {
continue; // Skip if page number is out of bounds
}
PDPage page = allPages.get(pageNumber - 1);
PDRectangle box = page.getBBox();
// Create only one content stream per page
PDPageContentStream contentStream = PDPageContentStream contentStream =
new PDPageContentStream( new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true); document, page, PDPageContentStream.AppendMode.APPEND, true, true);
redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK);
contentStream.setNonStrokingColor(redactColor);
float x = redactionArea.getX().floatValue(); // Process all redactions for this page
float y = redactionArea.getY().floatValue(); for (RedactionArea redactionArea : areasForPage) {
float width = redactionArea.getWidth().floatValue(); Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK);
float height = redactionArea.getHeight().floatValue(); contentStream.setNonStrokingColor(redactColor);
PDRectangle box = page.getBBox(); float x = redactionArea.getX().floatValue();
float y = redactionArea.getY().floatValue();
float width = redactionArea.getWidth().floatValue();
float height = redactionArea.getHeight().floatValue();
contentStream.addRect(x, box.getHeight() - y - height, width, height);
contentStream.fill();
}
contentStream.addRect(x, box.getHeight() - y - height, width, height);
contentStream.fill();
contentStream.close(); contentStream.close();
} }
} }

View File

@ -45,7 +45,7 @@ public class SanitizeController {
summary = "Sanitize a PDF file", summary = "Sanitize a PDF file",
description = description =
"This endpoint processes a PDF file and removes specific elements based on the" "This endpoint processes a PDF file and removes specific elements based on the"
+ " provided options. Input:PDF Output:PDF Type:SISO") + " provided options. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> sanitizePDF(@ModelAttribute SanitizePdfRequest request) public ResponseEntity<byte[]> sanitizePDF(@ModelAttribute SanitizePdfRequest request)
throws IOException { throws IOException {
MultipartFile inputFile = request.getFileInput(); MultipartFile inputFile = request.getFileInput();

View File

@ -1,5 +1,8 @@
package stirling.software.SPDF.model; package stirling.software.SPDF.model;
import lombok.Data;
@Data
public class PDFText { public class PDFText {
private final int pageIndex; private final int pageIndex;
private final float x1; private final float x1;
@ -7,37 +10,4 @@ public class PDFText {
private final float x2; private final float x2;
private final float y2; private final float y2;
private final String text; private final String text;
public PDFText(int pageIndex, float x1, float y1, float x2, float y2, String text) {
this.pageIndex = pageIndex;
this.x1 = x1;
this.y1 = y1;
this.x2 = x2;
this.y2 = y2;
this.text = text;
}
public int getPageIndex() {
return pageIndex;
}
public float getX1() {
return x1;
}
public float getY1() {
return y1;
}
public float getX2() {
return x2;
}
public float getY2() {
return y2;
}
public String getText() {
return text;
}
} }

View File

@ -4,142 +4,355 @@ import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.pdfbox.Loader; import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction;
import org.apache.pdfbox.io.ScratchFile;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile; import org.springframework.web.multipart.MultipartFile;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.PdfMetadata;
import stirling.software.SPDF.model.api.PDFFile; import stirling.software.SPDF.model.api.PDFFile;
/**
* Adaptive PDF document factory that optimizes memory usage based on file size and available system
* resources.
*/
@Component @Component
@Slf4j @Slf4j
public class CustomPDDocumentFactory { public class CustomPDDocumentFactory {
private final PdfMetadataService pdfMetadataService; private final PdfMetadataService pdfMetadataService;
@Autowired // Memory thresholds and limits
private static final long SMALL_FILE_THRESHOLD = 10 * 1024 * 1024; // 10 MB
// Files smaller than this threshold are loaded entirely in memory for better performance.
// These files use IOUtils.createMemoryOnlyStreamCache() which keeps all document data in RAM.
// No temp files are created for document data, reducing I/O operations but consuming more
// memory.
private static final long LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50 MB
// Files between SMALL and LARGE thresholds use file-based caching with ScratchFile,
// but are loaded directly from byte arrays if provided that way.
// When loading from byte arrays, once size exceeds this threshold, bytes are first
// written to temp files before loading to reduce memory pressure.
private static final long LARGE_FILE_USAGE = 10 * 1024 * 1024;
private static final long EXTREMELY_LARGE_THRESHOLD = 100 * 1024 * 1024; // 100 MB
// Files exceeding this threshold use specialized loading with RandomAccessReadBufferedFile
// which provides buffered access to the file without loading the entire content at once.
// These files are always processed using file-based caching with minimal memory footprint,
// trading some performance for significantly reduced memory usage.
// For extremely large PDFs, this prevents OutOfMemoryErrors at the cost of being more I/O
// bound.
private static final double MIN_FREE_MEMORY_PERCENTAGE = 30.0; // 30%
private static final long MIN_FREE_MEMORY_BYTES = 4L * 1024 * 1024 * 1024; // 4 GB
// Counter for tracking temporary resources
private static final AtomicLong tempCounter = new AtomicLong(0);
public CustomPDDocumentFactory(PdfMetadataService pdfMetadataService) { public CustomPDDocumentFactory(PdfMetadataService pdfMetadataService) {
this.pdfMetadataService = pdfMetadataService; this.pdfMetadataService = pdfMetadataService;
} }
public PDDocument createNewDocument() throws IOException { /**
PDDocument document = new PDDocument(); * Main entry point for loading a PDF document from a file. Automatically selects the most
pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true); * appropriate loading strategy.
*/
public PDDocument load(File file) throws IOException {
if (file == null) {
throw new IllegalArgumentException("File cannot be null");
}
long fileSize = file.length();
log.info("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024));
return loadAdaptively(file, fileSize);
}
/** Load a PDF from byte array with automatic optimization. */
public PDDocument load(byte[] input) throws IOException {
if (input == null) {
throw new IllegalArgumentException("Input bytes cannot be null");
}
long dataSize = input.length;
log.info("Loading PDF from byte array, size: {}MB", dataSize / (1024 * 1024));
return loadAdaptively(input, dataSize);
}
/** Load a PDF from InputStream with automatic optimization. */
public PDDocument load(InputStream input) throws IOException {
if (input == null) {
throw new IllegalArgumentException("InputStream cannot be null");
}
// Since we don't know the size upfront, buffer to a temp file
Path tempFile = createTempFile("pdf-stream-");
try {
Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
return loadAdaptively(tempFile.toFile(), Files.size(tempFile));
} catch (IOException e) {
cleanupFile(tempFile);
throw e;
}
}
private PDDocument loadAdaptively(Object source, long contentSize) throws IOException {
long maxMemory = Runtime.getRuntime().maxMemory();
long freeMemory = Runtime.getRuntime().freeMemory();
long totalMemory = Runtime.getRuntime().totalMemory();
long usedMemory = totalMemory - freeMemory;
// Calculate percentage of free memory
double freeMemoryPercent = (double) (maxMemory - usedMemory) / maxMemory * 100;
long actualFreeMemory = maxMemory - usedMemory;
// Log memory status
log.info(
"Memory status - Free: {}MB ({}%), Used: {}MB, Max: {}MB",
actualFreeMemory / (1024 * 1024),
String.format("%.2f", freeMemoryPercent),
usedMemory / (1024 * 1024),
maxMemory / (1024 * 1024));
// Determine caching strategy based on both file size and available memory
StreamCacheCreateFunction cacheFunction;
// If free memory is critically low, always use file-based caching
// In loadAdaptively method, replace current caching strategy decision with:
if (freeMemoryPercent < MIN_FREE_MEMORY_PERCENTAGE
|| actualFreeMemory < MIN_FREE_MEMORY_BYTES) {
log.info(
"Low memory detected ({}%), forcing file-based cache",
String.format("%.2f", freeMemoryPercent));
cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
} else if (contentSize < SMALL_FILE_THRESHOLD) {
log.info("Using memory-only cache for small document ({}KB)", contentSize / 1024);
cacheFunction = IOUtils.createMemoryOnlyStreamCache();
} else if (contentSize < LARGE_FILE_THRESHOLD) {
// For medium files (10-50MB), use a mixed approach
log.info(
"Using mixed memory/file cache for medium document ({}MB)",
contentSize / (1024 * 1024));
cacheFunction =
createScratchFileCacheFunction(MemoryUsageSetting.setupMixed(LARGE_FILE_USAGE));
} else {
log.info("Using file-based cache for large document");
cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
}
PDDocument document;
if (source instanceof File file) {
document = loadFromFile(file, contentSize, cacheFunction);
} else if (source instanceof byte[] bytes) {
document = loadFromBytes(bytes, contentSize, cacheFunction);
} else {
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
}
postProcessDocument(document);
return document; return document;
} }
private StreamCacheCreateFunction createScratchFileCacheFunction(MemoryUsageSetting settings) {
return () -> {
try {
return new ScratchFile(settings);
} catch (IOException e) {
throw new RuntimeException("ScratchFile initialization failed", e);
}
};
}
private void postProcessDocument(PDDocument doc) throws IOException {
pdfMetadataService.setDefaultMetadata(doc);
removePassword(doc);
}
private PDDocument loadFromFile(File file, long size, StreamCacheCreateFunction cache)
throws IOException {
if (size >= EXTREMELY_LARGE_THRESHOLD) {
log.info("Loading extremely large file via buffered access");
return Loader.loadPDF(new RandomAccessReadBufferedFile(file), "", null, null, cache);
}
return Loader.loadPDF(file, "", null, null, cache);
}
private PDDocument loadFromBytes(byte[] bytes, long size, StreamCacheCreateFunction cache)
throws IOException {
if (size >= SMALL_FILE_THRESHOLD) {
log.info("Writing large byte array to temp file");
Path tempFile = createTempFile("pdf-bytes-");
try {
Files.write(tempFile, bytes);
return Loader.loadPDF(tempFile.toFile(), "", null, null, cache);
} finally {
cleanupFile(tempFile);
}
}
return Loader.loadPDF(bytes, "", null, null, cache);
}
public PDDocument createNewDocument(MemoryUsageSetting settings) throws IOException {
PDDocument doc = new PDDocument(createScratchFileCacheFunction(settings));
pdfMetadataService.setDefaultMetadata(doc);
return doc;
}
public PDDocument createNewDocument() throws IOException {
return createNewDocument(MemoryUsageSetting.setupTempFileOnly());
}
public byte[] saveToBytes(PDDocument document) throws IOException {
if (document.getNumberOfPages() < 10) { // Simple heuristic
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
document.save(baos);
return baos.toByteArray();
}
} else {
Path tempFile = createTempFile("pdf-save-");
try {
document.save(tempFile.toFile());
return Files.readAllBytes(tempFile);
} finally {
cleanupFile(tempFile);
}
}
}
// Improved password handling
private void removePassword(PDDocument document) throws IOException {
if (document.isEncrypted()) {
try {
document.setAllSecurityToBeRemoved(true);
} catch (Exception e) {
log.error("Decryption failed", e);
throw new IOException("PDF decryption failed", e);
}
}
}
// Temp file handling with enhanced logging
private Path createTempFile(String prefix) throws IOException {
Path file = Files.createTempFile(prefix + tempCounter.incrementAndGet() + "-", ".tmp");
log.info("Created temp file: {}", file);
return file;
}
/** Create a uniquely named temporary directory */
private Path createTempDirectory(String prefix) throws IOException {
return Files.createTempDirectory(prefix + tempCounter.incrementAndGet() + "-");
}
/** Clean up a temporary file */
private void cleanupFile(Path file) {
try {
if (Files.deleteIfExists(file)) {
log.info("Deleted temp file: {}", file);
}
} catch (IOException e) {
log.info("Error deleting temp file {}", file, e);
}
}
/** Create new document bytes based on an existing document */
public byte[] createNewBytesBasedOnOldDocument(byte[] oldDocument) throws IOException { public byte[] createNewBytesBasedOnOldDocument(byte[] oldDocument) throws IOException {
PDDocument document = Loader.loadPDF(oldDocument); try (PDDocument document = load(oldDocument)) {
return createNewBytesBasedOnOldDocument(document); return saveToBytes(document);
}
} }
/** Create new document bytes based on an existing document file */
public byte[] createNewBytesBasedOnOldDocument(File oldDocument) throws IOException { public byte[] createNewBytesBasedOnOldDocument(File oldDocument) throws IOException {
PDDocument document = Loader.loadPDF(oldDocument); try (PDDocument document = load(oldDocument)) {
return createNewBytesBasedOnOldDocument(document); return saveToBytes(document);
}
} }
/** Create new document bytes based on an existing PDDocument */
public byte[] createNewBytesBasedOnOldDocument(PDDocument oldDocument) throws IOException { public byte[] createNewBytesBasedOnOldDocument(PDDocument oldDocument) throws IOException {
pdfMetadataService.setMetadataToPdf( pdfMetadataService.setMetadataToPdf(
oldDocument, pdfMetadataService.extractMetadataFromPdf(oldDocument), true); oldDocument, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
return saveToBytes(oldDocument);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
oldDocument.save(baos);
oldDocument.close();
return baos.toByteArray();
} }
/** Create a new document based on an existing document bytes */
public PDDocument createNewDocumentBasedOnOldDocument(byte[] oldDocument) throws IOException { public PDDocument createNewDocumentBasedOnOldDocument(byte[] oldDocument) throws IOException {
PDDocument document = Loader.loadPDF(oldDocument); try (PDDocument document = load(oldDocument)) {
return createNewDocumentBasedOnOldDocument(document); return createNewDocumentBasedOnOldDocument(document);
}
} }
/** Create a new document based on an existing document file */
public PDDocument createNewDocumentBasedOnOldDocument(File oldDocument) throws IOException { public PDDocument createNewDocumentBasedOnOldDocument(File oldDocument) throws IOException {
PDDocument document = Loader.loadPDF(oldDocument); try (PDDocument document = load(oldDocument)) {
return createNewDocumentBasedOnOldDocument(document); return createNewDocumentBasedOnOldDocument(document);
}
} }
/** Create a new document based on an existing PDDocument */
public PDDocument createNewDocumentBasedOnOldDocument(PDDocument oldDocument) public PDDocument createNewDocumentBasedOnOldDocument(PDDocument oldDocument)
throws IOException { throws IOException {
PDDocument document = new PDDocument(); PDDocument document = createNewDocument();
pdfMetadataService.setMetadataToPdf( pdfMetadataService.setMetadataToPdf(
document, pdfMetadataService.extractMetadataFromPdf(oldDocument), true); document, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
return document; return document;
} }
/** Load document from a file and convert it to bytes */
public byte[] loadToBytes(File file) throws IOException { public byte[] loadToBytes(File file) throws IOException {
PDDocument document = load(file); try (PDDocument document = load(file)) {
ByteArrayOutputStream baos = new ByteArrayOutputStream(); return saveToBytes(document);
document.save(baos); }
// Close the document
document.close();
return baos.toByteArray();
} }
/** Load document from bytes and convert it back to bytes */
public byte[] loadToBytes(byte[] bytes) throws IOException { public byte[] loadToBytes(byte[] bytes) throws IOException {
PDDocument document = load(bytes); try (PDDocument document = load(bytes)) {
ByteArrayOutputStream baos = new ByteArrayOutputStream(); return saveToBytes(document);
document.save(baos); }
// Close the document
document.close();
return baos.toByteArray();
}
// if loading from a file, assume the file has been made with Stirling-PDF
public PDDocument load(File file) throws IOException {
PDDocument document = Loader.loadPDF(file);
pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true);
return document;
}
public PDDocument load(InputStream input) throws IOException {
return load(input.readAllBytes());
}
public PDDocument load(byte[] input) throws IOException {
PDDocument document = Loader.loadPDF(input);
pdfMetadataService.setDefaultMetadata(document);
removezeropassword(document);
return document;
}
public PDDocument load(PDFFile pdfFile) throws IOException {
return load(pdfFile.getFileInput());
}
public PDDocument load(MultipartFile pdfFile) throws IOException {
return load(pdfFile.getBytes());
} }
/** Load from a file path string */
public PDDocument load(String path) throws IOException { public PDDocument load(String path) throws IOException {
return load(new File(path)); return load(new File(path));
} }
/** Load from a PDFFile object */
public PDDocument load(PDFFile pdfFile) throws IOException {
return load(pdfFile.getFileInput());
}
/** Load from a MultipartFile */
public PDDocument load(MultipartFile pdfFile) throws IOException {
return load(pdfFile.getBytes());
}
/** Load with password from MultipartFile */
public PDDocument load(MultipartFile fileInput, String password) throws IOException { public PDDocument load(MultipartFile fileInput, String password) throws IOException {
return load(fileInput.getBytes(), password); return load(fileInput.getBytes(), password);
} }
/** Load with password from byte array */
private PDDocument load(byte[] bytes, String password) throws IOException { private PDDocument load(byte[] bytes, String password) throws IOException {
// Since we don't have direct password support in the adaptive loader,
// we'll need to use PDFBox's Loader directly
PDDocument document = Loader.loadPDF(bytes, password); PDDocument document = Loader.loadPDF(bytes, password);
pdfMetadataService.setDefaultMetadata(document); pdfMetadataService.setDefaultMetadata(document);
return document; return document;
} }
private PDDocument removezeropassword(PDDocument document) throws IOException {
if (document.isEncrypted()) {
try {
log.info("Removing security from the source document");
document.setAllSecurityToBeRemoved(true);
} catch (Exception e) {
log.warn("Cannot decrypt the pdf");
}
}
return document;
}
// Add other load methods as needed, following the same pattern
} }

View File

@ -14,7 +14,6 @@ import java.util.zip.ZipOutputStream;
import javax.imageio.*; import javax.imageio.*;
import javax.imageio.stream.ImageOutputStream; import javax.imageio.stream.ImageOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
@ -128,6 +127,7 @@ public class PdfUtils {
} }
public static byte[] convertFromPdf( public static byte[] convertFromPdf(
CustomPDDocumentFactory pdfDocumentFactory,
byte[] inputStream, byte[] inputStream,
String imageType, String imageType,
ImageType colorType, ImageType colorType,
@ -135,7 +135,7 @@ public class PdfUtils {
int DPI, int DPI,
String filename) String filename)
throws IOException, Exception { throws IOException, Exception {
try (PDDocument document = Loader.loadPDF(inputStream)) { try (PDDocument document = pdfDocumentFactory.load(inputStream)) {
PDFRenderer pdfRenderer = new PDFRenderer(document); PDFRenderer pdfRenderer = new PDFRenderer(document);
pdfRenderer.setSubsamplingAllowed(true); pdfRenderer.setSubsamplingAllowed(true);
int pageCount = document.getNumberOfPages(); int pageCount = document.getNumberOfPages();