Memory enhancements and PDF decompress API (#3129)

# Description of Changes

- PDF split by size now checks the size of the PDF as it splits, which
avoids an issue where a PDF's size differs when viewed vs. saved due to
compression caused by repeated data, etc.
- Additionally, memory enhancements for PDF loading, which now dynamically
loads in memory vs. scratch
- PDF Decompress API for PDF testing


## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing)
for more details.
This commit is contained in:
Anthony Stirling 2025-03-08 00:03:27 +00:00 committed by GitHub
parent 33eb3fd034
commit ed2ef01690
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
43 changed files with 1042 additions and 321 deletions

View File

@ -124,7 +124,7 @@ These files provide pre-configured setups for different scenarios. For example,
services:
stirling-pdf:
container_name: Stirling-PDF-Security
image: stirlingtools/stirling-pdf:latest
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
deploy:
resources:
limits:

View File

@ -25,7 +25,7 @@ ext {
}
group = "stirling.software"
version = "0.43.2"
version = "0.44.0"
java {
// 17 is lowest but we support and recommend 21

View File

@ -1,7 +1,7 @@
services:
stirling-pdf:
container_name: Stirling-PDF-Security-Fat-Postgres
image: stirlingtools/stirling-pdf:latest-fat-postgres
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-fat-postgres
deploy:
resources:
limits:

View File

@ -1,7 +1,7 @@
services:
stirling-pdf:
container_name: Stirling-PDF-Security-Fat
image: stirlingtools/stirling-pdf:latest-fat
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-fat
deploy:
resources:
limits:

View File

@ -1,7 +1,7 @@
services:
stirling-pdf:
container_name: Stirling-PDF-Security
image: stirlingtools/stirling-pdf:latest
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
deploy:
resources:
limits:

View File

@ -1,7 +1,7 @@
services:
stirling-pdf:
container_name: Stirling-PDF-Security
image: stirlingtools/stirling-pdf:latest
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
deploy:
resources:
limits:

View File

@ -1,7 +1,7 @@
services:
stirling-pdf:
container_name: Stirling-PDF-Ultra-Lite-Security
image: stirlingtools/stirling-pdf:latest-ultra-lite
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-ultra-lite
deploy:
resources:
limits:

View File

@ -1,7 +1,7 @@
services:
stirling-pdf:
container_name: Stirling-PDF-Ultra-Lite
image: stirlingtools/stirling-pdf:latest-ultra-lite
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-ultra-lite
deploy:
resources:
limits:

View File

@ -1,7 +1,7 @@
services:
stirling-pdf:
container_name: Stirling-PDF
image: stirlingtools/stirling-pdf:latest
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
deploy:
resources:
limits:

View File

@ -11,6 +11,7 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.ApplicationProperties;
@Service

View File

@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api;
import java.io.IOException;
import java.util.*;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
@ -12,24 +11,33 @@ import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.encryption.PDEncryption;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.*;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
@RestController
@RequestMapping("/api/v1/analysis")
@Tag(name = "Analysis", description = "Analysis APIs")
public class AnalysisController {
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public AnalysisController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(value = "/page-count", consumes = "multipart/form-data")
@Operation(
summary = "Get PDF page count",
description = "Returns total number of pages in PDF. Input:PDF Output:JSON Type:SISO")
public Map<String, Integer> getPageCount(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
return Map.of("pageCount", document.getNumberOfPages());
}
}
@ -39,7 +47,7 @@ public class AnalysisController {
summary = "Get basic PDF information",
description = "Returns page count, version, file size. Input:PDF Output:JSON Type:SISO")
public Map<String, Object> getBasicInfo(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
Map<String, Object> info = new HashMap<>();
info.put("pageCount", document.getNumberOfPages());
info.put("pdfVersion", document.getVersion());
@ -54,7 +62,7 @@ public class AnalysisController {
description = "Returns title, author, subject, etc. Input:PDF Output:JSON Type:SISO")
public Map<String, String> getDocumentProperties(@ModelAttribute PDFFile file)
throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
PDDocumentInformation info = document.getDocumentInformation();
Map<String, String> properties = new HashMap<>();
properties.put("title", info.getTitle());
@ -75,7 +83,7 @@ public class AnalysisController {
description = "Returns width and height of each page. Input:PDF Output:JSON Type:SISO")
public List<Map<String, Float>> getPageDimensions(@ModelAttribute PDFFile file)
throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
List<Map<String, Float>> dimensions = new ArrayList<>();
PDPageTree pages = document.getPages();
@ -95,7 +103,7 @@ public class AnalysisController {
description =
"Returns count and details of form fields. Input:PDF Output:JSON Type:SISO")
public Map<String, Object> getFormFields(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
Map<String, Object> formInfo = new HashMap<>();
PDAcroForm form = document.getDocumentCatalog().getAcroForm();
@ -117,7 +125,7 @@ public class AnalysisController {
summary = "Get annotation information",
description = "Returns count and types of annotations. Input:PDF Output:JSON Type:SISO")
public Map<String, Object> getAnnotationInfo(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
Map<String, Object> annotInfo = new HashMap<>();
int totalAnnotations = 0;
Map<String, Integer> annotationTypes = new HashMap<>();
@ -142,7 +150,7 @@ public class AnalysisController {
description =
"Returns list of fonts used in the document. Input:PDF Output:JSON Type:SISO")
public Map<String, Object> getFontInfo(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
Map<String, Object> fontInfo = new HashMap<>();
Set<String> fontNames = new HashSet<>();
@ -164,7 +172,7 @@ public class AnalysisController {
description =
"Returns encryption and permission details. Input:PDF Output:JSON Type:SISO")
public Map<String, Object> getSecurityInfo(@ModelAttribute PDFFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
Map<String, Object> securityInfo = new HashMap<>();
PDEncryption encryption = document.getEncryption();

View File

@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@ -23,7 +22,6 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.general.CropPdfForm;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.service.PostHogService;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@ -33,13 +31,9 @@ public class CropController {
private final CustomPDDocumentFactory pdfDocumentFactory;
private final PostHogService postHogService;
@Autowired
public CropController(
CustomPDDocumentFactory pdfDocumentFactory, PostHogService postHogService) {
public CropController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
this.postHogService = postHogService;
}
@PostMapping(value = "/crop", consumes = "multipart/form-data")
@ -48,7 +42,7 @@ public class CropController {
description =
"This operation takes an input PDF file and crops it according to the given coordinates. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> cropPdf(@ModelAttribute CropPdfForm form) throws IOException {
PDDocument sourceDocument = Loader.loadPDF(form.getFileInput().getBytes());
PDDocument sourceDocument = pdfDocumentFactory.load(form.getFileInput().getBytes());
PDDocument newDocument =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);

View File

@ -12,7 +12,6 @@ import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
@ -101,8 +100,8 @@ public class MergeController {
};
case "byPDFTitle":
return (file1, file2) -> {
try (PDDocument doc1 = Loader.loadPDF(file1.getBytes());
PDDocument doc2 = Loader.loadPDF(file2.getBytes())) {
try (PDDocument doc1 = pdfDocumentFactory.load(file1.getBytes());
PDDocument doc2 = pdfDocumentFactory.load(file2.getBytes())) {
String title1 = doc1.getDocumentInformation().getTitle();
String title2 = doc2.getDocumentInformation().getTitle();
return title1.compareTo(title2);
@ -152,7 +151,7 @@ public class MergeController {
byte[] mergedPdfBytes = docOutputstream.toByteArray(); // Get merged document bytes
// Load the merged PDF document
mergedDocument = Loader.loadPDF(mergedPdfBytes);
mergedDocument = pdfDocumentFactory.load(mergedPdfBytes);
// Remove signatures if removeCertSign is true
if (removeCertSign) {

View File

@ -4,7 +4,6 @@ import java.awt.*;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@ -64,7 +63,7 @@ public class MultiPageLayoutController {
: (int) Math.sqrt(pagesPerSheet);
int rows = pagesPerSheet == 2 || pagesPerSheet == 3 ? 1 : (int) Math.sqrt(pagesPerSheet);
PDDocument sourceDocument = Loader.loadPDF(file.getBytes());
PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes());
PDDocument newDocument =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
PDPage newPage = new PDPage(PDRectangle.A4);

View File

@ -5,7 +5,6 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.beans.factory.annotation.Autowired;
@ -251,7 +250,7 @@ public class RearrangePagesPDFController {
String sortType = request.getCustomMode();
try {
// Load the input PDF
PDDocument document = Loader.loadPDF(pdfFile.getBytes());
PDDocument document = pdfDocumentFactory.load(pdfFile.getBytes());
// Split the page order string into an array of page numbers or range of numbers
String[] pageOrderArr = pageOrder != null ? pageOrder.split(",") : new String[0];

View File

@ -5,7 +5,6 @@ import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@ -52,7 +51,7 @@ public class ScalePagesController {
String targetPDRectangle = request.getPageSize();
float scaleFactor = request.getScaleFactor();
PDDocument sourceDocument = Loader.loadPDF(file.getBytes());
PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes());
PDDocument outputDocument =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);

View File

@ -10,7 +10,6 @@ import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.beans.factory.annotation.Autowired;
@ -63,7 +62,7 @@ public class SplitPDFController {
String pages = request.getPageNumbers();
// open the pdf document
document = Loader.loadPDF(file.getBytes());
document = pdfDocumentFactory.load(file.getBytes());
// PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document);
int totalPages = document.getNumberOfPages();
List<Integer> pageNumbers = request.getPageNumbersList(document, false);

View File

@ -8,7 +8,6 @@ import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
@ -34,6 +33,7 @@ import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.PdfMetadata;
import stirling.software.SPDF.model.api.SplitPdfByChaptersRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.service.PdfMetadataService;
import stirling.software.SPDF.utils.WebResponseUtils;
@ -45,9 +45,13 @@ public class SplitPdfByChaptersController {
private final PdfMetadataService pdfMetadataService;
private final CustomPDDocumentFactory pdfDocumentFactory;
@Autowired
public SplitPdfByChaptersController(PdfMetadataService pdfMetadataService) {
public SplitPdfByChaptersController(
PdfMetadataService pdfMetadataService, CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfMetadataService = pdfMetadataService;
this.pdfDocumentFactory = pdfDocumentFactory;
}
private static List<Bookmark> extractOutlineItems(
@ -135,7 +139,7 @@ public class SplitPdfByChaptersController {
if (bookmarkLevel < 0) {
return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes());
}
sourceDocument = Loader.loadPDF(file.getBytes());
sourceDocument = pdfDocumentFactory.load(file.getBytes());
PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline();

View File

@ -9,7 +9,6 @@ import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@ -57,7 +56,7 @@ public class SplitPdfBySectionsController {
List<ByteArrayOutputStream> splitDocumentsBoas = new ArrayList<>();
MultipartFile file = request.getFileInput();
PDDocument sourceDocument = Loader.loadPDF(file.getBytes());
PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes());
// Process the PDF based on split parameters
int horiz = request.getHorizontalDivisions() + 1;

View File

@ -7,7 +7,6 @@ import java.nio.file.Path;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.beans.factory.annotation.Autowired;
@ -41,6 +40,9 @@ public class SplitPdfBySizeController {
@Autowired
public SplitPdfBySizeController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
log.info(
"SplitPdfBySizeController initialized with pdfDocumentFactory: {}",
pdfDocumentFactory);
}
@PostMapping(value = "/split-by-size-or-count", consumes = "multipart/form-data")
@ -52,38 +54,92 @@ public class SplitPdfBySizeController {
public ResponseEntity<byte[]> autoSplitPdf(@ModelAttribute SplitPdfBySizeOrCountRequest request)
throws Exception {
log.info("Starting PDF split process with request: {}", request);
MultipartFile file = request.getFileInput();
log.info(
"File received: name={}, size={} bytes",
file.getOriginalFilename(),
file.getSize());
Path zipFile = Files.createTempFile("split_documents", ".zip");
log.info("Created temporary zip file: {}", zipFile);
String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
log.info("Base filename for output: {}", filename);
byte[] data = null;
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile));
PDDocument sourceDocument = Loader.loadPDF(file.getBytes())) {
try {
log.info("Reading input file bytes");
byte[] pdfBytes = file.getBytes();
log.info("Successfully read {} bytes from input file", pdfBytes.length);
int type = request.getSplitType();
String value = request.getSplitValue();
log.info("Creating ZIP output stream");
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) {
log.info("Loading PDF document");
try (PDDocument sourceDocument = pdfDocumentFactory.load(pdfBytes)) {
log.info(
"Successfully loaded PDF with {} pages",
sourceDocument.getNumberOfPages());
if (type == 0) {
long maxBytes = GeneralUtils.convertSizeToBytes(value);
handleSplitBySize(sourceDocument, maxBytes, zipOut, filename);
} else if (type == 1) {
int pageCount = Integer.parseInt(value);
handleSplitByPageCount(sourceDocument, pageCount, zipOut, filename);
} else if (type == 2) {
int documentCount = Integer.parseInt(value);
handleSplitByDocCount(sourceDocument, documentCount, zipOut, filename);
} else {
throw new IllegalArgumentException("Invalid argument for split type");
int type = request.getSplitType();
String value = request.getSplitValue();
log.info("Split type: {}, Split value: {}", type, value);
if (type == 0) {
log.info("Processing split by size");
long maxBytes = GeneralUtils.convertSizeToBytes(value);
log.info("Max bytes per document: {}", maxBytes);
handleSplitBySize(sourceDocument, maxBytes, zipOut, filename);
} else if (type == 1) {
log.info("Processing split by page count");
int pageCount = Integer.parseInt(value);
log.info("Pages per document: {}", pageCount);
handleSplitByPageCount(sourceDocument, pageCount, zipOut, filename);
} else if (type == 2) {
log.info("Processing split by document count");
int documentCount = Integer.parseInt(value);
log.info("Total number of documents: {}", documentCount);
handleSplitByDocCount(sourceDocument, documentCount, zipOut, filename);
} else {
log.error("Invalid split type: {}", type);
throw new IllegalArgumentException(
"Invalid argument for split type: " + type);
}
log.info("PDF splitting completed successfully");
} catch (Exception e) {
log.error("Error loading or processing PDF document", e);
throw e;
}
} catch (IOException e) {
log.error("Error creating or writing to ZIP file", e);
throw e;
}
} catch (Exception e) {
log.error("exception", e);
log.error("Exception during PDF splitting process", e);
throw e; // Re-throw to ensure proper error response
} finally {
data = Files.readAllBytes(zipFile);
Files.deleteIfExists(zipFile);
try {
log.info("Reading ZIP file data");
data = Files.readAllBytes(zipFile);
log.info("Successfully read {} bytes from ZIP file", data.length);
} catch (IOException e) {
log.error("Error reading ZIP file data", e);
}
try {
log.info("Deleting temporary ZIP file");
boolean deleted = Files.deleteIfExists(zipFile);
log.info("Temporary ZIP file deleted: {}", deleted);
} catch (IOException e) {
log.error("Error deleting temporary ZIP file", e);
}
}
log.info("Returning response with {} bytes of data", data != null ? data.length : 0);
return WebResponseUtils.bytesToWebResponse(
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
}
@ -91,63 +147,230 @@ public class SplitPdfBySizeController {
private void handleSplitBySize(
PDDocument sourceDocument, long maxBytes, ZipOutputStream zipOut, String baseFilename)
throws IOException {
long currentSize = 0;
log.info("Starting handleSplitBySize with maxBytes={}", maxBytes);
PDDocument currentDoc =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
int fileIndex = 1;
int totalPages = sourceDocument.getNumberOfPages();
int pageAdded = 0;
for (int pageIndex = 0; pageIndex < sourceDocument.getNumberOfPages(); pageIndex++) {
// Smart size check frequency - check more often with larger documents
int baseCheckFrequency = 5;
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
PDPage page = sourceDocument.getPage(pageIndex);
ByteArrayOutputStream pageOutputStream = new ByteArrayOutputStream();
log.info("Processing page {} of {}", pageIndex + 1, totalPages);
try (PDDocument tempDoc = new PDDocument()) {
PDPage importedPage = tempDoc.importPage(page); // This creates a new PDPage object
tempDoc.save(pageOutputStream);
}
// Add the page to current document
PDPage newPage = new PDPage(page.getCOSObject());
currentDoc.addPage(newPage);
pageAdded++;
long pageSize = pageOutputStream.size();
if (currentSize + pageSize > maxBytes) {
if (currentDoc.getNumberOfPages() > 0) {
// Dynamic size checking based on document size and page count
boolean shouldCheckSize =
(pageAdded % baseCheckFrequency == 0)
|| (pageIndex == totalPages - 1)
|| (pageAdded >= 20); // Always check after 20 pages
if (shouldCheckSize) {
log.info("Performing size check after {} pages", pageAdded);
ByteArrayOutputStream checkSizeStream = new ByteArrayOutputStream();
currentDoc.save(checkSizeStream);
long actualSize = checkSizeStream.size();
log.info("Current document size: {} bytes (max: {} bytes)", actualSize, maxBytes);
if (actualSize > maxBytes) {
// We exceeded the limit - remove the last page and save
if (currentDoc.getNumberOfPages() > 1) {
currentDoc.removePage(currentDoc.getNumberOfPages() - 1);
pageIndex--; // Process this page again in the next document
log.info("Size limit exceeded - removed last page");
}
log.info(
"Saving document with {} pages as part {}",
currentDoc.getNumberOfPages(),
fileIndex);
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
currentDoc.close(); // Make sure to close the document
currentDoc = new PDDocument();
currentSize = 0;
pageAdded = 0;
} else if (pageIndex < totalPages - 1) {
// We're under the limit, calculate if we might fit more pages
// Try to predict how many more similar pages might fit
if (actualSize < maxBytes * 0.75 && pageAdded > 0) {
// Rather than using a ratio, look ahead to test actual upcoming pages
int pagesToLookAhead = Math.min(5, totalPages - pageIndex - 1);
if (pagesToLookAhead > 0) {
log.info(
"Testing {} upcoming pages for potential addition",
pagesToLookAhead);
// Create a temp document with current pages + look-ahead pages
PDDocument testDoc = new PDDocument();
// First copy existing pages
for (int i = 0; i < currentDoc.getNumberOfPages(); i++) {
testDoc.addPage(new PDPage(currentDoc.getPage(i).getCOSObject()));
}
// Try adding look-ahead pages one by one
int extraPagesAdded = 0;
for (int i = 0; i < pagesToLookAhead; i++) {
int testPageIndex = pageIndex + 1 + i;
PDPage testPage = sourceDocument.getPage(testPageIndex);
testDoc.addPage(new PDPage(testPage.getCOSObject()));
// Check if we're still under size
ByteArrayOutputStream testStream = new ByteArrayOutputStream();
testDoc.save(testStream);
long testSize = testStream.size();
if (testSize <= maxBytes) {
extraPagesAdded++;
log.info(
"Test: Can add page {} (size would be {})",
testPageIndex + 1,
testSize);
} else {
log.info(
"Test: Cannot add page {} (size would be {})",
testPageIndex + 1,
testSize);
break;
}
}
testDoc.close();
// Add the pages we verified would fit
if (extraPagesAdded > 0) {
log.info("Adding {} verified pages ahead", extraPagesAdded);
for (int i = 0; i < extraPagesAdded; i++) {
int extraPageIndex = pageIndex + 1 + i;
PDPage extraPage = sourceDocument.getPage(extraPageIndex);
currentDoc.addPage(new PDPage(extraPage.getCOSObject()));
}
pageIndex += extraPagesAdded;
pageAdded += extraPagesAdded;
}
}
}
}
}
PDPage newPage = new PDPage(page.getCOSObject()); // Re-create the page
currentDoc.addPage(newPage);
currentSize += pageSize;
}
if (currentDoc.getNumberOfPages() != 0) {
// Save final document if it has any pages
if (currentDoc.getNumberOfPages() > 0) {
log.info(
"Saving final document with {} pages as part {}",
currentDoc.getNumberOfPages(),
fileIndex);
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
currentDoc.close();
}
log.info("Completed handleSplitBySize with {} document parts created", fileIndex - 1);
}
private void handleSplitByPageCount(
PDDocument sourceDocument, int pageCount, ZipOutputStream zipOut, String baseFilename)
throws IOException {
log.info("Starting handleSplitByPageCount with pageCount={}", pageCount);
int currentPageCount = 0;
PDDocument currentDoc =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
int fileIndex = 1;
for (PDPage page : sourceDocument.getPages()) {
currentDoc.addPage(page);
currentPageCount++;
log.info("Creating initial output document");
PDDocument currentDoc = null;
try {
currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
log.info("Successfully created initial output document");
} catch (Exception e) {
log.error("Error creating initial output document", e);
throw new IOException("Failed to create initial output document", e);
}
if (currentPageCount == pageCount) {
// Save and reset current document
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
currentDoc = new PDDocument();
currentPageCount = 0;
int fileIndex = 1;
int pageIndex = 0;
int totalPages = sourceDocument.getNumberOfPages();
log.info("Processing {} pages", totalPages);
try {
for (PDPage page : sourceDocument.getPages()) {
pageIndex++;
log.info("Processing page {} of {}", pageIndex, totalPages);
try {
log.info("Adding page {} to current document", pageIndex);
currentDoc.addPage(page);
log.info("Successfully added page {} to current document", pageIndex);
} catch (Exception e) {
log.error("Error adding page {} to current document", pageIndex, e);
throw new IOException("Failed to add page to document", e);
}
currentPageCount++;
log.info("Current page count: {}/{}", currentPageCount, pageCount);
if (currentPageCount == pageCount) {
log.info(
"Reached target page count ({}), saving current document as part {}",
pageCount,
fileIndex);
try {
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
log.info("Successfully saved document part {}", fileIndex - 1);
} catch (Exception e) {
log.error("Error saving document part {}", fileIndex - 1, e);
throw e;
}
try {
log.info("Creating new document for next part");
currentDoc = new PDDocument();
log.info("Successfully created new document");
} catch (Exception e) {
log.error("Error creating new document for next part", e);
throw new IOException("Failed to create new document", e);
}
currentPageCount = 0;
log.info("Reset current page count to 0");
}
}
} catch (Exception e) {
log.error("Error iterating through pages", e);
throw new IOException("Failed to iterate through pages", e);
}
// Add the last document if it contains any pages
try {
if (currentDoc.getPages().getCount() != 0) {
log.info(
"Saving final document with {} pages as part {}",
currentDoc.getPages().getCount(),
fileIndex);
try {
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
log.info("Successfully saved final document part {}", fileIndex - 1);
} catch (Exception e) {
log.error("Error saving final document part {}", fileIndex - 1, e);
throw e;
}
} else {
log.info("Final document has no pages, skipping");
}
} catch (Exception e) {
log.error("Error checking or saving final document", e);
throw new IOException("Failed to process final document", e);
} finally {
try {
log.info("Closing final document");
currentDoc.close();
log.info("Successfully closed final document");
} catch (Exception e) {
log.error("Error closing final document", e);
}
}
// Add the last document if it contains any pages
if (currentDoc.getPages().getCount() != 0) {
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
}
log.info("Completed handleSplitByPageCount with {} document parts created", fileIndex - 1);
}
private void handleSplitByDocCount(
@ -156,35 +379,101 @@ public class SplitPdfBySizeController {
ZipOutputStream zipOut,
String baseFilename)
throws IOException {
log.info("Starting handleSplitByDocCount with documentCount={}", documentCount);
int totalPageCount = sourceDocument.getNumberOfPages();
log.info("Total pages in source document: {}", totalPageCount);
int pagesPerDocument = totalPageCount / documentCount;
int extraPages = totalPageCount % documentCount;
log.info("Pages per document: {}, Extra pages: {}", pagesPerDocument, extraPages);
int currentPageIndex = 0;
int fileIndex = 1;
for (int i = 0; i < documentCount; i++) {
PDDocument currentDoc =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
int pagesToAdd = pagesPerDocument + (i < extraPages ? 1 : 0);
for (int j = 0; j < pagesToAdd; j++) {
currentDoc.addPage(sourceDocument.getPage(currentPageIndex++));
for (int i = 0; i < documentCount; i++) {
log.info("Creating document {} of {}", i + 1, documentCount);
PDDocument currentDoc = null;
try {
currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
log.info("Successfully created document {} of {}", i + 1, documentCount);
} catch (Exception e) {
log.error("Error creating document {} of {}", i + 1, documentCount, e);
throw new IOException("Failed to create document", e);
}
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
int pagesToAdd = pagesPerDocument + (i < extraPages ? 1 : 0);
log.info("Adding {} pages to document {}", pagesToAdd, i + 1);
for (int j = 0; j < pagesToAdd; j++) {
try {
log.info(
"Adding page {} (index {}) to document {}",
j + 1,
currentPageIndex,
i + 1);
currentDoc.addPage(sourceDocument.getPage(currentPageIndex));
log.info("Successfully added page {} to document {}", j + 1, i + 1);
currentPageIndex++;
} catch (Exception e) {
log.error("Error adding page {} to document {}", j + 1, i + 1, e);
throw new IOException("Failed to add page to document", e);
}
}
try {
log.info("Saving document {} with {} pages", i + 1, pagesToAdd);
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
log.info("Successfully saved document {}", i + 1);
} catch (Exception e) {
log.error("Error saving document {}", i + 1, e);
throw e;
}
}
log.info("Completed handleSplitByDocCount with {} documents created", documentCount);
}
private void saveDocumentToZip(
PDDocument document, ZipOutputStream zipOut, String baseFilename, int index)
throws IOException {
log.info("Starting saveDocumentToZip for document part {}", index);
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
document.save(outStream);
document.close(); // Close the document to free resources
// Create a new zip entry
ZipEntry zipEntry = new ZipEntry(baseFilename + "_" + index + ".pdf");
zipOut.putNextEntry(zipEntry);
zipOut.write(outStream.toByteArray());
zipOut.closeEntry();
try {
log.info("Saving document part {} to byte array", index);
document.save(outStream);
log.info("Successfully saved document part {} ({} bytes)", index, outStream.size());
} catch (Exception e) {
log.error("Error saving document part {} to byte array", index, e);
throw new IOException("Failed to save document to byte array", e);
}
try {
log.info("Closing document part {}", index);
document.close();
log.info("Successfully closed document part {}", index);
} catch (Exception e) {
log.error("Error closing document part {}", index, e);
// Continue despite close error
}
try {
// Create a new zip entry
String entryName = baseFilename + "_" + index + ".pdf";
log.info("Creating ZIP entry: {}", entryName);
ZipEntry zipEntry = new ZipEntry(entryName);
zipOut.putNextEntry(zipEntry);
byte[] bytes = outStream.toByteArray();
log.info("Writing {} bytes to ZIP entry", bytes.length);
zipOut.write(bytes);
log.info("Closing ZIP entry");
zipOut.closeEntry();
log.info("Successfully added document part {} to ZIP", index);
} catch (Exception e) {
log.error("Error adding document part {} to ZIP", index, e);
throw new IOException("Failed to add document to ZIP file", e);
}
}
}

View File

@ -4,7 +4,6 @@ import java.awt.geom.AffineTransform;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@ -46,7 +45,7 @@ public class ToSinglePageController {
throws IOException {
// Load the source document
PDDocument sourceDocument = Loader.loadPDF(request.getFileInput().getBytes());
PDDocument sourceDocument = pdfDocumentFactory.load(request.getFileInput().getBytes());
// Calculate total height and max width
float totalHeight = 0;

View File

@ -13,7 +13,6 @@ import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.rendering.ImageType;
@ -91,6 +90,7 @@ public class ConvertImgPDFController {
result =
PdfUtils.convertFromPdf(
pdfDocumentFactory,
newPdfBytes,
"webp".equalsIgnoreCase(imageFormat)
? "png"
@ -245,7 +245,7 @@ public class ConvertImgPDFController {
*/
private byte[] rearrangePdfPages(byte[] pdfBytes, String[] pageOrderArr) throws IOException {
// Load the input PDF
PDDocument document = Loader.loadPDF(pdfBytes);
PDDocument document = pdfDocumentFactory.load(pdfBytes);
int totalPages = document.getNumberOfPages();
List<Integer> newPageOrder = GeneralUtils.parsePageList(pageOrderArr, totalPages, false);

View File

@ -2,9 +2,9 @@ package stirling.software.SPDF.controller.api.converters;
import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
@ -21,6 +21,7 @@ import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.model.api.converters.PdfToPresentationRequest;
import stirling.software.SPDF.model.api.converters.PdfToTextOrRTFRequest;
import stirling.software.SPDF.model.api.converters.PdfToWordRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.PDFToFile;
import stirling.software.SPDF.utils.WebResponseUtils;
@ -29,6 +30,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Convert", description = "Convert APIs")
public class ConvertPDFToOffice {
private final CustomPDDocumentFactory pdfDocumentFactory;
/**
 * Creates the controller with the document factory its endpoints use to load PDF documents.
 *
 * @param pdfDocumentFactory factory stored in {@code this.pdfDocumentFactory}
 */
@Autowired
public ConvertPDFToOffice(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation")
@Operation(
summary = "Convert PDF to Presentation format",
@ -54,7 +62,7 @@ public class ConvertPDFToOffice {
MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat();
if ("txt".equals(request.getOutputFormat())) {
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) {
PDFTextStripper stripper = new PDFTextStripper();
String text = stripper.getText(document);
return WebResponseUtils.bytesToWebResponse(

View File

@ -12,8 +12,8 @@ import java.util.zip.ZipOutputStream;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.QuoteMode;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ContentDisposition;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
@ -30,6 +30,7 @@ import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PDFWithPageNums;
import stirling.software.SPDF.pdf.FlexibleCSVWriter;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import technology.tabula.ObjectExtractor;
import technology.tabula.Page;
@ -42,6 +43,13 @@ import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
@Slf4j
public class ExtractCSVController {
private final CustomPDDocumentFactory pdfDocumentFactory;
/**
 * Creates the controller with the document factory its endpoints use to load PDF documents.
 *
 * @param pdfDocumentFactory factory stored in {@code this.pdfDocumentFactory}
 */
@Autowired
public ExtractCSVController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
@Operation(
summary = "Extracts a CSV document from a PDF",
@ -51,7 +59,7 @@ public class ExtractCSVController {
String baseName = getBaseName(form.getFileInput().getOriginalFilename());
List<CsvEntry> csvEntries = new ArrayList<>();
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(form.getFileInput().getBytes())) {
List<Integer> pages = form.getPageNumbersList(document, true);
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
CSVFormat format =

View File

@ -2,10 +2,10 @@ package stirling.software.SPDF.controller.api.filters;
import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
@ -23,6 +23,7 @@ import stirling.software.SPDF.model.api.filter.ContainsTextRequest;
import stirling.software.SPDF.model.api.filter.FileSizeRequest;
import stirling.software.SPDF.model.api.filter.PageRotationRequest;
import stirling.software.SPDF.model.api.filter.PageSizeRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.WebResponseUtils;
@ -31,6 +32,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Filter", description = "Filter APIs")
public class FilterController {
private final CustomPDDocumentFactory pdfDocumentFactory;
/**
 * Creates the controller with the document factory its filter endpoints use to load PDF
 * documents.
 *
 * @param pdfDocumentFactory factory stored in {@code this.pdfDocumentFactory}
 */
@Autowired
public FilterController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/filter-contains-text")
@Operation(
summary = "Checks if a PDF contains set text, returns true if does",
@ -41,7 +49,7 @@ public class FilterController {
String text = request.getText();
String pageNumber = request.getPageNumbers();
PDDocument pdfDocument = Loader.loadPDF(inputFile.getBytes());
PDDocument pdfDocument = pdfDocumentFactory.load(inputFile.getBytes());
if (PdfUtils.hasText(pdfDocument, pageNumber, text))
return WebResponseUtils.pdfDocToWebResponse(
pdfDocument, Filenames.toSimpleFileName(inputFile.getOriginalFilename()));
@ -58,7 +66,7 @@ public class FilterController {
MultipartFile inputFile = request.getFileInput();
String pageNumber = request.getPageNumbers();
PDDocument pdfDocument = Loader.loadPDF(inputFile.getBytes());
PDDocument pdfDocument = pdfDocumentFactory.load(inputFile.getBytes());
if (PdfUtils.hasImages(pdfDocument, pageNumber))
return WebResponseUtils.pdfDocToWebResponse(
pdfDocument, Filenames.toSimpleFileName(inputFile.getOriginalFilename()));
@ -75,7 +83,7 @@ public class FilterController {
String pageCount = request.getPageCount();
String comparator = request.getComparator();
// Load the PDF
PDDocument document = Loader.loadPDF(inputFile.getBytes());
PDDocument document = pdfDocumentFactory.load(inputFile.getBytes());
int actualPageCount = document.getNumberOfPages();
boolean valid = false;
@ -109,7 +117,7 @@ public class FilterController {
String comparator = request.getComparator();
// Load the PDF
PDDocument document = Loader.loadPDF(inputFile.getBytes());
PDDocument document = pdfDocumentFactory.load(inputFile.getBytes());
PDPage firstPage = document.getPage(0);
PDRectangle actualPageSize = firstPage.getMediaBox();
@ -185,7 +193,7 @@ public class FilterController {
String comparator = request.getComparator();
// Load the PDF
PDDocument document = Loader.loadPDF(inputFile.getBytes());
PDDocument document = pdfDocumentFactory.load(inputFile.getBytes());
// Get the rotation of the first page
PDPage firstPage = document.getPage(0);

View File

@ -5,10 +5,10 @@ import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
@ -23,6 +23,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.misc.ExtractHeaderRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@ -34,6 +35,13 @@ public class AutoRenameController {
private static final float TITLE_FONT_SIZE_THRESHOLD = 20.0f;
private static final int LINE_LIMIT = 200;
private final CustomPDDocumentFactory pdfDocumentFactory;
/**
 * Creates the controller with the document factory its endpoints use to load PDF documents.
 *
 * @param pdfDocumentFactory factory stored in {@code this.pdfDocumentFactory}
 */
@Autowired
public AutoRenameController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/auto-rename")
@Operation(
summary = "Extract header from PDF file",
@ -44,7 +52,7 @@ public class AutoRenameController {
MultipartFile file = request.getFileInput();
Boolean useFirstTextAsFallback = request.isUseFirstTextAsFallback();
PDDocument document = Loader.loadPDF(file.getBytes());
PDDocument document = pdfDocumentFactory.load(file.getBytes());
PDFTextStripper reader =
new PDFTextStripper() {
List<LineInfo> lineInfos = new ArrayList<>();

View File

@ -111,9 +111,9 @@ public class AutoSplitPdfController {
summary = "Auto split PDF pages into separate documents",
description =
"This endpoint accepts a PDF file, scans each page for a specific QR code, and"
+ " splits the document at the QR code boundaries. The output is a zip file"
+ " containing each separate PDF document. Input:PDF Output:ZIP-PDF"
+ " Type:SISO")
+ " splits the document at the QR code boundaries. The output is a zip file"
+ " containing each separate PDF document. Input:PDF Output:ZIP-PDF"
+ " Type:SISO")
public ResponseEntity<byte[]> autoSplitPdf(@ModelAttribute AutoSplitPdfRequest request)
throws IOException {
MultipartFile file = request.getFileInput();

View File

@ -8,7 +8,6 @@ import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
@ -85,7 +84,7 @@ public class BlankPageController {
int threshold = request.getThreshold();
float whitePercent = request.getWhitePercent();
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) {
PDPageTree pages = document.getDocumentCatalog().getPages();
PDFTextStripper textStripper = new PDFTextStripper();

View File

@ -18,7 +18,6 @@ import javax.imageio.ImageWriter;
import javax.imageio.plugins.jpeg.JPEGImageWriteParam;
import javax.imageio.stream.ImageOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@ -59,7 +58,8 @@ public class CompressController {
this.pdfDocumentFactory = pdfDocumentFactory;
}
private void compressImagesInPDF(Path pdfFile, double scaleFactor, float jpegQuality) throws Exception {
private void compressImagesInPDF(Path pdfFile, double scaleFactor, float jpegQuality)
throws Exception {
byte[] fileBytes = Files.readAllBytes(pdfFile);
long originalFileSize = fileBytes.length;
log.info(
@ -71,7 +71,7 @@ public class CompressController {
// Track processed images to avoid recompression
Set<String> processedImages = new HashSet<>();
try (PDDocument doc = Loader.loadPDF(fileBytes)) {
try (PDDocument doc = pdfDocumentFactory.load(fileBytes)) {
int totalImages = 0;
int compressedImages = 0;
int skippedImages = 0;
@ -204,10 +204,12 @@ public class CompressController {
// Choose appropriate format and compression
String format = bufferedImage.getColorModel().hasAlpha() ? "png" : "jpeg";
// First get the actual size of the original image by encoding it to the chosen format
// First get the actual size of the original image by encoding it to the chosen
// format
ByteArrayOutputStream originalImageStream = new ByteArrayOutputStream();
if (format.equals("jpeg")) {
// Get the best available JPEG writer (prioritizes TwelveMonkeys if available)
// Get the best available JPEG writer (prioritizes TwelveMonkeys if
// available)
Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("jpeg");
ImageWriter writer = null;
@ -430,8 +432,8 @@ public class CompressController {
// All levels (1-9): Apply QPDF compression
if (!qpdfCompressionApplied) {
long preQpdfSize = Files.size(tempInputFile);
log.info("Pre-QPDF file size: {}", GeneralUtils.formatBytes(preQpdfSize));
long preQpdfSize = Files.size(tempInputFile);
log.info("Pre-QPDF file size: {}", GeneralUtils.formatBytes(preQpdfSize));
// For levels 1-3, map to qpdf compression levels 1-9
int qpdfCompressionLevel = optimizeLevel;
@ -472,8 +474,7 @@ public class CompressController {
double qpdfReduction = 100.0 - ((postQpdfSize * 100.0) / preQpdfSize);
log.info(
"Post-QPDF file size: {} (reduced by {:.1f}%)",
GeneralUtils.formatBytes(postQpdfSize),
qpdfReduction);
GeneralUtils.formatBytes(postQpdfSize), qpdfReduction);
} else {
tempOutputFile = tempInputFile;

View File

@ -0,0 +1,145 @@
package stirling.software.SPDF.controller.api.misc;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

import org.apache.pdfbox.cos.*;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;

import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;

import lombok.extern.slf4j.Slf4j;

import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.WebResponseUtils;
/**
 * REST controller exposing an endpoint that rewrites an uploaded PDF with its streams stored
 * without filters, which makes the file contents inspectable (e.g. for testing).
 */
@RestController
@RequestMapping("/api/v1/misc")
@Slf4j
@Tag(name = "Misc", description = "Miscellaneous APIs")
public class DecompressPdfController {

    /** Name used for the response when the upload carries no original filename. */
    private static final String DEFAULT_OUTPUT_FILENAME = "decompressed.pdf";

    private final CustomPDDocumentFactory pdfDocumentFactory;

    /**
     * Creates the controller with the document factory used to load the uploaded PDF.
     *
     * @param pdfDocumentFactory factory stored in {@code this.pdfDocumentFactory}
     */
    @Autowired
    public DecompressPdfController(CustomPDDocumentFactory pdfDocumentFactory) {
        this.pdfDocumentFactory = pdfDocumentFactory;
    }

    /**
     * Loads the uploaded PDF, strips the filters from every reachable stream and saves the
     * document with {@link CompressParameters#NO_COMPRESSION}.
     *
     * @param request form data carrying the PDF to decompress
     * @return the rewritten PDF as a web response
     * @throws IOException if the upload cannot be read, parsed or saved
     */
    @PostMapping(value = "/decompress-pdf", consumes = "multipart/form-data")
    @Operation(
            summary = "Decompress PDF streams",
            description = "Fully decompresses all PDF streams including text content")
    public ResponseEntity<byte[]> decompressPdf(@ModelAttribute PDFFile request)
            throws IOException {

        MultipartFile file = request.getFileInput();

        try (PDDocument document = pdfDocumentFactory.load(file.getBytes())) {
            // Process all objects in document
            processAllObjects(document);

            // Save with explicit no compression
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            document.save(baos, CompressParameters.NO_COMPRESSION);

            String outputFilename = buildOutputFilename(file.getOriginalFilename());
            return WebResponseUtils.bytesToWebResponse(
                    baos.toByteArray(), outputFilename, MediaType.APPLICATION_PDF);
        }
    }

    /**
     * Derives the download name by inserting {@code _decompressed} before the last extension.
     *
     * <p>The original filename of a multipart upload may be absent; in that case a fixed default
     * is returned instead of failing with a {@link NullPointerException}. A name without an
     * extension is returned unchanged.
     *
     * @param originalFilename filename reported by the upload, possibly {@code null}
     * @return the filename to use for the response
     */
    private static String buildOutputFilename(String originalFilename) {
        if (originalFilename == null || originalFilename.isBlank()) {
            return DEFAULT_OUTPUT_FILENAME;
        }
        return originalFilename.replaceFirst("\\.(?=[^.]+$)", "_decompressed.");
    }

    /**
     * Walks every object listed in the document's cross-reference table and decompresses the
     * streams reachable from it.
     *
     * @param document the loaded document to modify in place
     */
    private void processAllObjects(PDDocument document) {
        Set<COSBase> processed = new HashSet<>();
        COSDocument cosDoc = document.getDocument();

        // Process all objects in the document
        for (COSObjectKey key : cosDoc.getXrefTable().keySet()) {
            COSObject obj = cosDoc.getObjectFromPool(key);
            processObject(obj, processed);
        }
    }

    /**
     * Visits {@code obj} and everything reachable from it, decompressing each stream once.
     *
     * <p>The traversal uses an explicit stack rather than recursion so that long reference
     * chains inside an uploaded document cannot exhaust the thread stack.
     *
     * @param obj root of the traversal, may be {@code null}
     * @param processed objects already visited; shared across calls to break reference cycles
     */
    private void processObject(COSBase obj, Set<COSBase> processed) {
        Deque<COSBase> pending = new ArrayDeque<>();
        enqueue(pending, obj);

        while (!pending.isEmpty()) {
            COSBase current = pending.pop();

            // Skip already processed objects to avoid looping over cyclic references
            if (!processed.add(current)) {
                continue;
            }

            if (current instanceof COSObject cosObj) {
                enqueue(pending, cosObj.getObject());
            } else if (current instanceof COSDictionary dict) {
                processDictionary(dict, pending);
            } else if (current instanceof COSArray array) {
                processArray(array, pending);
            }
        }
    }

    /**
     * Schedules all values of {@code dict} for a visit and, if it is a stream, decompresses it.
     *
     * @param dict dictionary (or stream) being visited
     * @param pending work stack receiving the dictionary's values
     */
    private void processDictionary(COSDictionary dict, Deque<COSBase> pending) {
        // Collect the entries before touching the stream: decompression removes filter-related
        // keys, and their values must still be visited.
        for (COSName key : dict.keySet()) {
            enqueue(pending, dict.getDictionaryObject(key));
        }

        // If this is a stream, decompress it
        if (dict instanceof COSStream stream) {
            decompressStream(stream);
        }
    }

    /**
     * Schedules all elements of {@code array} for a visit.
     *
     * @param array array being visited
     * @param pending work stack receiving the array's elements
     */
    private void processArray(COSArray array, Deque<COSBase> pending) {
        for (int i = 0; i < array.size(); i++) {
            enqueue(pending, array.get(i));
        }
    }

    /** Pushes {@code obj} onto the work stack unless it is {@code null}. */
    private static void enqueue(Deque<COSBase> pending, COSBase obj) {
        // ArrayDeque rejects null elements, and null objects need no processing anyway
        if (obj != null) {
            pending.push(obj);
        }
    }

    /**
     * Replaces the stream's encoded content with its decoded bytes and removes the
     * {@code Filter}, {@code DecodeParms} and {@code D} entries.
     *
     * <p>Streams carrying none of those entries are left untouched. An {@link IOException} while
     * handling one stream is logged and does not stop processing of other streams.
     *
     * @param stream the stream to rewrite in place
     */
    private void decompressStream(COSStream stream) {
        try {
            log.debug("Processing stream: {}", stream);

            // Only remove filter information if it exists
            if (stream.containsKey(COSName.FILTER)
                    || stream.containsKey(COSName.DECODE_PARMS)
                    || stream.containsKey(COSName.D)) {

                // Read the decompressed content first, while the filter entries are still present
                byte[] decompressedBytes;
                try (COSInputStream is = stream.createInputStream()) {
                    decompressedBytes = IOUtils.toByteArray(is);
                }

                // Now remove filter information
                stream.removeItem(COSName.FILTER);
                stream.removeItem(COSName.DECODE_PARMS);
                stream.removeItem(COSName.D);

                // Write the raw content back
                try (OutputStream out = stream.createRawOutputStream()) {
                    out.write(decompressedBytes);
                }

                // Set the Length to reflect the new stream size
                stream.setInt(COSName.LENGTH, decompressedBytes.length);
            }
        } catch (IOException e) {
            log.error("Error decompressing stream", e);
            // Continue processing other streams even if this one fails
        }
    }
}

View File

@ -14,9 +14,9 @@ import java.util.zip.ZipOutputStream;
import javax.imageio.ImageIO;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
@ -32,6 +32,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.misc.ExtractImageScansRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.CheckProgramInstall;
import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
@ -45,6 +46,13 @@ public class ExtractImageScansController {
private static final String REPLACEFIRST = "[.][^.]+$";
private final CustomPDDocumentFactory pdfDocumentFactory;
/**
 * Creates the controller with the document factory its endpoints use to load PDF documents.
 *
 * @param pdfDocumentFactory factory stored in {@code this.pdfDocumentFactory}
 */
@Autowired
public ExtractImageScansController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/extract-image-scans")
@Operation(
summary = "Extract image scans from an input file",
@ -87,7 +95,8 @@ public class ExtractImageScansController {
// Check if input file is a PDF
if ("pdf".equalsIgnoreCase(extension)) {
// Load PDF document
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
try (PDDocument document =
pdfDocumentFactory.load(form.getFileInput().getBytes())) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
pdfRenderer.setSubsamplingAllowed(true);
int pageCount = document.getNumberOfPages();

View File

@ -20,11 +20,11 @@ import java.util.zip.ZipOutputStream;
import javax.imageio.ImageIO;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
@ -40,6 +40,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PDFExtractImagesRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.ImageProcessingUtils;
import stirling.software.SPDF.utils.WebResponseUtils;
@ -49,6 +50,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Misc", description = "Miscellaneous APIs")
public class ExtractImagesController {
private final CustomPDDocumentFactory pdfDocumentFactory;
/**
 * Creates the controller with the document factory its endpoints use to load PDF documents.
 *
 * @param pdfDocumentFactory factory stored in {@code this.pdfDocumentFactory}
 */
@Autowired
public ExtractImagesController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/extract-images")
@Operation(
summary = "Extract images from a PDF file",
@ -59,7 +67,7 @@ public class ExtractImagesController {
MultipartFile file = request.getFileInput();
String format = request.getFormat();
boolean allowDuplicates = request.isAllowDuplicates();
PDDocument document = Loader.loadPDF(file.getBytes());
PDDocument document = pdfDocumentFactory.load(file.getBytes());
// Determine if multithreading should be used based on PDF size or number of pages
boolean useMultithreading = shouldUseMultithreading(file, document);

View File

@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api.misc;
import java.awt.image.BufferedImage;
import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
@ -51,7 +50,7 @@ public class FlattenController {
public ResponseEntity<byte[]> flatten(@ModelAttribute FlattenRequest request) throws Exception {
MultipartFile file = request.getFileInput();
PDDocument document = Loader.loadPDF(file.getBytes());
PDDocument document = pdfDocumentFactory.load(file.getBytes());
Boolean flattenOnlyForms = request.getFlattenOnlyForms();
if (Boolean.TRUE.equals(flattenOnlyForms)) {

View File

@ -7,10 +7,10 @@ import java.util.Calendar;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.WebDataBinder;
import org.springframework.web.bind.annotation.*;
@ -23,6 +23,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.misc.MetadataRequest;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.WebResponseUtils;
import stirling.software.SPDF.utils.propertyeditor.StringToMapPropertyEditor;
@ -32,6 +33,13 @@ import stirling.software.SPDF.utils.propertyeditor.StringToMapPropertyEditor;
@Tag(name = "Misc", description = "Miscellaneous APIs")
public class MetadataController {
private final CustomPDDocumentFactory pdfDocumentFactory;
/**
 * Creates the controller with the document factory its endpoints use to load PDF documents.
 *
 * @param pdfDocumentFactory factory stored in {@code this.pdfDocumentFactory}
 */
@Autowired
public MetadataController(CustomPDDocumentFactory pdfDocumentFactory) {
this.pdfDocumentFactory = pdfDocumentFactory;
}
private String checkUndefined(String entry) {
// Check if the string is "undefined"
if ("undefined".equals(entry)) {
@ -76,7 +84,7 @@ public class MetadataController {
allRequestParams = new java.util.HashMap<String, String>();
}
// Load the PDF file into a PDDocument
PDDocument document = Loader.loadPDF(pdfFile.getBytes());
PDDocument document = pdfDocumentFactory.load(pdfFile.getBytes());
// Get the document information from the PDF
PDDocumentInformation info = document.getDocumentInformation();

View File

@ -73,17 +73,16 @@ public class PageNumbersController {
case "x-large":
marginFactor = 0.075f;
break;
default:
marginFactor = 0.035f;
break;
}
float fontSize = font_size;
if (pagesToNumber == null || pagesToNumber.length() == 0) {
if (pagesToNumber == null || pagesToNumber.isEmpty()) {
pagesToNumber = "all";
}
if (customText == null || customText.length() == 0) {
if (customText == null || customText.isEmpty()) {
customText = "{n}";
}
List<Integer> pagesToNumberList =
@ -94,63 +93,69 @@ public class PageNumbersController {
PDRectangle pageSize = page.getMediaBox();
String text =
customText != null
? customText
.replace("{n}", String.valueOf(pageNumber))
.replace("{total}", String.valueOf(document.getNumberOfPages()))
.replace(
"{filename}",
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", ""))
: String.valueOf(pageNumber);
customText
.replace("{n}", String.valueOf(pageNumber))
.replace("{total}", String.valueOf(document.getNumberOfPages()))
.replace(
"{filename}",
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", ""));
PDType1Font currentFont =
switch (font_type.toLowerCase()) {
case "courier" -> new PDType1Font(Standard14Fonts.FontName.COURIER);
case "times" -> new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN);
default -> new PDType1Font(Standard14Fonts.FontName.HELVETICA);
};
float x, y;
int xGroup = (position - 1) % 3;
int yGroup = 2 - (position - 1) / 3;
if (position == 5) {
// Calculate text width and font metrics
float textWidth = currentFont.getStringWidth(text) / 1000 * fontSize;
switch (xGroup) {
case 0: // left
x = pageSize.getLowerLeftX() + marginFactor * pageSize.getWidth();
break;
case 1: // center
x = pageSize.getLowerLeftX() + (pageSize.getWidth() / 2);
break;
default: // right
x = pageSize.getUpperRightX() - marginFactor * pageSize.getWidth();
break;
}
float ascent = currentFont.getFontDescriptor().getAscent() / 1000 * fontSize;
float descent = currentFont.getFontDescriptor().getDescent() / 1000 * fontSize;
switch (yGroup) {
case 0: // bottom
y = pageSize.getLowerLeftY() + marginFactor * pageSize.getHeight();
break;
case 1: // middle
y = pageSize.getLowerLeftY() + (pageSize.getHeight() / 2);
break;
default: // top
y = pageSize.getUpperRightY() - marginFactor * pageSize.getHeight();
break;
float centerX = pageSize.getLowerLeftX() + (pageSize.getWidth() / 2);
float centerY = pageSize.getLowerLeftY() + (pageSize.getHeight() / 2);
x = centerX - (textWidth / 2);
y = centerY - (ascent + descent) / 2;
} else {
int xGroup = (position - 1) % 3;
int yGroup = 2 - (position - 1) / 3;
x =
switch (xGroup) {
case 0 ->
pageSize.getLowerLeftX()
+ marginFactor * pageSize.getWidth(); // left
case 1 ->
pageSize.getLowerLeftX() + (pageSize.getWidth() / 2); // center
default ->
pageSize.getUpperRightX()
- marginFactor * pageSize.getWidth(); // right
};
y =
switch (yGroup) {
case 0 ->
pageSize.getLowerLeftY()
+ marginFactor * pageSize.getHeight(); // bottom
case 1 ->
pageSize.getLowerLeftY() + (pageSize.getHeight() / 2); // middle
default ->
pageSize.getUpperRightY()
- marginFactor * pageSize.getHeight(); // top
};
}
PDPageContentStream contentStream =
new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
contentStream.beginText();
switch (font_type.toLowerCase()) {
case "helvetica":
contentStream.setFont(
new PDType1Font(Standard14Fonts.FontName.HELVETICA), fontSize);
break;
case "courier":
contentStream.setFont(
new PDType1Font(Standard14Fonts.FontName.COURIER), fontSize);
break;
case "times":
contentStream.setFont(
new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN), fontSize);
break;
}
contentStream.setFont(currentFont, fontSize);
contentStream.newLineAtOffset(x, y);
contentStream.showText(text);
contentStream.endText();

View File

@ -3,10 +3,10 @@ package stirling.software.SPDF.controller.api.misc;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
@ -20,6 +20,7 @@ import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@ -27,6 +28,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Misc", description = "Miscellaneous APIs")
public class ShowJavascript {
private final CustomPDDocumentFactory pdfDocumentFactory;
/**
 * Creates the controller.
 *
 * @param pdfDocumentFactory factory this controller uses to load the uploaded PDF
 */
@Autowired
public ShowJavascript(final CustomPDDocumentFactory pdfDocumentFactory) {
    this.pdfDocumentFactory = pdfDocumentFactory;
}
@PostMapping(consumes = "multipart/form-data", value = "/show-javascript")
@Operation(
summary = "Grabs all JS from a PDF and returns a single JS file with all code",
@ -35,7 +43,7 @@ public class ShowJavascript {
MultipartFile inputFile = request.getFileInput();
String script = "";
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) {
try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) {
if (document.getDocumentCatalog() != null
&& document.getDocumentCatalog().getNames() != null) {

View File

@ -130,8 +130,8 @@ public class CertSignController {
summary = "Sign PDF with a Digital Certificate",
description =
"This endpoint accepts a PDF file, a digital certificate and related"
+ " information to sign the PDF. It then returns the digitally signed PDF"
+ " file. Input:PDF Output:PDF Type:SISO")
+ " information to sign the PDF. It then returns the digitally signed PDF"
+ " file. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> signPDFWithCert(@ModelAttribute SignPDFWithCertRequest request)
throws Exception {
MultipartFile pdf = request.getFileInput();

View File

@ -6,7 +6,6 @@ import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSInputStream;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
@ -44,6 +43,7 @@ import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.xml.DomXmpParser;
import org.apache.xmpbox.xml.XmpParsingException;
import org.apache.xmpbox.xml.XmpSerializer;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
@ -62,6 +62,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.service.CustomPDDocumentFactory;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@ -72,6 +73,13 @@ public class GetInfoOnPDF {
static ObjectMapper objectMapper = new ObjectMapper();
private final CustomPDDocumentFactory pdfDocumentFactory;
/**
 * Creates the controller.
 *
 * @param pdfDocumentFactory factory this controller uses to load the uploaded PDF
 */
@Autowired
public GetInfoOnPDF(final CustomPDDocumentFactory pdfDocumentFactory) {
    this.pdfDocumentFactory = pdfDocumentFactory;
}
private static void addOutlinesToArray(PDOutlineItem outline, ArrayNode arrayNode) {
if (outline == null) return;
@ -118,7 +126,7 @@ public class GetInfoOnPDF {
@Operation(summary = "Summary here", description = "desc. Input:PDF Output:JSON Type:SISO")
public ResponseEntity<byte[]> getPdfInfo(@ModelAttribute PDFFile request) throws IOException {
MultipartFile inputFile = request.getFileInput();
try (PDDocument pdfBoxDoc = Loader.loadPDF(inputFile.getBytes()); ) {
try (PDDocument pdfBoxDoc = pdfDocumentFactory.load(inputFile.getBytes()); ) {
ObjectMapper objectMapper = new ObjectMapper();
ObjectNode jsonOutput = objectMapper.createObjectNode();

View File

@ -3,8 +3,11 @@ package stirling.software.SPDF.controller.api.security;
import java.awt.*;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@ -94,7 +97,10 @@ public class RedactController {
private void redactAreas(
List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
throws IOException {
Color redactColor = null;
// Group redaction areas by page
Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>();
// Process and validate each redaction area
for (RedactionArea redactionArea : redactionAreas) {
if (redactionArea.getPage() == null
|| redactionArea.getPage() <= 0
@ -102,23 +108,44 @@ public class RedactController {
|| redactionArea.getHeight() <= 0.0D
|| redactionArea.getWidth() == null
|| redactionArea.getWidth() <= 0.0D) continue;
PDPage page = allPages.get(redactionArea.getPage() - 1);
// Group by page number
redactionsByPage
.computeIfAbsent(redactionArea.getPage(), k -> new ArrayList<>())
.add(redactionArea);
}
// Process each page only once
for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) {
Integer pageNumber = entry.getKey();
List<RedactionArea> areasForPage = entry.getValue();
if (pageNumber > allPages.getCount()) {
continue; // Skip if page number is out of bounds
}
PDPage page = allPages.get(pageNumber - 1);
PDRectangle box = page.getBBox();
// Create only one content stream per page
PDPageContentStream contentStream =
new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK);
contentStream.setNonStrokingColor(redactColor);
float x = redactionArea.getX().floatValue();
float y = redactionArea.getY().floatValue();
float width = redactionArea.getWidth().floatValue();
float height = redactionArea.getHeight().floatValue();
// Process all redactions for this page
for (RedactionArea redactionArea : areasForPage) {
Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK);
contentStream.setNonStrokingColor(redactColor);
PDRectangle box = page.getBBox();
float x = redactionArea.getX().floatValue();
float y = redactionArea.getY().floatValue();
float width = redactionArea.getWidth().floatValue();
float height = redactionArea.getHeight().floatValue();
contentStream.addRect(x, box.getHeight() - y - height, width, height);
contentStream.fill();
}
contentStream.addRect(x, box.getHeight() - y - height, width, height);
contentStream.fill();
contentStream.close();
}
}

View File

@ -45,7 +45,7 @@ public class SanitizeController {
summary = "Sanitize a PDF file",
description =
"This endpoint processes a PDF file and removes specific elements based on the"
+ " provided options. Input:PDF Output:PDF Type:SISO")
+ " provided options. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> sanitizePDF(@ModelAttribute SanitizePdfRequest request)
throws IOException {
MultipartFile inputFile = request.getFileInput();

View File

@ -1,5 +1,8 @@
package stirling.software.SPDF.model;
import lombok.Data;
@Data
public class PDFText {
private final int pageIndex;
private final float x1;
@ -7,37 +10,4 @@ public class PDFText {
private final float x2;
private final float y2;
private final String text;
public PDFText(int pageIndex, float x1, float y1, float x2, float y2, String text) {
this.pageIndex = pageIndex;
this.x1 = x1;
this.y1 = y1;
this.x2 = x2;
this.y2 = y2;
this.text = text;
}
public int getPageIndex() {
return pageIndex;
}
public float getX1() {
return x1;
}
public float getY1() {
return y1;
}
public float getX2() {
return x2;
}
public float getY2() {
return y2;
}
public String getText() {
return text;
}
}

View File

@ -4,142 +4,355 @@ import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction;
import org.apache.pdfbox.io.ScratchFile;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.PdfMetadata;
import stirling.software.SPDF.model.api.PDFFile;
/**
* Adaptive PDF document factory that optimizes memory usage based on file size and available system
* resources.
*/
@Component
@Slf4j
public class CustomPDDocumentFactory {
private final PdfMetadataService pdfMetadataService;
@Autowired
// Memory thresholds and limits
private static final long SMALL_FILE_THRESHOLD = 10 * 1024 * 1024; // 10 MB
// Files smaller than this threshold are loaded entirely in memory for better performance.
// These files use IOUtils.createMemoryOnlyStreamCache() which keeps all document data in RAM.
// No temp files are created for document data, reducing I/O operations but consuming more
// memory.
private static final long LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50 MB
// Files from SMALL_FILE_THRESHOLD up to (but not including) this threshold use a mixed
// memory/temp-file ScratchFile cache limited to LARGE_FILE_USAGE bytes of main memory
// (unless free memory is low, in which case a temp-file-only cache is forced).
// Files at or above this threshold always use a temp-file-only cache.
// Note: byte arrays are written to a temp file before loading once they reach
// SMALL_FILE_THRESHOLD, not this threshold.
private static final long LARGE_FILE_USAGE = 10 * 1024 * 1024;
private static final long EXTREMELY_LARGE_THRESHOLD = 100 * 1024 * 1024; // 100 MB
// Files exceeding this threshold use specialized loading with RandomAccessReadBufferedFile
// which provides buffered access to the file without loading the entire content at once.
// These files are always processed using file-based caching with minimal memory footprint,
// trading some performance for significantly reduced memory usage.
// For extremely large PDFs, this prevents OutOfMemoryErrors at the cost of being more I/O
// bound.
private static final double MIN_FREE_MEMORY_PERCENTAGE = 30.0; // 30%
private static final long MIN_FREE_MEMORY_BYTES = 4L * 1024 * 1024 * 1024; // 4 GB
// Counter for tracking temporary resources
private static final AtomicLong tempCounter = new AtomicLong(0);
/**
 * Creates the factory.
 *
 * @param pdfMetadataService service used to apply metadata to documents this factory creates
 *     or loads
 */
public CustomPDDocumentFactory(final PdfMetadataService pdfMetadataService) {
    this.pdfMetadataService = pdfMetadataService;
}
public PDDocument createNewDocument() throws IOException {
PDDocument document = new PDDocument();
pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true);
/**
 * Loads a PDF document from a file. The file's length is measured and handed, together with
 * the file, to {@code loadAdaptively}, which selects the loading strategy.
 *
 * @param file the PDF file to load; must not be null
 * @return the loaded document
 * @throws IllegalArgumentException if {@code file} is null
 * @throws IOException if the document cannot be loaded
 */
public PDDocument load(File file) throws IOException {
    if (file != null) {
        final long sizeInBytes = file.length();
        log.info("Loading PDF from file, size: {}MB", sizeInBytes / (1024 * 1024));
        return loadAdaptively(file, sizeInBytes);
    }
    throw new IllegalArgumentException("File cannot be null");
}
/**
 * Loads a PDF document from an in-memory byte array. The array length is used as the content
 * size when {@code loadAdaptively} selects the loading strategy.
 *
 * @param input the raw PDF bytes; must not be null
 * @return the loaded document
 * @throws IllegalArgumentException if {@code input} is null
 * @throws IOException if the document cannot be loaded
 */
public PDDocument load(byte[] input) throws IOException {
    if (input != null) {
        final long sizeInBytes = input.length;
        log.info("Loading PDF from byte array, size: {}MB", sizeInBytes / (1024 * 1024));
        return loadAdaptively(input, sizeInBytes);
    }
    throw new IllegalArgumentException("Input bytes cannot be null");
}
/**
 * Loads a PDF document from an {@link InputStream}.
 *
 * <p>The stream size is not known up front, so the content is first copied to a temporary
 * file, which is then loaded like any other file.
 *
 * <p>If copying or loading fails for any reason (checked or unchecked), the temporary file is
 * deleted before the exception is rethrown. On success this method leaves the temporary file
 * in place. NOTE(review): presumably the returned document may still read from it; nothing
 * here deletes it later, so confirm who is responsible for removing it.
 *
 * @param input the stream to read the PDF from; must not be null. It is not closed here.
 * @return the loaded document
 * @throws IllegalArgumentException if {@code input} is null
 * @throws IOException if the stream cannot be copied or the document cannot be loaded
 */
public PDDocument load(InputStream input) throws IOException {
    if (input == null) {
        throw new IllegalArgumentException("InputStream cannot be null");
    }

    // Since we don't know the size upfront, buffer to a temp file
    Path tempFile = createTempFile("pdf-stream-");
    try {
        Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
        return loadAdaptively(tempFile.toFile(), Files.size(tempFile));
    } catch (IOException | RuntimeException e) {
        // Unchecked failures (e.g. a scratch-file cache that cannot be initialised) must not
        // leak the temp file either.
        cleanupFile(tempFile);
        throw e;
    }
}
/**
 * Loads a PDF from a {@link File} or {@code byte[]} source, choosing the stream cache from the
 * content size and the heap currently available to the JVM, and then runs {@code
 * postProcessDocument} on the result.
 *
 * <p>Cache selection, in order:
 *
 * <ol>
 *   <li>free heap below {@code MIN_FREE_MEMORY_PERCENTAGE} or {@code MIN_FREE_MEMORY_BYTES}:
 *       temp-file-only cache;
 *   <li>content smaller than {@code SMALL_FILE_THRESHOLD}: memory-only cache;
 *   <li>content smaller than {@code LARGE_FILE_THRESHOLD}: mixed cache limited to {@code
 *       LARGE_FILE_USAGE} bytes of main memory;
 *   <li>otherwise: temp-file-only cache.
 * </ol>
 *
 * <p>If post-processing fails, the freshly loaded document is closed before the exception is
 * propagated so that its resources are not leaked.
 *
 * @param source the PDF source; must be a {@code File} or a {@code byte[]}
 * @param contentSize size of the PDF content in bytes
 * @return the loaded, post-processed document; the caller must close it
 * @throws IllegalArgumentException if {@code source} is null or of an unsupported type
 * @throws IOException if loading or post-processing fails
 */
private PDDocument loadAdaptively(Object source, long contentSize) throws IOException {
    long maxMemory = Runtime.getRuntime().maxMemory();
    long freeMemory = Runtime.getRuntime().freeMemory();
    long totalMemory = Runtime.getRuntime().totalMemory();
    long usedMemory = totalMemory - freeMemory;

    // "Free" here means heap the JVM could still obtain (max - used), not just the unused
    // part of the currently committed heap.
    long actualFreeMemory = maxMemory - usedMemory;
    double freeMemoryPercent = (double) actualFreeMemory / maxMemory * 100;

    // Log memory status
    log.info(
            "Memory status - Free: {}MB ({}%), Used: {}MB, Max: {}MB",
            actualFreeMemory / (1024 * 1024),
            String.format("%.2f", freeMemoryPercent),
            usedMemory / (1024 * 1024),
            maxMemory / (1024 * 1024));

    // Determine caching strategy based on both file size and available memory
    StreamCacheCreateFunction cacheFunction;

    // If free memory is critically low, always use file-based caching
    if (freeMemoryPercent < MIN_FREE_MEMORY_PERCENTAGE
            || actualFreeMemory < MIN_FREE_MEMORY_BYTES) {
        log.info(
                "Low memory detected ({}%), forcing file-based cache",
                String.format("%.2f", freeMemoryPercent));
        cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
    } else if (contentSize < SMALL_FILE_THRESHOLD) {
        log.info("Using memory-only cache for small document ({}KB)", contentSize / 1024);
        cacheFunction = IOUtils.createMemoryOnlyStreamCache();
    } else if (contentSize < LARGE_FILE_THRESHOLD) {
        // For medium files (10-50MB), use a mixed approach
        log.info(
                "Using mixed memory/file cache for medium document ({}MB)",
                contentSize / (1024 * 1024));
        cacheFunction =
                createScratchFileCacheFunction(MemoryUsageSetting.setupMixed(LARGE_FILE_USAGE));
    } else {
        log.info("Using file-based cache for large document");
        cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
    }

    PDDocument document;
    if (source instanceof File file) {
        document = loadFromFile(file, contentSize, cacheFunction);
    } else if (source instanceof byte[] bytes) {
        document = loadFromBytes(bytes, contentSize, cacheFunction);
    } else {
        // Guard the null case explicitly so it is reported as an argument error, not an NPE.
        throw new IllegalArgumentException(
                "Unsupported source type: " + (source == null ? "null" : source.getClass()));
    }

    try {
        postProcessDocument(document);
    } catch (IOException | RuntimeException e) {
        // The caller never receives the document on failure, so release it here.
        try {
            document.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
    return document;
}
/**
 * Builds a stream-cache factory backed by {@link ScratchFile}.
 *
 * <p>Each invocation of the returned function creates a new {@code ScratchFile} from the given
 * settings. Because the function cannot throw a checked exception, an {@link IOException}
 * raised while creating the scratch file is rethrown wrapped in a {@link RuntimeException}.
 *
 * @param settings memory usage settings passed to every {@code ScratchFile} created
 * @return the cache creation function
 */
private StreamCacheCreateFunction createScratchFileCacheFunction(MemoryUsageSetting settings) {
    final StreamCacheCreateFunction scratchFileFactory =
            () -> {
                try {
                    return new ScratchFile(settings);
                } catch (IOException initFailure) {
                    throw new RuntimeException("ScratchFile initialization failed", initFailure);
                }
            };
    return scratchFileFactory;
}
/**
 * Applies the steps common to every adaptively loaded document: default metadata is set via
 * the metadata service, then {@code removePassword} is applied.
 *
 * @param doc the freshly loaded document
 * @throws IOException if either step fails
 */
private void postProcessDocument(final PDDocument doc) throws IOException {
    // Metadata first, then security removal — same order as before.
    this.pdfMetadataService.setDefaultMetadata(doc);
    this.removePassword(doc);
}
/**
 * Loads a PDF from a file using the supplied stream cache.
 *
 * <p>Files of at least {@code EXTREMELY_LARGE_THRESHOLD} bytes are opened through an explicit
 * {@link RandomAccessReadBufferedFile}; smaller files are passed to the loader directly. Both
 * paths use an empty password and no key store.
 *
 * @param file the PDF file
 * @param size the file size in bytes, used only to pick the loading path
 * @param cache the stream cache creation function to load with
 * @return the loaded document
 * @throws IOException if the file cannot be read or parsed
 */
private PDDocument loadFromFile(File file, long size, StreamCacheCreateFunction cache)
        throws IOException {
    final boolean extremelyLarge = size >= EXTREMELY_LARGE_THRESHOLD;
    if (!extremelyLarge) {
        return Loader.loadPDF(file, "", null, null, cache);
    }

    log.info("Loading extremely large file via buffered access");
    RandomAccessReadBufferedFile bufferedSource = new RandomAccessReadBufferedFile(file);
    return Loader.loadPDF(bufferedSource, "", null, null, cache);
}
/**
 * Loads a PDF from a byte array using the supplied stream cache.
 *
 * <p>Arrays smaller than {@code SMALL_FILE_THRESHOLD} are parsed directly from memory. Larger
 * arrays are first written to a temporary file and loaded from there; the temporary file is
 * deleted (best effort) as soon as the load call returns or throws.
 * NOTE(review): the temp file is removed while the returned document is still open — confirm
 * the document no longer needs it on all platforms.
 *
 * @param bytes the raw PDF bytes
 * @param size the content size in bytes, used only to pick the loading path
 * @param cache the stream cache creation function to load with
 * @return the loaded document
 * @throws IOException if the temp file cannot be written or the PDF cannot be parsed
 */
private PDDocument loadFromBytes(byte[] bytes, long size, StreamCacheCreateFunction cache)
        throws IOException {
    if (size < SMALL_FILE_THRESHOLD) {
        return Loader.loadPDF(bytes, "", null, null, cache);
    }

    log.info("Writing large byte array to temp file");
    final Path spillFile = createTempFile("pdf-bytes-");
    try {
        Files.write(spillFile, bytes);
        File spilled = spillFile.toFile();
        return Loader.loadPDF(spilled, "", null, null, cache);
    } finally {
        cleanupFile(spillFile);
    }
}
/**
 * Creates an empty document whose stream cache is a scratch file configured with the given
 * memory settings, and applies the default metadata to it.
 *
 * @param settings memory usage settings for the document's scratch file
 * @return the new, empty document; the caller must close it
 * @throws IOException if the default metadata cannot be applied
 */
public PDDocument createNewDocument(MemoryUsageSetting settings) throws IOException {
    final StreamCacheCreateFunction cache = createScratchFileCacheFunction(settings);
    final PDDocument created = new PDDocument(cache);
    pdfMetadataService.setDefaultMetadata(created);
    return created;
}
/**
 * Creates an empty document that uses temp-file-only memory settings.
 *
 * @return the new, empty document; the caller must close it
 * @throws IOException if the document cannot be set up
 */
public PDDocument createNewDocument() throws IOException {
    final MemoryUsageSetting tempFileOnly = MemoryUsageSetting.setupTempFileOnly();
    return createNewDocument(tempFileOnly);
}
/**
 * Serialises a document to a byte array.
 *
 * <p>Documents with fewer than 10 pages are saved straight into an in-memory buffer. Larger
 * ones are saved to a temporary file that is read back and then deleted; page count is only
 * a simple heuristic for output size.
 *
 * @param document the document to save; it is not closed by this method
 * @return the saved PDF bytes
 * @throws IOException if saving or reading back the temp file fails
 */
public byte[] saveToBytes(PDDocument document) throws IOException {
    final boolean fewPages = document.getNumberOfPages() < 10; // Simple heuristic
    if (fewPages) {
        try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
            document.save(buffer);
            return buffer.toByteArray();
        }
    }

    final Path scratch = createTempFile("pdf-save-");
    try {
        document.save(scratch.toFile());
        return Files.readAllBytes(scratch);
    } finally {
        cleanupFile(scratch);
    }
}
/**
 * Marks an encrypted document so that all security is removed; unencrypted documents are left
 * untouched.
 *
 * @param document the document to process
 * @throws IOException if flagging the document fails; the original exception is logged and
 *     attached as the cause
 */
private void removePassword(PDDocument document) throws IOException {
    if (!document.isEncrypted()) {
        return;
    }

    try {
        document.setAllSecurityToBeRemoved(true);
    } catch (Exception failure) {
        log.error("Decryption failed", failure);
        throw new IOException("PDF decryption failed", failure);
    }
}
/**
 * Creates a temporary file named {@code <prefix><counter>-…​.tmp}, where the counter is the
 * next value of the shared {@code tempCounter}, and logs its path.
 *
 * @param prefix file name prefix placed before the counter
 * @return the path of the created file
 * @throws IOException if the file cannot be created
 */
private Path createTempFile(String prefix) throws IOException {
    final long sequence = tempCounter.incrementAndGet();
    final String namePrefix = prefix + sequence + "-";
    final Path created = Files.createTempFile(namePrefix, ".tmp");
    log.info("Created temp file: {}", created);
    return created;
}
/**
 * Creates a temporary directory whose name starts with {@code <prefix><counter>-}, where the
 * counter is the next value of the shared {@code tempCounter}.
 *
 * @param prefix directory name prefix placed before the counter
 * @return the path of the created directory
 * @throws IOException if the directory cannot be created
 */
private Path createTempDirectory(String prefix) throws IOException {
    final long sequence = tempCounter.incrementAndGet();
    final String namePrefix = prefix + sequence + "-";
    return Files.createTempDirectory(namePrefix);
}
/**
 * Deletes a temporary file on a best-effort basis.
 *
 * <p>A successful deletion is logged at info level. A failure never propagates to the caller;
 * it is logged at warn level with the exception, so that leftover temp files stand out from
 * the routine create/delete messages.
 *
 * @param file path of the temporary file to delete; a path that no longer exists is ignored
 */
private void cleanupFile(Path file) {
    try {
        if (Files.deleteIfExists(file)) {
            log.info("Deleted temp file: {}", file);
        }
    } catch (IOException e) {
        // Deliberately swallowed: cleanup must not mask the outcome of the calling operation.
        log.warn("Error deleting temp file {}", file, e);
    }
}
/** Create new document bytes based on an existing document */
public byte[] createNewBytesBasedOnOldDocument(byte[] oldDocument) throws IOException {
PDDocument document = Loader.loadPDF(oldDocument);
return createNewBytesBasedOnOldDocument(document);
try (PDDocument document = load(oldDocument)) {
return saveToBytes(document);
}
}
/** Create new document bytes based on an existing document file */
public byte[] createNewBytesBasedOnOldDocument(File oldDocument) throws IOException {
PDDocument document = Loader.loadPDF(oldDocument);
return createNewBytesBasedOnOldDocument(document);
try (PDDocument document = load(oldDocument)) {
return saveToBytes(document);
}
}
/** Create new document bytes based on an existing PDDocument */
public byte[] createNewBytesBasedOnOldDocument(PDDocument oldDocument) throws IOException {
pdfMetadataService.setMetadataToPdf(
oldDocument, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
oldDocument.save(baos);
oldDocument.close();
return baos.toByteArray();
return saveToBytes(oldDocument);
}
/** Create a new document based on an existing document bytes */
public PDDocument createNewDocumentBasedOnOldDocument(byte[] oldDocument) throws IOException {
PDDocument document = Loader.loadPDF(oldDocument);
return createNewDocumentBasedOnOldDocument(document);
try (PDDocument document = load(oldDocument)) {
return createNewDocumentBasedOnOldDocument(document);
}
}
/** Create a new document based on an existing document file */
public PDDocument createNewDocumentBasedOnOldDocument(File oldDocument) throws IOException {
PDDocument document = Loader.loadPDF(oldDocument);
return createNewDocumentBasedOnOldDocument(document);
try (PDDocument document = load(oldDocument)) {
return createNewDocumentBasedOnOldDocument(document);
}
}
/** Create a new document based on an existing PDDocument */
public PDDocument createNewDocumentBasedOnOldDocument(PDDocument oldDocument)
throws IOException {
PDDocument document = new PDDocument();
PDDocument document = createNewDocument();
pdfMetadataService.setMetadataToPdf(
document, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
return document;
}
/** Load document from a file and convert it to bytes */
public byte[] loadToBytes(File file) throws IOException {
PDDocument document = load(file);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
document.save(baos);
// Close the document
document.close();
return baos.toByteArray();
try (PDDocument document = load(file)) {
return saveToBytes(document);
}
}
/** Load document from bytes and convert it back to bytes */
public byte[] loadToBytes(byte[] bytes) throws IOException {
PDDocument document = load(bytes);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
document.save(baos);
// Close the document
document.close();
return baos.toByteArray();
}
// if loading from a file, assume the file has been made with Stirling-PDF
public PDDocument load(File file) throws IOException {
PDDocument document = Loader.loadPDF(file);
pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true);
return document;
}
public PDDocument load(InputStream input) throws IOException {
return load(input.readAllBytes());
}
public PDDocument load(byte[] input) throws IOException {
PDDocument document = Loader.loadPDF(input);
pdfMetadataService.setDefaultMetadata(document);
removezeropassword(document);
return document;
}
public PDDocument load(PDFFile pdfFile) throws IOException {
return load(pdfFile.getFileInput());
}
public PDDocument load(MultipartFile pdfFile) throws IOException {
return load(pdfFile.getBytes());
try (PDDocument document = load(bytes)) {
return saveToBytes(document);
}
}
/**
 * Loads a PDF document from a file system path given as a string.
 *
 * @param path path of the PDF file
 * @return the loaded document
 * @throws IOException if the document cannot be loaded
 */
public PDDocument load(String path) throws IOException {
    final File target = new File(path);
    return load(target);
}
/**
 * Loads a PDF document from the file input carried by a {@code PDFFile} request object.
 *
 * @param pdfFile request object holding the uploaded file
 * @return the loaded document
 * @throws IOException if the document cannot be loaded
 */
public PDDocument load(PDFFile pdfFile) throws IOException {
    final MultipartFile upload = pdfFile.getFileInput();
    return load(upload);
}
/**
 * Loads a PDF document from an uploaded multipart file by reading all of its bytes and
 * delegating to the byte-array loader.
 *
 * @param pdfFile the uploaded file
 * @return the loaded document
 * @throws IOException if the upload cannot be read or the document cannot be loaded
 */
public PDDocument load(MultipartFile pdfFile) throws IOException {
    final byte[] content = pdfFile.getBytes();
    return load(content);
}
/**
 * Loads a password-protected PDF document from an uploaded multipart file.
 *
 * @param fileInput the uploaded file
 * @param password the password used to open the document
 * @return the loaded document
 * @throws IOException if the upload cannot be read or the document cannot be loaded
 */
public PDDocument load(MultipartFile fileInput, String password) throws IOException {
    final byte[] content = fileInput.getBytes();
    return load(content, password);
}
/**
 * Loads a password-protected PDF document from a byte array.
 *
 * <p>This path bypasses the adaptive loader, which has no direct password support, and calls
 * PDFBox's {@code Loader} directly. Only the default metadata is applied afterwards; {@code
 * removePassword} is not invoked here.
 *
 * @param bytes the raw PDF bytes
 * @param password the password used to open the document
 * @return the loaded document
 * @throws IOException if the document cannot be loaded
 */
private PDDocument load(byte[] bytes, String password) throws IOException {
    final PDDocument opened = Loader.loadPDF(bytes, password);
    pdfMetadataService.setDefaultMetadata(opened);
    return opened;
}
private PDDocument removezeropassword(PDDocument document) throws IOException {
if (document.isEncrypted()) {
try {
log.info("Removing security from the source document");
document.setAllSecurityToBeRemoved(true);
} catch (Exception e) {
log.warn("Cannot decrypt the pdf");
}
}
return document;
}
// Add other load methods as needed, following the same pattern
}

View File

@ -14,7 +14,6 @@ import java.util.zip.ZipOutputStream;
import javax.imageio.*;
import javax.imageio.stream.ImageOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@ -128,6 +127,7 @@ public class PdfUtils {
}
public static byte[] convertFromPdf(
CustomPDDocumentFactory pdfDocumentFactory,
byte[] inputStream,
String imageType,
ImageType colorType,
@ -135,7 +135,7 @@ public class PdfUtils {
int DPI,
String filename)
throws IOException, Exception {
try (PDDocument document = Loader.loadPDF(inputStream)) {
try (PDDocument document = pdfDocumentFactory.load(inputStream)) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
pdfRenderer.setSubsamplingAllowed(true);
int pageCount = document.getNumberOfPages();