mirror of
https://github.com/Stirling-Tools/Stirling-PDF.git
synced 2025-04-19 11:11:18 +00:00
Memory enhancements and PDF decompress API (#3129)
# Description of Changes - PDF split by size now checks the size of the PDF as it splits, which avoids an issue where a PDF's size differs when viewed vs. saved due to compression caused by repeated data, etc. - Additionally, memory enhancements for PDF loading to dynamically load in memory vs. scratch - PDF Decompress API for PDF testing ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
This commit is contained in:
parent
33eb3fd034
commit
ed2ef01690
@ -124,7 +124,7 @@ These files provide pre-configured setups for different scenarios. For example,
|
||||
services:
|
||||
stirling-pdf:
|
||||
container_name: Stirling-PDF-Security
|
||||
image: stirlingtools/stirling-pdf:latest
|
||||
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
|
@ -25,7 +25,7 @@ ext {
|
||||
}
|
||||
|
||||
group = "stirling.software"
|
||||
version = "0.43.2"
|
||||
version = "0.44.0"
|
||||
|
||||
java {
|
||||
// 17 is lowest but we support and recommend 21
|
||||
|
@ -1,7 +1,7 @@
|
||||
services:
|
||||
stirling-pdf:
|
||||
container_name: Stirling-PDF-Security-Fat-Postgres
|
||||
image: stirlingtools/stirling-pdf:latest-fat-postgres
|
||||
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-fat-postgres
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
|
@ -1,7 +1,7 @@
|
||||
services:
|
||||
stirling-pdf:
|
||||
container_name: Stirling-PDF-Security-Fat
|
||||
image: stirlingtools/stirling-pdf:latest-fat
|
||||
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-fat
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
|
@ -1,7 +1,7 @@
|
||||
services:
|
||||
stirling-pdf:
|
||||
container_name: Stirling-PDF-Security
|
||||
image: stirlingtools/stirling-pdf:latest
|
||||
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
|
@ -1,7 +1,7 @@
|
||||
services:
|
||||
stirling-pdf:
|
||||
container_name: Stirling-PDF-Security
|
||||
image: stirlingtools/stirling-pdf:latest
|
||||
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
|
@ -1,7 +1,7 @@
|
||||
services:
|
||||
stirling-pdf:
|
||||
container_name: Stirling-PDF-Ultra-Lite-Security
|
||||
image: stirlingtools/stirling-pdf:latest-ultra-lite
|
||||
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-ultra-lite
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
|
@ -1,7 +1,7 @@
|
||||
services:
|
||||
stirling-pdf:
|
||||
container_name: Stirling-PDF-Ultra-Lite
|
||||
image: stirlingtools/stirling-pdf:latest-ultra-lite
|
||||
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest-ultra-lite
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
|
@ -1,7 +1,7 @@
|
||||
services:
|
||||
stirling-pdf:
|
||||
container_name: Stirling-PDF
|
||||
image: stirlingtools/stirling-pdf:latest
|
||||
image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
|
@ -11,6 +11,7 @@ import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.ApplicationProperties;
|
||||
|
||||
@Service
|
||||
|
@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
@ -12,24 +11,33 @@ import org.apache.pdfbox.pdmodel.PDPageTree;
|
||||
import org.apache.pdfbox.pdmodel.encryption.PDEncryption;
|
||||
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
|
||||
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import stirling.software.SPDF.model.api.PDFFile;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/analysis")
|
||||
@Tag(name = "Analysis", description = "Analysis APIs")
|
||||
public class AnalysisController {
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@Autowired
|
||||
public AnalysisController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(value = "/page-count", consumes = "multipart/form-data")
|
||||
@Operation(
|
||||
summary = "Get PDF page count",
|
||||
description = "Returns total number of pages in PDF. Input:PDF Output:JSON Type:SISO")
|
||||
public Map<String, Integer> getPageCount(@ModelAttribute PDFFile file) throws IOException {
|
||||
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
|
||||
return Map.of("pageCount", document.getNumberOfPages());
|
||||
}
|
||||
}
|
||||
@ -39,7 +47,7 @@ public class AnalysisController {
|
||||
summary = "Get basic PDF information",
|
||||
description = "Returns page count, version, file size. Input:PDF Output:JSON Type:SISO")
|
||||
public Map<String, Object> getBasicInfo(@ModelAttribute PDFFile file) throws IOException {
|
||||
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
|
||||
Map<String, Object> info = new HashMap<>();
|
||||
info.put("pageCount", document.getNumberOfPages());
|
||||
info.put("pdfVersion", document.getVersion());
|
||||
@ -54,7 +62,7 @@ public class AnalysisController {
|
||||
description = "Returns title, author, subject, etc. Input:PDF Output:JSON Type:SISO")
|
||||
public Map<String, String> getDocumentProperties(@ModelAttribute PDFFile file)
|
||||
throws IOException {
|
||||
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
|
||||
PDDocumentInformation info = document.getDocumentInformation();
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put("title", info.getTitle());
|
||||
@ -75,7 +83,7 @@ public class AnalysisController {
|
||||
description = "Returns width and height of each page. Input:PDF Output:JSON Type:SISO")
|
||||
public List<Map<String, Float>> getPageDimensions(@ModelAttribute PDFFile file)
|
||||
throws IOException {
|
||||
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
|
||||
List<Map<String, Float>> dimensions = new ArrayList<>();
|
||||
PDPageTree pages = document.getPages();
|
||||
|
||||
@ -95,7 +103,7 @@ public class AnalysisController {
|
||||
description =
|
||||
"Returns count and details of form fields. Input:PDF Output:JSON Type:SISO")
|
||||
public Map<String, Object> getFormFields(@ModelAttribute PDFFile file) throws IOException {
|
||||
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
|
||||
Map<String, Object> formInfo = new HashMap<>();
|
||||
PDAcroForm form = document.getDocumentCatalog().getAcroForm();
|
||||
|
||||
@ -117,7 +125,7 @@ public class AnalysisController {
|
||||
summary = "Get annotation information",
|
||||
description = "Returns count and types of annotations. Input:PDF Output:JSON Type:SISO")
|
||||
public Map<String, Object> getAnnotationInfo(@ModelAttribute PDFFile file) throws IOException {
|
||||
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
|
||||
Map<String, Object> annotInfo = new HashMap<>();
|
||||
int totalAnnotations = 0;
|
||||
Map<String, Integer> annotationTypes = new HashMap<>();
|
||||
@ -142,7 +150,7 @@ public class AnalysisController {
|
||||
description =
|
||||
"Returns list of fonts used in the document. Input:PDF Output:JSON Type:SISO")
|
||||
public Map<String, Object> getFontInfo(@ModelAttribute PDFFile file) throws IOException {
|
||||
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
|
||||
Map<String, Object> fontInfo = new HashMap<>();
|
||||
Set<String> fontNames = new HashSet<>();
|
||||
|
||||
@ -164,7 +172,7 @@ public class AnalysisController {
|
||||
description =
|
||||
"Returns encryption and permission details. Input:PDF Output:JSON Type:SISO")
|
||||
public Map<String, Object> getSecurityInfo(@ModelAttribute PDFFile file) throws IOException {
|
||||
try (PDDocument document = Loader.loadPDF(file.getFileInput().getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(file.getFileInput().getBytes())) {
|
||||
Map<String, Object> securityInfo = new HashMap<>();
|
||||
PDEncryption encryption = document.getEncryption();
|
||||
|
||||
|
@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.multipdf.LayerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
@ -23,7 +22,6 @@ import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import stirling.software.SPDF.model.api.general.CropPdfForm;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.service.PostHogService;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@RestController
|
||||
@ -33,13 +31,9 @@ public class CropController {
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
private final PostHogService postHogService;
|
||||
|
||||
@Autowired
|
||||
public CropController(
|
||||
CustomPDDocumentFactory pdfDocumentFactory, PostHogService postHogService) {
|
||||
public CropController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
this.postHogService = postHogService;
|
||||
}
|
||||
|
||||
@PostMapping(value = "/crop", consumes = "multipart/form-data")
|
||||
@ -48,7 +42,7 @@ public class CropController {
|
||||
description =
|
||||
"This operation takes an input PDF file and crops it according to the given coordinates. Input:PDF Output:PDF Type:SISO")
|
||||
public ResponseEntity<byte[]> cropPdf(@ModelAttribute CropPdfForm form) throws IOException {
|
||||
PDDocument sourceDocument = Loader.loadPDF(form.getFileInput().getBytes());
|
||||
PDDocument sourceDocument = pdfDocumentFactory.load(form.getFileInput().getBytes());
|
||||
|
||||
PDDocument newDocument =
|
||||
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
|
||||
|
@ -12,7 +12,6 @@ import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||
@ -101,8 +100,8 @@ public class MergeController {
|
||||
};
|
||||
case "byPDFTitle":
|
||||
return (file1, file2) -> {
|
||||
try (PDDocument doc1 = Loader.loadPDF(file1.getBytes());
|
||||
PDDocument doc2 = Loader.loadPDF(file2.getBytes())) {
|
||||
try (PDDocument doc1 = pdfDocumentFactory.load(file1.getBytes());
|
||||
PDDocument doc2 = pdfDocumentFactory.load(file2.getBytes())) {
|
||||
String title1 = doc1.getDocumentInformation().getTitle();
|
||||
String title2 = doc2.getDocumentInformation().getTitle();
|
||||
return title1.compareTo(title2);
|
||||
@ -152,7 +151,7 @@ public class MergeController {
|
||||
byte[] mergedPdfBytes = docOutputstream.toByteArray(); // Get merged document bytes
|
||||
|
||||
// Load the merged PDF document
|
||||
mergedDocument = Loader.loadPDF(mergedPdfBytes);
|
||||
mergedDocument = pdfDocumentFactory.load(mergedPdfBytes);
|
||||
|
||||
// Remove signatures if removeCertSign is true
|
||||
if (removeCertSign) {
|
||||
|
@ -4,7 +4,6 @@ import java.awt.*;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.multipdf.LayerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
@ -64,7 +63,7 @@ public class MultiPageLayoutController {
|
||||
: (int) Math.sqrt(pagesPerSheet);
|
||||
int rows = pagesPerSheet == 2 || pagesPerSheet == 3 ? 1 : (int) Math.sqrt(pagesPerSheet);
|
||||
|
||||
PDDocument sourceDocument = Loader.loadPDF(file.getBytes());
|
||||
PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes());
|
||||
PDDocument newDocument =
|
||||
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
|
||||
PDPage newPage = new PDPage(PDRectangle.A4);
|
||||
|
@ -5,7 +5,6 @@ import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
@ -251,7 +250,7 @@ public class RearrangePagesPDFController {
|
||||
String sortType = request.getCustomMode();
|
||||
try {
|
||||
// Load the input PDF
|
||||
PDDocument document = Loader.loadPDF(pdfFile.getBytes());
|
||||
PDDocument document = pdfDocumentFactory.load(pdfFile.getBytes());
|
||||
|
||||
// Split the page order string into an array of page numbers or range of numbers
|
||||
String[] pageOrderArr = pageOrder != null ? pageOrder.split(",") : new String[0];
|
||||
|
@ -5,7 +5,6 @@ import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.multipdf.LayerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
@ -52,7 +51,7 @@ public class ScalePagesController {
|
||||
String targetPDRectangle = request.getPageSize();
|
||||
float scaleFactor = request.getScaleFactor();
|
||||
|
||||
PDDocument sourceDocument = Loader.loadPDF(file.getBytes());
|
||||
PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes());
|
||||
PDDocument outputDocument =
|
||||
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
|
||||
|
||||
|
@ -10,7 +10,6 @@ import java.util.stream.Collectors;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
@ -63,7 +62,7 @@ public class SplitPDFController {
|
||||
String pages = request.getPageNumbers();
|
||||
// open the pdf document
|
||||
|
||||
document = Loader.loadPDF(file.getBytes());
|
||||
document = pdfDocumentFactory.load(file.getBytes());
|
||||
// PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document);
|
||||
int totalPages = document.getNumberOfPages();
|
||||
List<Integer> pageNumbers = request.getPageNumbersList(document, false);
|
||||
|
@ -8,7 +8,6 @@ import java.util.List;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
|
||||
@ -34,6 +33,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.PdfMetadata;
|
||||
import stirling.software.SPDF.model.api.SplitPdfByChaptersRequest;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.service.PdfMetadataService;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@ -45,9 +45,13 @@ public class SplitPdfByChaptersController {
|
||||
|
||||
private final PdfMetadataService pdfMetadataService;
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@Autowired
|
||||
public SplitPdfByChaptersController(PdfMetadataService pdfMetadataService) {
|
||||
public SplitPdfByChaptersController(
|
||||
PdfMetadataService pdfMetadataService, CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfMetadataService = pdfMetadataService;
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
private static List<Bookmark> extractOutlineItems(
|
||||
@ -135,7 +139,7 @@ public class SplitPdfByChaptersController {
|
||||
if (bookmarkLevel < 0) {
|
||||
return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes());
|
||||
}
|
||||
sourceDocument = Loader.loadPDF(file.getBytes());
|
||||
sourceDocument = pdfDocumentFactory.load(file.getBytes());
|
||||
|
||||
PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline();
|
||||
|
||||
|
@ -9,7 +9,6 @@ import java.util.List;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.multipdf.LayerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
@ -57,7 +56,7 @@ public class SplitPdfBySectionsController {
|
||||
List<ByteArrayOutputStream> splitDocumentsBoas = new ArrayList<>();
|
||||
|
||||
MultipartFile file = request.getFileInput();
|
||||
PDDocument sourceDocument = Loader.loadPDF(file.getBytes());
|
||||
PDDocument sourceDocument = pdfDocumentFactory.load(file.getBytes());
|
||||
|
||||
// Process the PDF based on split parameters
|
||||
int horiz = request.getHorizontalDivisions() + 1;
|
||||
|
@ -7,7 +7,6 @@ import java.nio.file.Path;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
@ -41,6 +40,9 @@ public class SplitPdfBySizeController {
|
||||
@Autowired
|
||||
public SplitPdfBySizeController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
log.info(
|
||||
"SplitPdfBySizeController initialized with pdfDocumentFactory: {}",
|
||||
pdfDocumentFactory);
|
||||
}
|
||||
|
||||
@PostMapping(value = "/split-by-size-or-count", consumes = "multipart/form-data")
|
||||
@ -52,38 +54,92 @@ public class SplitPdfBySizeController {
|
||||
public ResponseEntity<byte[]> autoSplitPdf(@ModelAttribute SplitPdfBySizeOrCountRequest request)
|
||||
throws Exception {
|
||||
|
||||
log.info("Starting PDF split process with request: {}", request);
|
||||
MultipartFile file = request.getFileInput();
|
||||
log.info(
|
||||
"File received: name={}, size={} bytes",
|
||||
file.getOriginalFilename(),
|
||||
file.getSize());
|
||||
|
||||
Path zipFile = Files.createTempFile("split_documents", ".zip");
|
||||
log.info("Created temporary zip file: {}", zipFile);
|
||||
|
||||
String filename =
|
||||
Filenames.toSimpleFileName(file.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "");
|
||||
log.info("Base filename for output: {}", filename);
|
||||
|
||||
byte[] data = null;
|
||||
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile));
|
||||
PDDocument sourceDocument = Loader.loadPDF(file.getBytes())) {
|
||||
try {
|
||||
log.info("Reading input file bytes");
|
||||
byte[] pdfBytes = file.getBytes();
|
||||
log.info("Successfully read {} bytes from input file", pdfBytes.length);
|
||||
|
||||
log.info("Creating ZIP output stream");
|
||||
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) {
|
||||
log.info("Loading PDF document");
|
||||
try (PDDocument sourceDocument = pdfDocumentFactory.load(pdfBytes)) {
|
||||
log.info(
|
||||
"Successfully loaded PDF with {} pages",
|
||||
sourceDocument.getNumberOfPages());
|
||||
|
||||
int type = request.getSplitType();
|
||||
String value = request.getSplitValue();
|
||||
log.info("Split type: {}, Split value: {}", type, value);
|
||||
|
||||
if (type == 0) {
|
||||
log.info("Processing split by size");
|
||||
long maxBytes = GeneralUtils.convertSizeToBytes(value);
|
||||
log.info("Max bytes per document: {}", maxBytes);
|
||||
handleSplitBySize(sourceDocument, maxBytes, zipOut, filename);
|
||||
} else if (type == 1) {
|
||||
log.info("Processing split by page count");
|
||||
int pageCount = Integer.parseInt(value);
|
||||
log.info("Pages per document: {}", pageCount);
|
||||
handleSplitByPageCount(sourceDocument, pageCount, zipOut, filename);
|
||||
} else if (type == 2) {
|
||||
log.info("Processing split by document count");
|
||||
int documentCount = Integer.parseInt(value);
|
||||
log.info("Total number of documents: {}", documentCount);
|
||||
handleSplitByDocCount(sourceDocument, documentCount, zipOut, filename);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Invalid argument for split type");
|
||||
log.error("Invalid split type: {}", type);
|
||||
throw new IllegalArgumentException(
|
||||
"Invalid argument for split type: " + type);
|
||||
}
|
||||
|
||||
log.info("PDF splitting completed successfully");
|
||||
} catch (Exception e) {
|
||||
log.error("Error loading or processing PDF document", e);
|
||||
throw e;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("Error creating or writing to ZIP file", e);
|
||||
throw e;
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("exception", e);
|
||||
log.error("Exception during PDF splitting process", e);
|
||||
throw e; // Re-throw to ensure proper error response
|
||||
} finally {
|
||||
try {
|
||||
log.info("Reading ZIP file data");
|
||||
data = Files.readAllBytes(zipFile);
|
||||
Files.deleteIfExists(zipFile);
|
||||
log.info("Successfully read {} bytes from ZIP file", data.length);
|
||||
} catch (IOException e) {
|
||||
log.error("Error reading ZIP file data", e);
|
||||
}
|
||||
|
||||
try {
|
||||
log.info("Deleting temporary ZIP file");
|
||||
boolean deleted = Files.deleteIfExists(zipFile);
|
||||
log.info("Temporary ZIP file deleted: {}", deleted);
|
||||
} catch (IOException e) {
|
||||
log.error("Error deleting temporary ZIP file", e);
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Returning response with {} bytes of data", data != null ? data.length : 0);
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
|
||||
}
|
||||
@ -91,63 +147,230 @@ public class SplitPdfBySizeController {
|
||||
private void handleSplitBySize(
|
||||
PDDocument sourceDocument, long maxBytes, ZipOutputStream zipOut, String baseFilename)
|
||||
throws IOException {
|
||||
long currentSize = 0;
|
||||
log.info("Starting handleSplitBySize with maxBytes={}", maxBytes);
|
||||
|
||||
PDDocument currentDoc =
|
||||
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
|
||||
int fileIndex = 1;
|
||||
int totalPages = sourceDocument.getNumberOfPages();
|
||||
int pageAdded = 0;
|
||||
|
||||
for (int pageIndex = 0; pageIndex < sourceDocument.getNumberOfPages(); pageIndex++) {
|
||||
// Smart size check frequency - check more often with larger documents
|
||||
int baseCheckFrequency = 5;
|
||||
|
||||
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
|
||||
PDPage page = sourceDocument.getPage(pageIndex);
|
||||
ByteArrayOutputStream pageOutputStream = new ByteArrayOutputStream();
|
||||
log.info("Processing page {} of {}", pageIndex + 1, totalPages);
|
||||
|
||||
try (PDDocument tempDoc = new PDDocument()) {
|
||||
PDPage importedPage = tempDoc.importPage(page); // This creates a new PDPage object
|
||||
tempDoc.save(pageOutputStream);
|
||||
}
|
||||
|
||||
long pageSize = pageOutputStream.size();
|
||||
if (currentSize + pageSize > maxBytes) {
|
||||
if (currentDoc.getNumberOfPages() > 0) {
|
||||
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
|
||||
currentDoc.close(); // Make sure to close the document
|
||||
currentDoc = new PDDocument();
|
||||
currentSize = 0;
|
||||
}
|
||||
}
|
||||
|
||||
PDPage newPage = new PDPage(page.getCOSObject()); // Re-create the page
|
||||
// Add the page to current document
|
||||
PDPage newPage = new PDPage(page.getCOSObject());
|
||||
currentDoc.addPage(newPage);
|
||||
currentSize += pageSize;
|
||||
pageAdded++;
|
||||
|
||||
// Dynamic size checking based on document size and page count
|
||||
boolean shouldCheckSize =
|
||||
(pageAdded % baseCheckFrequency == 0)
|
||||
|| (pageIndex == totalPages - 1)
|
||||
|| (pageAdded >= 20); // Always check after 20 pages
|
||||
|
||||
if (shouldCheckSize) {
|
||||
log.info("Performing size check after {} pages", pageAdded);
|
||||
ByteArrayOutputStream checkSizeStream = new ByteArrayOutputStream();
|
||||
currentDoc.save(checkSizeStream);
|
||||
long actualSize = checkSizeStream.size();
|
||||
log.info("Current document size: {} bytes (max: {} bytes)", actualSize, maxBytes);
|
||||
|
||||
if (actualSize > maxBytes) {
|
||||
// We exceeded the limit - remove the last page and save
|
||||
if (currentDoc.getNumberOfPages() > 1) {
|
||||
currentDoc.removePage(currentDoc.getNumberOfPages() - 1);
|
||||
pageIndex--; // Process this page again in the next document
|
||||
log.info("Size limit exceeded - removed last page");
|
||||
}
|
||||
|
||||
if (currentDoc.getNumberOfPages() != 0) {
|
||||
log.info(
|
||||
"Saving document with {} pages as part {}",
|
||||
currentDoc.getNumberOfPages(),
|
||||
fileIndex);
|
||||
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
|
||||
currentDoc.close();
|
||||
currentDoc = new PDDocument();
|
||||
pageAdded = 0;
|
||||
} else if (pageIndex < totalPages - 1) {
|
||||
// We're under the limit, calculate if we might fit more pages
|
||||
// Try to predict how many more similar pages might fit
|
||||
if (actualSize < maxBytes * 0.75 && pageAdded > 0) {
|
||||
// Rather than using a ratio, look ahead to test actual upcoming pages
|
||||
int pagesToLookAhead = Math.min(5, totalPages - pageIndex - 1);
|
||||
|
||||
if (pagesToLookAhead > 0) {
|
||||
log.info(
|
||||
"Testing {} upcoming pages for potential addition",
|
||||
pagesToLookAhead);
|
||||
|
||||
// Create a temp document with current pages + look-ahead pages
|
||||
PDDocument testDoc = new PDDocument();
|
||||
// First copy existing pages
|
||||
for (int i = 0; i < currentDoc.getNumberOfPages(); i++) {
|
||||
testDoc.addPage(new PDPage(currentDoc.getPage(i).getCOSObject()));
|
||||
}
|
||||
|
||||
// Try adding look-ahead pages one by one
|
||||
int extraPagesAdded = 0;
|
||||
for (int i = 0; i < pagesToLookAhead; i++) {
|
||||
int testPageIndex = pageIndex + 1 + i;
|
||||
PDPage testPage = sourceDocument.getPage(testPageIndex);
|
||||
testDoc.addPage(new PDPage(testPage.getCOSObject()));
|
||||
|
||||
// Check if we're still under size
|
||||
ByteArrayOutputStream testStream = new ByteArrayOutputStream();
|
||||
testDoc.save(testStream);
|
||||
long testSize = testStream.size();
|
||||
|
||||
if (testSize <= maxBytes) {
|
||||
extraPagesAdded++;
|
||||
log.info(
|
||||
"Test: Can add page {} (size would be {})",
|
||||
testPageIndex + 1,
|
||||
testSize);
|
||||
} else {
|
||||
log.info(
|
||||
"Test: Cannot add page {} (size would be {})",
|
||||
testPageIndex + 1,
|
||||
testSize);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
testDoc.close();
|
||||
|
||||
// Add the pages we verified would fit
|
||||
if (extraPagesAdded > 0) {
|
||||
log.info("Adding {} verified pages ahead", extraPagesAdded);
|
||||
for (int i = 0; i < extraPagesAdded; i++) {
|
||||
int extraPageIndex = pageIndex + 1 + i;
|
||||
PDPage extraPage = sourceDocument.getPage(extraPageIndex);
|
||||
currentDoc.addPage(new PDPage(extraPage.getCOSObject()));
|
||||
}
|
||||
pageIndex += extraPagesAdded;
|
||||
pageAdded += extraPagesAdded;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Save final document if it has any pages
|
||||
if (currentDoc.getNumberOfPages() > 0) {
|
||||
log.info(
|
||||
"Saving final document with {} pages as part {}",
|
||||
currentDoc.getNumberOfPages(),
|
||||
fileIndex);
|
||||
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
|
||||
}
|
||||
|
||||
log.info("Completed handleSplitBySize with {} document parts created", fileIndex - 1);
|
||||
}
|
||||
|
||||
private void handleSplitByPageCount(
|
||||
PDDocument sourceDocument, int pageCount, ZipOutputStream zipOut, String baseFilename)
|
||||
throws IOException {
|
||||
log.info("Starting handleSplitByPageCount with pageCount={}", pageCount);
|
||||
int currentPageCount = 0;
|
||||
PDDocument currentDoc =
|
||||
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
|
||||
log.info("Creating initial output document");
|
||||
PDDocument currentDoc = null;
|
||||
try {
|
||||
currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
|
||||
log.info("Successfully created initial output document");
|
||||
} catch (Exception e) {
|
||||
log.error("Error creating initial output document", e);
|
||||
throw new IOException("Failed to create initial output document", e);
|
||||
}
|
||||
|
||||
int fileIndex = 1;
|
||||
int pageIndex = 0;
|
||||
int totalPages = sourceDocument.getNumberOfPages();
|
||||
log.info("Processing {} pages", totalPages);
|
||||
|
||||
try {
|
||||
for (PDPage page : sourceDocument.getPages()) {
|
||||
pageIndex++;
|
||||
log.info("Processing page {} of {}", pageIndex, totalPages);
|
||||
|
||||
try {
|
||||
log.info("Adding page {} to current document", pageIndex);
|
||||
currentDoc.addPage(page);
|
||||
log.info("Successfully added page {} to current document", pageIndex);
|
||||
} catch (Exception e) {
|
||||
log.error("Error adding page {} to current document", pageIndex, e);
|
||||
throw new IOException("Failed to add page to document", e);
|
||||
}
|
||||
|
||||
currentPageCount++;
|
||||
log.info("Current page count: {}/{}", currentPageCount, pageCount);
|
||||
|
||||
if (currentPageCount == pageCount) {
|
||||
// Save and reset current document
|
||||
log.info(
|
||||
"Reached target page count ({}), saving current document as part {}",
|
||||
pageCount,
|
||||
fileIndex);
|
||||
try {
|
||||
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
|
||||
log.info("Successfully saved document part {}", fileIndex - 1);
|
||||
} catch (Exception e) {
|
||||
log.error("Error saving document part {}", fileIndex - 1, e);
|
||||
throw e;
|
||||
}
|
||||
|
||||
try {
|
||||
log.info("Creating new document for next part");
|
||||
currentDoc = new PDDocument();
|
||||
log.info("Successfully created new document");
|
||||
} catch (Exception e) {
|
||||
log.error("Error creating new document for next part", e);
|
||||
throw new IOException("Failed to create new document", e);
|
||||
}
|
||||
|
||||
currentPageCount = 0;
|
||||
log.info("Reset current page count to 0");
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("Error iterating through pages", e);
|
||||
throw new IOException("Failed to iterate through pages", e);
|
||||
}
|
||||
|
||||
// Add the last document if it contains any pages
|
||||
try {
|
||||
if (currentDoc.getPages().getCount() != 0) {
|
||||
log.info(
|
||||
"Saving final document with {} pages as part {}",
|
||||
currentDoc.getPages().getCount(),
|
||||
fileIndex);
|
||||
try {
|
||||
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
|
||||
log.info("Successfully saved final document part {}", fileIndex - 1);
|
||||
} catch (Exception e) {
|
||||
log.error("Error saving final document part {}", fileIndex - 1, e);
|
||||
throw e;
|
||||
}
|
||||
} else {
|
||||
log.info("Final document has no pages, skipping");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("Error checking or saving final document", e);
|
||||
throw new IOException("Failed to process final document", e);
|
||||
} finally {
|
||||
try {
|
||||
log.info("Closing final document");
|
||||
currentDoc.close();
|
||||
log.info("Successfully closed final document");
|
||||
} catch (Exception e) {
|
||||
log.error("Error closing final document", e);
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Completed handleSplitByPageCount with {} document parts created", fileIndex - 1);
|
||||
}
|
||||
|
||||
private void handleSplitByDocCount(
|
||||
@ -156,35 +379,101 @@ public class SplitPdfBySizeController {
|
||||
ZipOutputStream zipOut,
|
||||
String baseFilename)
|
||||
throws IOException {
|
||||
log.info("Starting handleSplitByDocCount with documentCount={}", documentCount);
|
||||
int totalPageCount = sourceDocument.getNumberOfPages();
|
||||
log.info("Total pages in source document: {}", totalPageCount);
|
||||
|
||||
int pagesPerDocument = totalPageCount / documentCount;
|
||||
int extraPages = totalPageCount % documentCount;
|
||||
log.info("Pages per document: {}, Extra pages: {}", pagesPerDocument, extraPages);
|
||||
|
||||
int currentPageIndex = 0;
|
||||
int fileIndex = 1;
|
||||
|
||||
for (int i = 0; i < documentCount; i++) {
|
||||
PDDocument currentDoc =
|
||||
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
|
||||
log.info("Creating document {} of {}", i + 1, documentCount);
|
||||
PDDocument currentDoc = null;
|
||||
try {
|
||||
currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument);
|
||||
log.info("Successfully created document {} of {}", i + 1, documentCount);
|
||||
} catch (Exception e) {
|
||||
log.error("Error creating document {} of {}", i + 1, documentCount, e);
|
||||
throw new IOException("Failed to create document", e);
|
||||
}
|
||||
|
||||
int pagesToAdd = pagesPerDocument + (i < extraPages ? 1 : 0);
|
||||
log.info("Adding {} pages to document {}", pagesToAdd, i + 1);
|
||||
|
||||
for (int j = 0; j < pagesToAdd; j++) {
|
||||
currentDoc.addPage(sourceDocument.getPage(currentPageIndex++));
|
||||
try {
|
||||
log.info(
|
||||
"Adding page {} (index {}) to document {}",
|
||||
j + 1,
|
||||
currentPageIndex,
|
||||
i + 1);
|
||||
currentDoc.addPage(sourceDocument.getPage(currentPageIndex));
|
||||
log.info("Successfully added page {} to document {}", j + 1, i + 1);
|
||||
currentPageIndex++;
|
||||
} catch (Exception e) {
|
||||
log.error("Error adding page {} to document {}", j + 1, i + 1, e);
|
||||
throw new IOException("Failed to add page to document", e);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
log.info("Saving document {} with {} pages", i + 1, pagesToAdd);
|
||||
saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++);
|
||||
log.info("Successfully saved document {}", i + 1);
|
||||
} catch (Exception e) {
|
||||
log.error("Error saving document {}", i + 1, e);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Completed handleSplitByDocCount with {} documents created", documentCount);
|
||||
}
|
||||
|
||||
private void saveDocumentToZip(
|
||||
PDDocument document, ZipOutputStream zipOut, String baseFilename, int index)
|
||||
throws IOException {
|
||||
log.info("Starting saveDocumentToZip for document part {}", index);
|
||||
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
|
||||
document.save(outStream);
|
||||
document.close(); // Close the document to free resources
|
||||
|
||||
try {
|
||||
log.info("Saving document part {} to byte array", index);
|
||||
document.save(outStream);
|
||||
log.info("Successfully saved document part {} ({} bytes)", index, outStream.size());
|
||||
} catch (Exception e) {
|
||||
log.error("Error saving document part {} to byte array", index, e);
|
||||
throw new IOException("Failed to save document to byte array", e);
|
||||
}
|
||||
|
||||
try {
|
||||
log.info("Closing document part {}", index);
|
||||
document.close();
|
||||
log.info("Successfully closed document part {}", index);
|
||||
} catch (Exception e) {
|
||||
log.error("Error closing document part {}", index, e);
|
||||
// Continue despite close error
|
||||
}
|
||||
|
||||
try {
|
||||
// Create a new zip entry
|
||||
ZipEntry zipEntry = new ZipEntry(baseFilename + "_" + index + ".pdf");
|
||||
String entryName = baseFilename + "_" + index + ".pdf";
|
||||
log.info("Creating ZIP entry: {}", entryName);
|
||||
ZipEntry zipEntry = new ZipEntry(entryName);
|
||||
zipOut.putNextEntry(zipEntry);
|
||||
zipOut.write(outStream.toByteArray());
|
||||
|
||||
byte[] bytes = outStream.toByteArray();
|
||||
log.info("Writing {} bytes to ZIP entry", bytes.length);
|
||||
zipOut.write(bytes);
|
||||
|
||||
log.info("Closing ZIP entry");
|
||||
zipOut.closeEntry();
|
||||
log.info("Successfully added document part {} to ZIP", index);
|
||||
} catch (Exception e) {
|
||||
log.error("Error adding document part {} to ZIP", index, e);
|
||||
throw new IOException("Failed to add document to ZIP file", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -4,7 +4,6 @@ import java.awt.geom.AffineTransform;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.multipdf.LayerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
@ -46,7 +45,7 @@ public class ToSinglePageController {
|
||||
throws IOException {
|
||||
|
||||
// Load the source document
|
||||
PDDocument sourceDocument = Loader.loadPDF(request.getFileInput().getBytes());
|
||||
PDDocument sourceDocument = pdfDocumentFactory.load(request.getFileInput().getBytes());
|
||||
|
||||
// Calculate total height and max width
|
||||
float totalHeight = 0;
|
||||
|
@ -13,7 +13,6 @@ import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.rendering.ImageType;
|
||||
@ -91,6 +90,7 @@ public class ConvertImgPDFController {
|
||||
|
||||
result =
|
||||
PdfUtils.convertFromPdf(
|
||||
pdfDocumentFactory,
|
||||
newPdfBytes,
|
||||
"webp".equalsIgnoreCase(imageFormat)
|
||||
? "png"
|
||||
@ -245,7 +245,7 @@ public class ConvertImgPDFController {
|
||||
*/
|
||||
private byte[] rearrangePdfPages(byte[] pdfBytes, String[] pageOrderArr) throws IOException {
|
||||
// Load the input PDF
|
||||
PDDocument document = Loader.loadPDF(pdfBytes);
|
||||
PDDocument document = pdfDocumentFactory.load(pdfBytes);
|
||||
int totalPages = document.getNumberOfPages();
|
||||
List<Integer> newPageOrder = GeneralUtils.parsePageList(pageOrderArr, totalPages, false);
|
||||
|
||||
|
@ -2,9 +2,9 @@ package stirling.software.SPDF.controller.api.converters;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
@ -21,6 +21,7 @@ import stirling.software.SPDF.model.api.PDFFile;
|
||||
import stirling.software.SPDF.model.api.converters.PdfToPresentationRequest;
|
||||
import stirling.software.SPDF.model.api.converters.PdfToTextOrRTFRequest;
|
||||
import stirling.software.SPDF.model.api.converters.PdfToWordRequest;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.PDFToFile;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@ -29,6 +30,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
@Tag(name = "Convert", description = "Convert APIs")
|
||||
public class ConvertPDFToOffice {
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
/**
 * Creates the controller and stores the injected document factory.
 *
 * @param pdfDocumentFactory factory kept in the {@code pdfDocumentFactory} field
 */
@Autowired
|
||||
public ConvertPDFToOffice(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation")
|
||||
@Operation(
|
||||
summary = "Convert PDF to Presentation format",
|
||||
@ -54,7 +62,7 @@ public class ConvertPDFToOffice {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
String outputFormat = request.getOutputFormat();
|
||||
if ("txt".equals(request.getOutputFormat())) {
|
||||
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) {
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
String text = stripper.getText(document);
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
|
@ -12,8 +12,8 @@ import java.util.zip.ZipOutputStream;
|
||||
|
||||
import org.apache.commons.csv.CSVFormat;
|
||||
import org.apache.commons.csv.QuoteMode;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.ContentDisposition;
|
||||
import org.springframework.http.HttpHeaders;
|
||||
import org.springframework.http.MediaType;
|
||||
@ -30,6 +30,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.api.PDFWithPageNums;
|
||||
import stirling.software.SPDF.pdf.FlexibleCSVWriter;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
|
||||
import technology.tabula.ObjectExtractor;
|
||||
import technology.tabula.Page;
|
||||
@ -42,6 +43,13 @@ import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
||||
@Slf4j
|
||||
public class ExtractCSVController {
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
/**
 * Creates the controller and stores the injected document factory.
 *
 * @param pdfDocumentFactory factory kept in the {@code pdfDocumentFactory} field
 */
@Autowired
|
||||
public ExtractCSVController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
|
||||
@Operation(
|
||||
summary = "Extracts a CSV document from a PDF",
|
||||
@ -51,7 +59,7 @@ public class ExtractCSVController {
|
||||
String baseName = getBaseName(form.getFileInput().getOriginalFilename());
|
||||
List<CsvEntry> csvEntries = new ArrayList<>();
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(form.getFileInput().getBytes())) {
|
||||
List<Integer> pages = form.getPageNumbersList(document, true);
|
||||
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
|
||||
CSVFormat format =
|
||||
|
@ -2,10 +2,10 @@ package stirling.software.SPDF.controller.api.filters;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
@ -23,6 +23,7 @@ import stirling.software.SPDF.model.api.filter.ContainsTextRequest;
|
||||
import stirling.software.SPDF.model.api.filter.FileSizeRequest;
|
||||
import stirling.software.SPDF.model.api.filter.PageRotationRequest;
|
||||
import stirling.software.SPDF.model.api.filter.PageSizeRequest;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.PdfUtils;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@ -31,6 +32,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
@Tag(name = "Filter", description = "Filter APIs")
|
||||
public class FilterController {
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
/**
 * Creates the controller and stores the injected document factory.
 *
 * @param pdfDocumentFactory factory kept in the {@code pdfDocumentFactory} field
 */
@Autowired
|
||||
public FilterController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/filter-contains-text")
|
||||
@Operation(
|
||||
summary = "Checks if a PDF contains set text, returns true if does",
|
||||
@ -41,7 +49,7 @@ public class FilterController {
|
||||
String text = request.getText();
|
||||
String pageNumber = request.getPageNumbers();
|
||||
|
||||
PDDocument pdfDocument = Loader.loadPDF(inputFile.getBytes());
|
||||
PDDocument pdfDocument = pdfDocumentFactory.load(inputFile.getBytes());
|
||||
if (PdfUtils.hasText(pdfDocument, pageNumber, text))
|
||||
return WebResponseUtils.pdfDocToWebResponse(
|
||||
pdfDocument, Filenames.toSimpleFileName(inputFile.getOriginalFilename()));
|
||||
@ -58,7 +66,7 @@ public class FilterController {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
String pageNumber = request.getPageNumbers();
|
||||
|
||||
PDDocument pdfDocument = Loader.loadPDF(inputFile.getBytes());
|
||||
PDDocument pdfDocument = pdfDocumentFactory.load(inputFile.getBytes());
|
||||
if (PdfUtils.hasImages(pdfDocument, pageNumber))
|
||||
return WebResponseUtils.pdfDocToWebResponse(
|
||||
pdfDocument, Filenames.toSimpleFileName(inputFile.getOriginalFilename()));
|
||||
@ -75,7 +83,7 @@ public class FilterController {
|
||||
String pageCount = request.getPageCount();
|
||||
String comparator = request.getComparator();
|
||||
// Load the PDF
|
||||
PDDocument document = Loader.loadPDF(inputFile.getBytes());
|
||||
PDDocument document = pdfDocumentFactory.load(inputFile.getBytes());
|
||||
int actualPageCount = document.getNumberOfPages();
|
||||
|
||||
boolean valid = false;
|
||||
@ -109,7 +117,7 @@ public class FilterController {
|
||||
String comparator = request.getComparator();
|
||||
|
||||
// Load the PDF
|
||||
PDDocument document = Loader.loadPDF(inputFile.getBytes());
|
||||
PDDocument document = pdfDocumentFactory.load(inputFile.getBytes());
|
||||
|
||||
PDPage firstPage = document.getPage(0);
|
||||
PDRectangle actualPageSize = firstPage.getMediaBox();
|
||||
@ -185,7 +193,7 @@ public class FilterController {
|
||||
String comparator = request.getComparator();
|
||||
|
||||
// Load the PDF
|
||||
PDDocument document = Loader.loadPDF(inputFile.getBytes());
|
||||
PDDocument document = pdfDocumentFactory.load(inputFile.getBytes());
|
||||
|
||||
// Get the rotation of the first page
|
||||
PDPage firstPage = document.getPage(0);
|
||||
|
@ -5,10 +5,10 @@ import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
@ -23,6 +23,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.api.misc.ExtractHeaderRequest;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@RestController
|
||||
@ -34,6 +35,13 @@ public class AutoRenameController {
|
||||
private static final float TITLE_FONT_SIZE_THRESHOLD = 20.0f;
|
||||
private static final int LINE_LIMIT = 200;
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
/**
 * Creates the controller and stores the injected document factory.
 *
 * @param pdfDocumentFactory factory kept in the {@code pdfDocumentFactory} field
 */
@Autowired
|
||||
public AutoRenameController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/auto-rename")
|
||||
@Operation(
|
||||
summary = "Extract header from PDF file",
|
||||
@ -44,7 +52,7 @@ public class AutoRenameController {
|
||||
MultipartFile file = request.getFileInput();
|
||||
Boolean useFirstTextAsFallback = request.isUseFirstTextAsFallback();
|
||||
|
||||
PDDocument document = Loader.loadPDF(file.getBytes());
|
||||
PDDocument document = pdfDocumentFactory.load(file.getBytes());
|
||||
PDFTextStripper reader =
|
||||
new PDFTextStripper() {
|
||||
List<LineInfo> lineInfos = new ArrayList<>();
|
||||
|
@ -8,7 +8,6 @@ import java.util.List;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageTree;
|
||||
@ -85,7 +84,7 @@ public class BlankPageController {
|
||||
int threshold = request.getThreshold();
|
||||
float whitePercent = request.getWhitePercent();
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) {
|
||||
PDPageTree pages = document.getDocumentCatalog().getPages();
|
||||
PDFTextStripper textStripper = new PDFTextStripper();
|
||||
|
||||
|
@ -18,7 +18,6 @@ import javax.imageio.ImageWriter;
|
||||
import javax.imageio.plugins.jpeg.JPEGImageWriteParam;
|
||||
import javax.imageio.stream.ImageOutputStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
@ -59,7 +58,8 @@ public class CompressController {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
private void compressImagesInPDF(Path pdfFile, double scaleFactor, float jpegQuality) throws Exception {
|
||||
private void compressImagesInPDF(Path pdfFile, double scaleFactor, float jpegQuality)
|
||||
throws Exception {
|
||||
byte[] fileBytes = Files.readAllBytes(pdfFile);
|
||||
long originalFileSize = fileBytes.length;
|
||||
log.info(
|
||||
@ -71,7 +71,7 @@ public class CompressController {
|
||||
// Track processed images to avoid recompression
|
||||
Set<String> processedImages = new HashSet<>();
|
||||
|
||||
try (PDDocument doc = Loader.loadPDF(fileBytes)) {
|
||||
try (PDDocument doc = pdfDocumentFactory.load(fileBytes)) {
|
||||
int totalImages = 0;
|
||||
int compressedImages = 0;
|
||||
int skippedImages = 0;
|
||||
@ -204,10 +204,12 @@ public class CompressController {
|
||||
// Choose appropriate format and compression
|
||||
String format = bufferedImage.getColorModel().hasAlpha() ? "png" : "jpeg";
|
||||
|
||||
// First get the actual size of the original image by encoding it to the chosen format
|
||||
// First get the actual size of the original image by encoding it to the chosen
|
||||
// format
|
||||
ByteArrayOutputStream originalImageStream = new ByteArrayOutputStream();
|
||||
if (format.equals("jpeg")) {
|
||||
// Get the best available JPEG writer (prioritizes TwelveMonkeys if available)
|
||||
// Get the best available JPEG writer (prioritizes TwelveMonkeys if
|
||||
// available)
|
||||
Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("jpeg");
|
||||
ImageWriter writer = null;
|
||||
|
||||
@ -472,8 +474,7 @@ public class CompressController {
|
||||
double qpdfReduction = 100.0 - ((postQpdfSize * 100.0) / preQpdfSize);
|
||||
log.info(
|
||||
"Post-QPDF file size: {} (reduced by {:.1f}%)",
|
||||
GeneralUtils.formatBytes(postQpdfSize),
|
||||
qpdfReduction);
|
||||
GeneralUtils.formatBytes(postQpdfSize), qpdfReduction);
|
||||
|
||||
} else {
|
||||
tempOutputFile = tempInputFile;
|
||||
|
@ -0,0 +1,145 @@
|
||||
package stirling.software.SPDF.controller.api.misc;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;
|
||||
|
||||
import org.apache.pdfbox.cos.*;
|
||||
import org.apache.pdfbox.io.IOUtils;
|
||||
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.api.PDFFile;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/misc")
|
||||
@Slf4j
|
||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||
public class DecompressPdfController {
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@Autowired
|
||||
public DecompressPdfController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(value = "/decompress-pdf", consumes = "multipart/form-data")
|
||||
@Operation(
|
||||
summary = "Decompress PDF streams",
|
||||
description = "Fully decompresses all PDF streams including text content")
|
||||
public ResponseEntity<byte[]> decompressPdf(@ModelAttribute PDFFile request)
|
||||
throws IOException {
|
||||
|
||||
MultipartFile file = request.getFileInput();
|
||||
|
||||
try (PDDocument document = pdfDocumentFactory.load(file.getBytes())) {
|
||||
// Process all objects in document
|
||||
processAllObjects(document);
|
||||
|
||||
// Save with explicit no compression
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
document.save(baos, CompressParameters.NO_COMPRESSION);
|
||||
|
||||
String outputFilename =
|
||||
file.getOriginalFilename().replaceFirst("\\.(?=[^.]+$)", "_decompressed.");
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
baos.toByteArray(), outputFilename, MediaType.APPLICATION_PDF);
|
||||
}
|
||||
}
|
||||
|
||||
private void processAllObjects(PDDocument document) {
|
||||
Set<COSBase> processed = new HashSet<>();
|
||||
COSDocument cosDoc = document.getDocument();
|
||||
|
||||
// Process all objects in the document
|
||||
for (COSObjectKey key : cosDoc.getXrefTable().keySet()) {
|
||||
COSObject obj = cosDoc.getObjectFromPool(key);
|
||||
processObject(obj, processed);
|
||||
}
|
||||
}
|
||||
|
||||
private void processObject(COSBase obj, Set<COSBase> processed) {
|
||||
// Skip null objects or already processed objects to avoid infinite recursion
|
||||
if (obj == null || processed.contains(obj)) return;
|
||||
processed.add(obj);
|
||||
|
||||
if (obj instanceof COSObject cosObj) {
|
||||
processObject(cosObj.getObject(), processed);
|
||||
} else if (obj instanceof COSDictionary dict) {
|
||||
processDictionary(dict, processed);
|
||||
} else if (obj instanceof COSArray array) {
|
||||
processArray(array, processed);
|
||||
}
|
||||
}
|
||||
|
||||
private void processDictionary(COSDictionary dict, Set<COSBase> processed) {
|
||||
// Process all dictionary entries
|
||||
for (COSName key : dict.keySet()) {
|
||||
processObject(dict.getDictionaryObject(key), processed);
|
||||
}
|
||||
|
||||
// If this is a stream, decompress it
|
||||
if (dict instanceof COSStream stream) {
|
||||
decompressStream(stream);
|
||||
}
|
||||
}
|
||||
|
||||
private void processArray(COSArray array, Set<COSBase> processed) {
|
||||
// Process all array elements
|
||||
for (int i = 0; i < array.size(); i++) {
|
||||
processObject(array.get(i), processed);
|
||||
}
|
||||
}
|
||||
|
||||
private void decompressStream(COSStream stream) {
|
||||
try {
|
||||
log.debug("Processing stream: {}", stream);
|
||||
|
||||
// Only remove filter information if it exists
|
||||
if (stream.containsKey(COSName.FILTER)
|
||||
|| stream.containsKey(COSName.DECODE_PARMS)
|
||||
|| stream.containsKey(COSName.D)) {
|
||||
|
||||
// Read the decompressed content first
|
||||
byte[] decompressedBytes;
|
||||
try (COSInputStream is = stream.createInputStream()) {
|
||||
decompressedBytes = IOUtils.toByteArray(is);
|
||||
}
|
||||
|
||||
// Now remove filter information
|
||||
stream.removeItem(COSName.FILTER);
|
||||
stream.removeItem(COSName.DECODE_PARMS);
|
||||
stream.removeItem(COSName.D);
|
||||
|
||||
// Write the raw content back
|
||||
try (OutputStream out = stream.createRawOutputStream()) {
|
||||
out.write(decompressedBytes);
|
||||
}
|
||||
|
||||
// Set the Length to reflect the new stream size
|
||||
stream.setInt(COSName.LENGTH, decompressedBytes.length);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("Error decompressing stream", e);
|
||||
// Continue processing other streams even if this one fails
|
||||
}
|
||||
}
|
||||
}
|
@ -14,9 +14,9 @@ import java.util.zip.ZipOutputStream;
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
@ -32,6 +32,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.api.misc.ExtractImageScansRequest;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.CheckProgramInstall;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
|
||||
@ -45,6 +46,13 @@ public class ExtractImageScansController {
|
||||
|
||||
private static final String REPLACEFIRST = "[.][^.]+$";
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
/**
 * Creates the controller and stores the injected document factory.
 *
 * @param pdfDocumentFactory factory kept in the {@code pdfDocumentFactory} field
 */
@Autowired
|
||||
public ExtractImageScansController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/extract-image-scans")
|
||||
@Operation(
|
||||
summary = "Extract image scans from an input file",
|
||||
@ -87,7 +95,8 @@ public class ExtractImageScansController {
|
||||
// Check if input file is a PDF
|
||||
if ("pdf".equalsIgnoreCase(extension)) {
|
||||
// Load PDF document
|
||||
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
|
||||
try (PDDocument document =
|
||||
pdfDocumentFactory.load(form.getFileInput().getBytes())) {
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
pdfRenderer.setSubsamplingAllowed(true);
|
||||
int pageCount = document.getNumberOfPages();
|
||||
|
@ -20,11 +20,11 @@ import java.util.zip.ZipOutputStream;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
@ -40,6 +40,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.api.PDFExtractImagesRequest;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.ImageProcessingUtils;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@ -49,6 +50,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||
public class ExtractImagesController {
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
/**
 * Creates the controller and stores the injected document factory.
 *
 * @param pdfDocumentFactory factory kept in the {@code pdfDocumentFactory} field
 */
@Autowired
|
||||
public ExtractImagesController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/extract-images")
|
||||
@Operation(
|
||||
summary = "Extract images from a PDF file",
|
||||
@ -59,7 +67,7 @@ public class ExtractImagesController {
|
||||
MultipartFile file = request.getFileInput();
|
||||
String format = request.getFormat();
|
||||
boolean allowDuplicates = request.isAllowDuplicates();
|
||||
PDDocument document = Loader.loadPDF(file.getBytes());
|
||||
PDDocument document = pdfDocumentFactory.load(file.getBytes());
|
||||
|
||||
// Determine if multithreading should be used based on PDF size or number of pages
|
||||
boolean useMultithreading = shouldUseMultithreading(file, document);
|
||||
|
@ -3,7 +3,6 @@ package stirling.software.SPDF.controller.api.misc;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
@ -51,7 +50,7 @@ public class FlattenController {
|
||||
public ResponseEntity<byte[]> flatten(@ModelAttribute FlattenRequest request) throws Exception {
|
||||
MultipartFile file = request.getFileInput();
|
||||
|
||||
PDDocument document = Loader.loadPDF(file.getBytes());
|
||||
PDDocument document = pdfDocumentFactory.load(file.getBytes());
|
||||
Boolean flattenOnlyForms = request.getFlattenOnlyForms();
|
||||
|
||||
if (Boolean.TRUE.equals(flattenOnlyForms)) {
|
||||
|
@ -7,10 +7,10 @@ import java.util.Calendar;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.WebDataBinder;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
@ -23,6 +23,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.api.misc.MetadataRequest;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
import stirling.software.SPDF.utils.propertyeditor.StringToMapPropertyEditor;
|
||||
|
||||
@ -32,6 +33,13 @@ import stirling.software.SPDF.utils.propertyeditor.StringToMapPropertyEditor;
|
||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||
public class MetadataController {
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@Autowired
|
||||
public MetadataController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
private String checkUndefined(String entry) {
|
||||
// Check if the string is "undefined"
|
||||
if ("undefined".equals(entry)) {
|
||||
@ -76,7 +84,7 @@ public class MetadataController {
|
||||
allRequestParams = new java.util.HashMap<String, String>();
|
||||
}
|
||||
// Load the PDF file into a PDDocument
|
||||
PDDocument document = Loader.loadPDF(pdfFile.getBytes());
|
||||
PDDocument document = pdfDocumentFactory.load(pdfFile.getBytes());
|
||||
|
||||
// Get the document information from the PDF
|
||||
PDDocumentInformation info = document.getDocumentInformation();
|
||||
|
@ -73,17 +73,16 @@ public class PageNumbersController {
|
||||
case "x-large":
|
||||
marginFactor = 0.075f;
|
||||
break;
|
||||
|
||||
default:
|
||||
marginFactor = 0.035f;
|
||||
break;
|
||||
}
|
||||
|
||||
float fontSize = font_size;
|
||||
if (pagesToNumber == null || pagesToNumber.length() == 0) {
|
||||
if (pagesToNumber == null || pagesToNumber.isEmpty()) {
|
||||
pagesToNumber = "all";
|
||||
}
|
||||
if (customText == null || customText.length() == 0) {
|
||||
if (customText == null || customText.isEmpty()) {
|
||||
customText = "{n}";
|
||||
}
|
||||
List<Integer> pagesToNumberList =
|
||||
@ -94,63 +93,69 @@ public class PageNumbersController {
|
||||
PDRectangle pageSize = page.getMediaBox();
|
||||
|
||||
String text =
|
||||
customText != null
|
||||
? customText
|
||||
customText
|
||||
.replace("{n}", String.valueOf(pageNumber))
|
||||
.replace("{total}", String.valueOf(document.getNumberOfPages()))
|
||||
.replace(
|
||||
"{filename}",
|
||||
Filenames.toSimpleFileName(file.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", ""))
|
||||
: String.valueOf(pageNumber);
|
||||
.replaceFirst("[.][^.]+$", ""));
|
||||
|
||||
PDType1Font currentFont =
|
||||
switch (font_type.toLowerCase()) {
|
||||
case "courier" -> new PDType1Font(Standard14Fonts.FontName.COURIER);
|
||||
case "times" -> new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN);
|
||||
default -> new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||
};
|
||||
|
||||
float x, y;
|
||||
|
||||
if (position == 5) {
|
||||
// Calculate text width and font metrics
|
||||
float textWidth = currentFont.getStringWidth(text) / 1000 * fontSize;
|
||||
|
||||
float ascent = currentFont.getFontDescriptor().getAscent() / 1000 * fontSize;
|
||||
float descent = currentFont.getFontDescriptor().getDescent() / 1000 * fontSize;
|
||||
|
||||
float centerX = pageSize.getLowerLeftX() + (pageSize.getWidth() / 2);
|
||||
float centerY = pageSize.getLowerLeftY() + (pageSize.getHeight() / 2);
|
||||
|
||||
x = centerX - (textWidth / 2);
|
||||
y = centerY - (ascent + descent) / 2;
|
||||
} else {
|
||||
int xGroup = (position - 1) % 3;
|
||||
int yGroup = 2 - (position - 1) / 3;
|
||||
|
||||
x =
|
||||
switch (xGroup) {
|
||||
case 0: // left
|
||||
x = pageSize.getLowerLeftX() + marginFactor * pageSize.getWidth();
|
||||
break;
|
||||
case 1: // center
|
||||
x = pageSize.getLowerLeftX() + (pageSize.getWidth() / 2);
|
||||
break;
|
||||
default: // right
|
||||
x = pageSize.getUpperRightX() - marginFactor * pageSize.getWidth();
|
||||
break;
|
||||
}
|
||||
case 0 ->
|
||||
pageSize.getLowerLeftX()
|
||||
+ marginFactor * pageSize.getWidth(); // left
|
||||
case 1 ->
|
||||
pageSize.getLowerLeftX() + (pageSize.getWidth() / 2); // center
|
||||
default ->
|
||||
pageSize.getUpperRightX()
|
||||
- marginFactor * pageSize.getWidth(); // right
|
||||
};
|
||||
|
||||
y =
|
||||
switch (yGroup) {
|
||||
case 0: // bottom
|
||||
y = pageSize.getLowerLeftY() + marginFactor * pageSize.getHeight();
|
||||
break;
|
||||
case 1: // middle
|
||||
y = pageSize.getLowerLeftY() + (pageSize.getHeight() / 2);
|
||||
break;
|
||||
default: // top
|
||||
y = pageSize.getUpperRightY() - marginFactor * pageSize.getHeight();
|
||||
break;
|
||||
case 0 ->
|
||||
pageSize.getLowerLeftY()
|
||||
+ marginFactor * pageSize.getHeight(); // bottom
|
||||
case 1 ->
|
||||
pageSize.getLowerLeftY() + (pageSize.getHeight() / 2); // middle
|
||||
default ->
|
||||
pageSize.getUpperRightY()
|
||||
- marginFactor * pageSize.getHeight(); // top
|
||||
};
|
||||
}
|
||||
|
||||
PDPageContentStream contentStream =
|
||||
new PDPageContentStream(
|
||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
|
||||
contentStream.beginText();
|
||||
switch (font_type.toLowerCase()) {
|
||||
case "helvetica":
|
||||
contentStream.setFont(
|
||||
new PDType1Font(Standard14Fonts.FontName.HELVETICA), fontSize);
|
||||
break;
|
||||
case "courier":
|
||||
contentStream.setFont(
|
||||
new PDType1Font(Standard14Fonts.FontName.COURIER), fontSize);
|
||||
break;
|
||||
case "times":
|
||||
contentStream.setFont(
|
||||
new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN), fontSize);
|
||||
break;
|
||||
}
|
||||
contentStream.setFont(currentFont, fontSize);
|
||||
contentStream.newLineAtOffset(x, y);
|
||||
contentStream.showText(text);
|
||||
contentStream.endText();
|
||||
|
@ -3,10 +3,10 @@ package stirling.software.SPDF.controller.api.misc;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
|
||||
import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
@ -20,6 +20,7 @@ import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import stirling.software.SPDF.model.api.PDFFile;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@RestController
|
||||
@ -27,6 +28,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||
public class ShowJavascript {
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@Autowired
|
||||
public ShowJavascript(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/show-javascript")
|
||||
@Operation(
|
||||
summary = "Grabs all JS from a PDF and returns a single JS file with all code",
|
||||
@ -35,7 +43,7 @@ public class ShowJavascript {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
String script = "";
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(inputFile.getBytes())) {
|
||||
|
||||
if (document.getDocumentCatalog() != null
|
||||
&& document.getDocumentCatalog().getNames() != null) {
|
||||
|
@ -6,7 +6,6 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSInputStream;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
@ -44,6 +43,7 @@ import org.apache.xmpbox.XMPMetadata;
|
||||
import org.apache.xmpbox.xml.DomXmpParser;
|
||||
import org.apache.xmpbox.xml.XmpParsingException;
|
||||
import org.apache.xmpbox.xml.XmpSerializer;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
@ -62,6 +62,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.api.PDFFile;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@RestController
|
||||
@ -72,6 +73,13 @@ public class GetInfoOnPDF {
|
||||
|
||||
static ObjectMapper objectMapper = new ObjectMapper();
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@Autowired
|
||||
public GetInfoOnPDF(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
private static void addOutlinesToArray(PDOutlineItem outline, ArrayNode arrayNode) {
|
||||
if (outline == null) return;
|
||||
|
||||
@ -118,7 +126,7 @@ public class GetInfoOnPDF {
|
||||
@Operation(summary = "Summary here", description = "desc. Input:PDF Output:JSON Type:SISO")
|
||||
public ResponseEntity<byte[]> getPdfInfo(@ModelAttribute PDFFile request) throws IOException {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
try (PDDocument pdfBoxDoc = Loader.loadPDF(inputFile.getBytes()); ) {
|
||||
try (PDDocument pdfBoxDoc = pdfDocumentFactory.load(inputFile.getBytes()); ) {
|
||||
ObjectMapper objectMapper = new ObjectMapper();
|
||||
ObjectNode jsonOutput = objectMapper.createObjectNode();
|
||||
|
||||
|
@ -3,8 +3,11 @@ package stirling.software.SPDF.controller.api.security;
|
||||
import java.awt.*;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
@ -94,7 +97,10 @@ public class RedactController {
|
||||
private void redactAreas(
|
||||
List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
|
||||
throws IOException {
|
||||
Color redactColor = null;
|
||||
// Group redaction areas by page
|
||||
Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>();
|
||||
|
||||
// Process and validate each redaction area
|
||||
for (RedactionArea redactionArea : redactionAreas) {
|
||||
if (redactionArea.getPage() == null
|
||||
|| redactionArea.getPage() <= 0
|
||||
@ -102,12 +108,33 @@ public class RedactController {
|
||||
|| redactionArea.getHeight() <= 0.0D
|
||||
|| redactionArea.getWidth() == null
|
||||
|| redactionArea.getWidth() <= 0.0D) continue;
|
||||
PDPage page = allPages.get(redactionArea.getPage() - 1);
|
||||
|
||||
// Group by page number
|
||||
redactionsByPage
|
||||
.computeIfAbsent(redactionArea.getPage(), k -> new ArrayList<>())
|
||||
.add(redactionArea);
|
||||
}
|
||||
|
||||
// Process each page only once
|
||||
for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) {
|
||||
Integer pageNumber = entry.getKey();
|
||||
List<RedactionArea> areasForPage = entry.getValue();
|
||||
|
||||
if (pageNumber > allPages.getCount()) {
|
||||
continue; // Skip if page number is out of bounds
|
||||
}
|
||||
|
||||
PDPage page = allPages.get(pageNumber - 1);
|
||||
PDRectangle box = page.getBBox();
|
||||
|
||||
// Create only one content stream per page
|
||||
PDPageContentStream contentStream =
|
||||
new PDPageContentStream(
|
||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
|
||||
redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK);
|
||||
|
||||
// Process all redactions for this page
|
||||
for (RedactionArea redactionArea : areasForPage) {
|
||||
Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK);
|
||||
contentStream.setNonStrokingColor(redactColor);
|
||||
|
||||
float x = redactionArea.getX().floatValue();
|
||||
@ -115,10 +142,10 @@ public class RedactController {
|
||||
float width = redactionArea.getWidth().floatValue();
|
||||
float height = redactionArea.getHeight().floatValue();
|
||||
|
||||
PDRectangle box = page.getBBox();
|
||||
|
||||
contentStream.addRect(x, box.getHeight() - y - height, width, height);
|
||||
contentStream.fill();
|
||||
}
|
||||
|
||||
contentStream.close();
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,8 @@
|
||||
package stirling.software.SPDF.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class PDFText {
|
||||
private final int pageIndex;
|
||||
private final float x1;
|
||||
@ -7,37 +10,4 @@ public class PDFText {
|
||||
private final float x2;
|
||||
private final float y2;
|
||||
private final String text;
|
||||
|
||||
public PDFText(int pageIndex, float x1, float y1, float x2, float y2, String text) {
|
||||
this.pageIndex = pageIndex;
|
||||
this.x1 = x1;
|
||||
this.y1 = y1;
|
||||
this.x2 = x2;
|
||||
this.y2 = y2;
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
public int getPageIndex() {
|
||||
return pageIndex;
|
||||
}
|
||||
|
||||
public float getX1() {
|
||||
return x1;
|
||||
}
|
||||
|
||||
public float getY1() {
|
||||
return y1;
|
||||
}
|
||||
|
||||
public float getX2() {
|
||||
return x2;
|
||||
}
|
||||
|
||||
public float getY2() {
|
||||
return y2;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
return text;
|
||||
}
|
||||
}
|
||||
|
@ -4,142 +4,355 @@ import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.io.IOUtils;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
||||
import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction;
|
||||
import org.apache.pdfbox.io.ScratchFile;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.PdfMetadata;
|
||||
import stirling.software.SPDF.model.api.PDFFile;
|
||||
|
||||
/**
|
||||
* Adaptive PDF document factory that optimizes memory usage based on file size and available system
|
||||
* resources.
|
||||
*/
|
||||
@Component
|
||||
@Slf4j
|
||||
public class CustomPDDocumentFactory {
|
||||
|
||||
private final PdfMetadataService pdfMetadataService;
|
||||
|
||||
@Autowired
|
||||
// Memory thresholds and limits
|
||||
|
||||
private static final long SMALL_FILE_THRESHOLD = 10 * 1024 * 1024; // 10 MB
|
||||
// Files smaller than this threshold are loaded entirely in memory for better performance.
|
||||
// These files use IOUtils.createMemoryOnlyStreamCache() which keeps all document data in RAM.
|
||||
// No temp files are created for document data, reducing I/O operations but consuming more
|
||||
// memory.
|
||||
|
||||
private static final long LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50 MB
|
||||
// Files between SMALL and LARGE thresholds use file-based caching with ScratchFile,
|
||||
// but are loaded directly from byte arrays if provided that way.
|
||||
// When loading from byte arrays, once size exceeds this threshold, bytes are first
|
||||
// written to temp files before loading to reduce memory pressure.
|
||||
|
||||
private static final long LARGE_FILE_USAGE = 10 * 1024 * 1024;
|
||||
|
||||
private static final long EXTREMELY_LARGE_THRESHOLD = 100 * 1024 * 1024; // 100 MB
|
||||
// Files exceeding this threshold use specialized loading with RandomAccessReadBufferedFile
|
||||
// which provides buffered access to the file without loading the entire content at once.
|
||||
// These files are always processed using file-based caching with minimal memory footprint,
|
||||
// trading some performance for significantly reduced memory usage.
|
||||
// For extremely large PDFs, this prevents OutOfMemoryErrors at the cost of being more I/O
|
||||
// bound.
|
||||
|
||||
private static final double MIN_FREE_MEMORY_PERCENTAGE = 30.0; // 30%
|
||||
private static final long MIN_FREE_MEMORY_BYTES = 4L * 1024 * 1024 * 1024; // 4 GB
|
||||
|
||||
// Counter for tracking temporary resources
|
||||
private static final AtomicLong tempCounter = new AtomicLong(0);
|
||||
|
||||
public CustomPDDocumentFactory(PdfMetadataService pdfMetadataService) {
|
||||
this.pdfMetadataService = pdfMetadataService;
|
||||
}
|
||||
|
||||
public PDDocument createNewDocument() throws IOException {
|
||||
PDDocument document = new PDDocument();
|
||||
pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true);
|
||||
/**
|
||||
* Main entry point for loading a PDF document from a file. Automatically selects the most
|
||||
* appropriate loading strategy.
|
||||
*/
|
||||
public PDDocument load(File file) throws IOException {
|
||||
if (file == null) {
|
||||
throw new IllegalArgumentException("File cannot be null");
|
||||
}
|
||||
|
||||
long fileSize = file.length();
|
||||
log.info("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024));
|
||||
|
||||
return loadAdaptively(file, fileSize);
|
||||
}
|
||||
|
||||
/** Load a PDF from byte array with automatic optimization. */
|
||||
public PDDocument load(byte[] input) throws IOException {
|
||||
if (input == null) {
|
||||
throw new IllegalArgumentException("Input bytes cannot be null");
|
||||
}
|
||||
|
||||
long dataSize = input.length;
|
||||
log.info("Loading PDF from byte array, size: {}MB", dataSize / (1024 * 1024));
|
||||
|
||||
return loadAdaptively(input, dataSize);
|
||||
}
|
||||
|
||||
/** Load a PDF from InputStream with automatic optimization. */
|
||||
public PDDocument load(InputStream input) throws IOException {
|
||||
if (input == null) {
|
||||
throw new IllegalArgumentException("InputStream cannot be null");
|
||||
}
|
||||
|
||||
// Since we don't know the size upfront, buffer to a temp file
|
||||
Path tempFile = createTempFile("pdf-stream-");
|
||||
try {
|
||||
Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
return loadAdaptively(tempFile.toFile(), Files.size(tempFile));
|
||||
} catch (IOException e) {
|
||||
cleanupFile(tempFile);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
private PDDocument loadAdaptively(Object source, long contentSize) throws IOException {
|
||||
long maxMemory = Runtime.getRuntime().maxMemory();
|
||||
long freeMemory = Runtime.getRuntime().freeMemory();
|
||||
long totalMemory = Runtime.getRuntime().totalMemory();
|
||||
long usedMemory = totalMemory - freeMemory;
|
||||
|
||||
// Calculate percentage of free memory
|
||||
double freeMemoryPercent = (double) (maxMemory - usedMemory) / maxMemory * 100;
|
||||
long actualFreeMemory = maxMemory - usedMemory;
|
||||
|
||||
// Log memory status
|
||||
log.info(
|
||||
"Memory status - Free: {}MB ({}%), Used: {}MB, Max: {}MB",
|
||||
actualFreeMemory / (1024 * 1024),
|
||||
String.format("%.2f", freeMemoryPercent),
|
||||
usedMemory / (1024 * 1024),
|
||||
maxMemory / (1024 * 1024));
|
||||
|
||||
// Determine caching strategy based on both file size and available memory
|
||||
StreamCacheCreateFunction cacheFunction;
|
||||
|
||||
// If free memory is critically low, always use file-based caching
|
||||
// In loadAdaptively method, replace current caching strategy decision with:
|
||||
if (freeMemoryPercent < MIN_FREE_MEMORY_PERCENTAGE
|
||||
|| actualFreeMemory < MIN_FREE_MEMORY_BYTES) {
|
||||
log.info(
|
||||
"Low memory detected ({}%), forcing file-based cache",
|
||||
String.format("%.2f", freeMemoryPercent));
|
||||
cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
|
||||
} else if (contentSize < SMALL_FILE_THRESHOLD) {
|
||||
log.info("Using memory-only cache for small document ({}KB)", contentSize / 1024);
|
||||
cacheFunction = IOUtils.createMemoryOnlyStreamCache();
|
||||
} else if (contentSize < LARGE_FILE_THRESHOLD) {
|
||||
// For medium files (10-50MB), use a mixed approach
|
||||
log.info(
|
||||
"Using mixed memory/file cache for medium document ({}MB)",
|
||||
contentSize / (1024 * 1024));
|
||||
cacheFunction =
|
||||
createScratchFileCacheFunction(MemoryUsageSetting.setupMixed(LARGE_FILE_USAGE));
|
||||
} else {
|
||||
log.info("Using file-based cache for large document");
|
||||
cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
|
||||
}
|
||||
|
||||
PDDocument document;
|
||||
if (source instanceof File file) {
|
||||
document = loadFromFile(file, contentSize, cacheFunction);
|
||||
} else if (source instanceof byte[] bytes) {
|
||||
document = loadFromBytes(bytes, contentSize, cacheFunction);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
|
||||
}
|
||||
|
||||
postProcessDocument(document);
|
||||
return document;
|
||||
}
|
||||
|
||||
private StreamCacheCreateFunction createScratchFileCacheFunction(MemoryUsageSetting settings) {
|
||||
return () -> {
|
||||
try {
|
||||
return new ScratchFile(settings);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("ScratchFile initialization failed", e);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private void postProcessDocument(PDDocument doc) throws IOException {
|
||||
pdfMetadataService.setDefaultMetadata(doc);
|
||||
removePassword(doc);
|
||||
}
|
||||
|
||||
private PDDocument loadFromFile(File file, long size, StreamCacheCreateFunction cache)
|
||||
throws IOException {
|
||||
if (size >= EXTREMELY_LARGE_THRESHOLD) {
|
||||
log.info("Loading extremely large file via buffered access");
|
||||
return Loader.loadPDF(new RandomAccessReadBufferedFile(file), "", null, null, cache);
|
||||
}
|
||||
return Loader.loadPDF(file, "", null, null, cache);
|
||||
}
|
||||
|
||||
private PDDocument loadFromBytes(byte[] bytes, long size, StreamCacheCreateFunction cache)
|
||||
throws IOException {
|
||||
if (size >= SMALL_FILE_THRESHOLD) {
|
||||
log.info("Writing large byte array to temp file");
|
||||
Path tempFile = createTempFile("pdf-bytes-");
|
||||
try {
|
||||
Files.write(tempFile, bytes);
|
||||
return Loader.loadPDF(tempFile.toFile(), "", null, null, cache);
|
||||
} finally {
|
||||
cleanupFile(tempFile);
|
||||
}
|
||||
}
|
||||
return Loader.loadPDF(bytes, "", null, null, cache);
|
||||
}
|
||||
|
||||
public PDDocument createNewDocument(MemoryUsageSetting settings) throws IOException {
|
||||
PDDocument doc = new PDDocument(createScratchFileCacheFunction(settings));
|
||||
pdfMetadataService.setDefaultMetadata(doc);
|
||||
return doc;
|
||||
}
|
||||
|
||||
public PDDocument createNewDocument() throws IOException {
|
||||
return createNewDocument(MemoryUsageSetting.setupTempFileOnly());
|
||||
}
|
||||
|
||||
public byte[] saveToBytes(PDDocument document) throws IOException {
|
||||
if (document.getNumberOfPages() < 10) { // Simple heuristic
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
document.save(baos);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
} else {
|
||||
Path tempFile = createTempFile("pdf-save-");
|
||||
try {
|
||||
document.save(tempFile.toFile());
|
||||
return Files.readAllBytes(tempFile);
|
||||
} finally {
|
||||
cleanupFile(tempFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Improved password handling
|
||||
private void removePassword(PDDocument document) throws IOException {
|
||||
if (document.isEncrypted()) {
|
||||
try {
|
||||
document.setAllSecurityToBeRemoved(true);
|
||||
} catch (Exception e) {
|
||||
log.error("Decryption failed", e);
|
||||
throw new IOException("PDF decryption failed", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Temp file handling with enhanced logging
|
||||
private Path createTempFile(String prefix) throws IOException {
|
||||
Path file = Files.createTempFile(prefix + tempCounter.incrementAndGet() + "-", ".tmp");
|
||||
log.info("Created temp file: {}", file);
|
||||
return file;
|
||||
}
|
||||
|
||||
/** Create a uniquely named temporary directory */
|
||||
private Path createTempDirectory(String prefix) throws IOException {
|
||||
return Files.createTempDirectory(prefix + tempCounter.incrementAndGet() + "-");
|
||||
}
|
||||
|
||||
/** Clean up a temporary file */
|
||||
private void cleanupFile(Path file) {
|
||||
try {
|
||||
if (Files.deleteIfExists(file)) {
|
||||
log.info("Deleted temp file: {}", file);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.info("Error deleting temp file {}", file, e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Create new document bytes based on an existing document */
|
||||
public byte[] createNewBytesBasedOnOldDocument(byte[] oldDocument) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(oldDocument);
|
||||
return createNewBytesBasedOnOldDocument(document);
|
||||
try (PDDocument document = load(oldDocument)) {
|
||||
return saveToBytes(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Create new document bytes based on an existing document file */
|
||||
public byte[] createNewBytesBasedOnOldDocument(File oldDocument) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(oldDocument);
|
||||
return createNewBytesBasedOnOldDocument(document);
|
||||
try (PDDocument document = load(oldDocument)) {
|
||||
return saveToBytes(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Create new document bytes based on an existing PDDocument */
|
||||
public byte[] createNewBytesBasedOnOldDocument(PDDocument oldDocument) throws IOException {
|
||||
pdfMetadataService.setMetadataToPdf(
|
||||
oldDocument, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
|
||||
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
oldDocument.save(baos);
|
||||
oldDocument.close();
|
||||
return baos.toByteArray();
|
||||
return saveToBytes(oldDocument);
|
||||
}
|
||||
|
||||
/** Create a new document based on an existing document bytes */
|
||||
public PDDocument createNewDocumentBasedOnOldDocument(byte[] oldDocument) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(oldDocument);
|
||||
try (PDDocument document = load(oldDocument)) {
|
||||
return createNewDocumentBasedOnOldDocument(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Create a new document based on an existing document file */
|
||||
public PDDocument createNewDocumentBasedOnOldDocument(File oldDocument) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(oldDocument);
|
||||
try (PDDocument document = load(oldDocument)) {
|
||||
return createNewDocumentBasedOnOldDocument(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Create a new document based on an existing PDDocument */
|
||||
public PDDocument createNewDocumentBasedOnOldDocument(PDDocument oldDocument)
|
||||
throws IOException {
|
||||
PDDocument document = new PDDocument();
|
||||
PDDocument document = createNewDocument();
|
||||
pdfMetadataService.setMetadataToPdf(
|
||||
document, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
|
||||
return document;
|
||||
}
|
||||
|
||||
/** Load document from a file and convert it to bytes */
|
||||
public byte[] loadToBytes(File file) throws IOException {
|
||||
PDDocument document = load(file);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
document.save(baos);
|
||||
// Close the document
|
||||
document.close();
|
||||
return baos.toByteArray();
|
||||
try (PDDocument document = load(file)) {
|
||||
return saveToBytes(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Load document from bytes and convert it back to bytes */
|
||||
public byte[] loadToBytes(byte[] bytes) throws IOException {
|
||||
PDDocument document = load(bytes);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
document.save(baos);
|
||||
// Close the document
|
||||
document.close();
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
// if loading from a file, assume the file has been made with Stirling-PDF
|
||||
public PDDocument load(File file) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(file);
|
||||
pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true);
|
||||
return document;
|
||||
}
|
||||
|
||||
public PDDocument load(InputStream input) throws IOException {
|
||||
return load(input.readAllBytes());
|
||||
}
|
||||
|
||||
public PDDocument load(byte[] input) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(input);
|
||||
pdfMetadataService.setDefaultMetadata(document);
|
||||
removezeropassword(document);
|
||||
return document;
|
||||
}
|
||||
|
||||
public PDDocument load(PDFFile pdfFile) throws IOException {
|
||||
return load(pdfFile.getFileInput());
|
||||
}
|
||||
|
||||
public PDDocument load(MultipartFile pdfFile) throws IOException {
|
||||
return load(pdfFile.getBytes());
|
||||
try (PDDocument document = load(bytes)) {
|
||||
return saveToBytes(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Load from a file path string */
|
||||
public PDDocument load(String path) throws IOException {
|
||||
return load(new File(path));
|
||||
}
|
||||
|
||||
/** Load from a PDFFile object */
|
||||
public PDDocument load(PDFFile pdfFile) throws IOException {
|
||||
return load(pdfFile.getFileInput());
|
||||
}
|
||||
|
||||
/** Load from a MultipartFile */
|
||||
public PDDocument load(MultipartFile pdfFile) throws IOException {
|
||||
return load(pdfFile.getBytes());
|
||||
}
|
||||
|
||||
/** Load with password from MultipartFile */
|
||||
public PDDocument load(MultipartFile fileInput, String password) throws IOException {
|
||||
return load(fileInput.getBytes(), password);
|
||||
}
|
||||
|
||||
/** Load with password from byte array */
|
||||
private PDDocument load(byte[] bytes, String password) throws IOException {
|
||||
// Since we don't have direct password support in the adaptive loader,
|
||||
// we'll need to use PDFBox's Loader directly
|
||||
PDDocument document = Loader.loadPDF(bytes, password);
|
||||
pdfMetadataService.setDefaultMetadata(document);
|
||||
return document;
|
||||
}
|
||||
|
||||
private PDDocument removezeropassword(PDDocument document) throws IOException {
|
||||
if (document.isEncrypted()) {
|
||||
try {
|
||||
log.info("Removing security from the source document");
|
||||
document.setAllSecurityToBeRemoved(true);
|
||||
} catch (Exception e) {
|
||||
log.warn("Cannot decrypt the pdf");
|
||||
}
|
||||
}
|
||||
return document;
|
||||
}
|
||||
|
||||
// Add other load methods as needed, following the same pattern
|
||||
}
|
||||
|
@ -14,7 +14,6 @@ import java.util.zip.ZipOutputStream;
|
||||
import javax.imageio.*;
|
||||
import javax.imageio.stream.ImageOutputStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
@ -128,6 +127,7 @@ public class PdfUtils {
|
||||
}
|
||||
|
||||
public static byte[] convertFromPdf(
|
||||
CustomPDDocumentFactory pdfDocumentFactory,
|
||||
byte[] inputStream,
|
||||
String imageType,
|
||||
ImageType colorType,
|
||||
@ -135,7 +135,7 @@ public class PdfUtils {
|
||||
int DPI,
|
||||
String filename)
|
||||
throws IOException, Exception {
|
||||
try (PDDocument document = Loader.loadPDF(inputStream)) {
|
||||
try (PDDocument document = pdfDocumentFactory.load(inputStream)) {
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
pdfRenderer.setSubsamplingAllowed(true);
|
||||
int pageCount = document.getNumberOfPages();
|
||||
|
Loading…
x
Reference in New Issue
Block a user