mirror of
https://github.com/Stirling-Tools/Stirling-PDF.git
synced 2025-06-10 17:45:02 +00:00

# Description of Changes

Please provide a summary of the changes, including:

- What was changed
- Why the change was made
- Any challenges encountered

Closes #(issue_number)

---

## Checklist

### General

- [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable)
- [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed)
- [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.

---------

Co-authored-by: a <a>
477 lines
19 KiB
Java
477 lines
19 KiB
Java
package stirling.software.SPDF.service;
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.nio.file.StandardCopyOption;
|
|
import java.util.concurrent.atomic.AtomicLong;
|
|
|
|
import org.apache.pdfbox.Loader;
|
|
import org.apache.pdfbox.examples.util.DeletingRandomAccessFile;
|
|
import org.apache.pdfbox.io.IOUtils;
|
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
|
import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction;
|
|
import org.apache.pdfbox.io.ScratchFile;
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.springframework.stereotype.Component;
|
|
import org.springframework.web.multipart.MultipartFile;
|
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
|
import stirling.software.SPDF.model.api.PDFFile;
|
|
|
|
/**
|
|
* Adaptive PDF document factory that optimizes memory usage based on file size and available system
|
|
* resources.
|
|
*/
|
|
@Component
|
|
@Slf4j
|
|
public class CustomPDFDocumentFactory {
|
|
|
|
private final PdfMetadataService pdfMetadataService;
|
|
|
|
// Memory thresholds and limits
|
|
|
|
private static final long SMALL_FILE_THRESHOLD = 10 * 1024 * 1024; // 10 MB
|
|
// Files smaller than this threshold are loaded entirely in memory for better performance.
|
|
// These files use IOUtils.createMemoryOnlyStreamCache() which keeps all document data in RAM.
|
|
// No temp files are created for document data, reducing I/O operations but consuming more
|
|
// memory.
|
|
|
|
private static final long LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50 MB
|
|
// Files between SMALL and LARGE thresholds use file-based caching with ScratchFile,
|
|
// but are loaded directly from byte arrays if provided that way.
|
|
// When loading from byte arrays, once size exceeds this threshold, bytes are first
|
|
// written to temp files before loading to reduce memory pressure.
|
|
|
|
private static final long LARGE_FILE_USAGE = 10 * 1024 * 1024;
|
|
|
|
private static final long EXTREMELY_LARGE_THRESHOLD = 100 * 1024 * 1024; // 100 MB
|
|
// Files exceeding this threshold use specialized loading with RandomAccessReadBufferedFile
|
|
// which provides buffered access to the file without loading the entire content at once.
|
|
// These files are always processed using file-based caching with minimal memory footprint,
|
|
// trading some performance for significantly reduced memory usage.
|
|
// For extremely large PDFs, this prevents OutOfMemoryErrors at the cost of being more I/O
|
|
// bound.
|
|
|
|
private static final double MIN_FREE_MEMORY_PERCENTAGE = 30.0; // 30%
|
|
private static final long MIN_FREE_MEMORY_BYTES = 4L * 1024 * 1024 * 1024; // 4 GB
|
|
|
|
// Counter for tracking temporary resources
|
|
private static final AtomicLong tempCounter = new AtomicLong(0);
|
|
|
|
public CustomPDFDocumentFactory(PdfMetadataService pdfMetadataService) {
|
|
this.pdfMetadataService = pdfMetadataService;
|
|
}
|
|
|
|
/**
|
|
* Main entry point for loading a PDF document from a file. Automatically selects the most
|
|
* appropriate loading strategy.
|
|
*/
|
|
public PDDocument load(File file) throws IOException {
|
|
return load(file, false);
|
|
}
|
|
|
|
/**
|
|
* Main entry point for loading a PDF document from a file with read-only option. Automatically
|
|
* selects the most appropriate loading strategy.
|
|
*/
|
|
public PDDocument load(File file, boolean readOnly) throws IOException {
|
|
if (file == null) {
|
|
throw new IllegalArgumentException("File cannot be null");
|
|
}
|
|
|
|
long fileSize = file.length();
|
|
log.debug("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024));
|
|
|
|
PDDocument doc = loadAdaptively(file, fileSize);
|
|
if (!readOnly) {
|
|
postProcessDocument(doc);
|
|
}
|
|
return doc;
|
|
}
|
|
|
|
/**
|
|
* Main entry point for loading a PDF document from a Path. Automatically selects the most
|
|
* appropriate loading strategy.
|
|
*/
|
|
public PDDocument load(Path path) throws IOException {
|
|
return load(path, false);
|
|
}
|
|
|
|
/**
|
|
* Main entry point for loading a PDF document from a Path with read-only option. Automatically
|
|
* selects the most appropriate loading strategy.
|
|
*/
|
|
public PDDocument load(Path path, boolean readOnly) throws IOException {
|
|
if (path == null) {
|
|
throw new IllegalArgumentException("File cannot be null");
|
|
}
|
|
|
|
long fileSize = Files.size(path);
|
|
log.debug("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024));
|
|
|
|
PDDocument doc = loadAdaptively(path.toFile(), fileSize);
|
|
if (!readOnly) {
|
|
postProcessDocument(doc);
|
|
}
|
|
return doc;
|
|
}
|
|
|
|
/** Load a PDF from byte array with automatic optimization. */
|
|
public PDDocument load(byte[] input) throws IOException {
|
|
return load(input, false);
|
|
}
|
|
|
|
/** Load a PDF from byte array with automatic optimization and read-only option. */
|
|
public PDDocument load(byte[] input, boolean readOnly) throws IOException {
|
|
if (input == null) {
|
|
throw new IllegalArgumentException("Input bytes cannot be null");
|
|
}
|
|
|
|
long dataSize = input.length;
|
|
log.debug("Loading PDF from byte array, size: {}MB", dataSize / (1024 * 1024));
|
|
|
|
PDDocument doc = loadAdaptively(input, dataSize);
|
|
if (!readOnly) {
|
|
postProcessDocument(doc);
|
|
}
|
|
return doc;
|
|
}
|
|
|
|
/** Load a PDF from InputStream with automatic optimization. */
|
|
public PDDocument load(InputStream input) throws IOException {
|
|
return load(input, false);
|
|
}
|
|
|
|
/** Load a PDF from InputStream with automatic optimization and read-only option. */
|
|
public PDDocument load(InputStream input, boolean readOnly) throws IOException {
|
|
if (input == null) {
|
|
throw new IllegalArgumentException("InputStream cannot be null");
|
|
}
|
|
|
|
// Since we don't know the size upfront, buffer to a temp file
|
|
Path tempFile = createTempFile("pdf-stream-");
|
|
|
|
Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
|
|
PDDocument doc = loadAdaptively(tempFile.toFile(), Files.size(tempFile));
|
|
if (!readOnly) {
|
|
postProcessDocument(doc);
|
|
}
|
|
return doc;
|
|
}
|
|
|
|
/** Load with password from InputStream */
|
|
public PDDocument load(InputStream input, String password) throws IOException {
|
|
return load(input, password, false);
|
|
}
|
|
|
|
/** Load with password from InputStream and read-only option */
|
|
public PDDocument load(InputStream input, String password, boolean readOnly)
|
|
throws IOException {
|
|
if (input == null) {
|
|
throw new IllegalArgumentException("InputStream cannot be null");
|
|
}
|
|
|
|
// Since we don't know the size upfront, buffer to a temp file
|
|
Path tempFile = createTempFile("pdf-stream-");
|
|
|
|
Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
|
|
PDDocument doc =
|
|
loadAdaptivelyWithPassword(tempFile.toFile(), Files.size(tempFile), password);
|
|
if (!readOnly) {
|
|
postProcessDocument(doc);
|
|
}
|
|
return doc;
|
|
}
|
|
|
|
/** Load from a file path string */
|
|
public PDDocument load(String path) throws IOException {
|
|
return load(path, false);
|
|
}
|
|
|
|
/** Load from a file path string with read-only option */
|
|
public PDDocument load(String path, boolean readOnly) throws IOException {
|
|
return load(new File(path), readOnly);
|
|
}
|
|
|
|
/** Load from a PDFFile object */
|
|
public PDDocument load(PDFFile pdfFile) throws IOException {
|
|
return load(pdfFile, false);
|
|
}
|
|
|
|
/** Load from a PDFFile object with read-only option */
|
|
public PDDocument load(PDFFile pdfFile, boolean readOnly) throws IOException {
|
|
return load(pdfFile.getFileInput(), readOnly);
|
|
}
|
|
|
|
/** Load from a MultipartFile */
|
|
public PDDocument load(MultipartFile pdfFile) throws IOException {
|
|
return load(pdfFile, false);
|
|
}
|
|
|
|
/** Load from a MultipartFile with read-only option */
|
|
public PDDocument load(MultipartFile pdfFile, boolean readOnly) throws IOException {
|
|
return load(pdfFile.getInputStream(), readOnly);
|
|
}
|
|
|
|
/** Load with password from MultipartFile */
|
|
public PDDocument load(MultipartFile fileInput, String password) throws IOException {
|
|
return load(fileInput, password, false);
|
|
}
|
|
|
|
/** Load with password from MultipartFile with read-only option */
|
|
public PDDocument load(MultipartFile fileInput, String password, boolean readOnly)
|
|
throws IOException {
|
|
return load(fileInput.getInputStream(), password, readOnly);
|
|
}
|
|
|
|
/**
|
|
* Determine the appropriate caching strategy based on file size and available memory. This
|
|
* common method is used by both password and non-password loading paths.
|
|
*/
|
|
public StreamCacheCreateFunction getStreamCacheFunction(long contentSize) {
|
|
long maxMemory = Runtime.getRuntime().maxMemory();
|
|
long freeMemory = Runtime.getRuntime().freeMemory();
|
|
long totalMemory = Runtime.getRuntime().totalMemory();
|
|
long usedMemory = totalMemory - freeMemory;
|
|
|
|
// Calculate percentage of free memory
|
|
double freeMemoryPercent = (double) (maxMemory - usedMemory) / maxMemory * 100;
|
|
long actualFreeMemory = maxMemory - usedMemory;
|
|
|
|
// Log memory status
|
|
log.debug(
|
|
"Memory status - Free: {}MB ({}%), Used: {}MB, Max: {}MB",
|
|
actualFreeMemory / (1024 * 1024),
|
|
String.format("%.2f", freeMemoryPercent),
|
|
usedMemory / (1024 * 1024),
|
|
maxMemory / (1024 * 1024));
|
|
|
|
// If free memory is critically low, always use file-based caching
|
|
if (freeMemoryPercent < MIN_FREE_MEMORY_PERCENTAGE
|
|
|| actualFreeMemory < MIN_FREE_MEMORY_BYTES) {
|
|
log.debug(
|
|
"Low memory detected ({}%), forcing file-based cache",
|
|
String.format("%.2f", freeMemoryPercent));
|
|
return createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
|
|
} else if (contentSize < SMALL_FILE_THRESHOLD) {
|
|
log.debug("Using memory-only cache for small document ({}KB)", contentSize / 1024);
|
|
return IOUtils.createMemoryOnlyStreamCache();
|
|
} else if (contentSize < LARGE_FILE_THRESHOLD) {
|
|
// For medium files (10-50MB), use a mixed approach
|
|
log.debug(
|
|
"Using mixed memory/file cache for medium document ({}MB)",
|
|
contentSize / (1024 * 1024));
|
|
return createScratchFileCacheFunction(MemoryUsageSetting.setupMixed(LARGE_FILE_USAGE));
|
|
} else {
|
|
log.debug("Using file-based cache for large document");
|
|
return createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
|
|
}
|
|
}
|
|
|
|
/** Update the existing loadAdaptively method to use the common function */
|
|
private PDDocument loadAdaptively(Object source, long contentSize) throws IOException {
|
|
// Get the appropriate caching strategy
|
|
StreamCacheCreateFunction cacheFunction = getStreamCacheFunction(contentSize);
|
|
|
|
// If small handle as bytes and remove original file
|
|
if (contentSize <= SMALL_FILE_THRESHOLD && source instanceof File file) {
|
|
source = Files.readAllBytes(file.toPath());
|
|
file.delete();
|
|
}
|
|
PDDocument document;
|
|
if (source instanceof File file) {
|
|
document = loadFromFile(file, contentSize, cacheFunction);
|
|
} else if (source instanceof byte[] bytes) {
|
|
document = loadFromBytes(bytes, contentSize, cacheFunction);
|
|
} else {
|
|
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
|
|
}
|
|
return document;
|
|
}
|
|
|
|
/** Load a PDF with password protection using adaptive loading strategies */
|
|
private PDDocument loadAdaptivelyWithPassword(Object source, long contentSize, String password)
|
|
throws IOException {
|
|
// Get the appropriate caching strategy
|
|
StreamCacheCreateFunction cacheFunction = getStreamCacheFunction(contentSize);
|
|
// If small handle as bytes and remove original file
|
|
if (contentSize <= SMALL_FILE_THRESHOLD && source instanceof File file) {
|
|
source = Files.readAllBytes(file.toPath());
|
|
file.delete();
|
|
}
|
|
PDDocument document;
|
|
if (source instanceof File file) {
|
|
document = loadFromFileWithPassword(file, contentSize, cacheFunction, password);
|
|
} else if (source instanceof byte[] bytes) {
|
|
document = loadFromBytesWithPassword(bytes, contentSize, cacheFunction, password);
|
|
} else {
|
|
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
|
|
}
|
|
return document;
|
|
}
|
|
|
|
/** Load a file with password */
|
|
private PDDocument loadFromFileWithPassword(
|
|
File file, long size, StreamCacheCreateFunction cache, String password)
|
|
throws IOException {
|
|
return Loader.loadPDF(new DeletingRandomAccessFile(file), password, null, null, cache);
|
|
}
|
|
|
|
/** Load bytes with password */
|
|
private PDDocument loadFromBytesWithPassword(
|
|
byte[] bytes, long size, StreamCacheCreateFunction cache, String password)
|
|
throws IOException {
|
|
if (size >= SMALL_FILE_THRESHOLD) {
|
|
log.debug("Writing large byte array to temp file for password-protected PDF");
|
|
Path tempFile = createTempFile("pdf-bytes-");
|
|
|
|
Files.write(tempFile, bytes);
|
|
return Loader.loadPDF(tempFile.toFile(), password, null, null, cache);
|
|
}
|
|
return Loader.loadPDF(bytes, password, null, null, cache);
|
|
}
|
|
|
|
private StreamCacheCreateFunction createScratchFileCacheFunction(MemoryUsageSetting settings) {
|
|
return () -> {
|
|
try {
|
|
return new ScratchFile(settings);
|
|
} catch (IOException e) {
|
|
throw new RuntimeException("ScratchFile initialization failed", e);
|
|
}
|
|
};
|
|
}
|
|
|
|
private void postProcessDocument(PDDocument doc) throws IOException {
|
|
pdfMetadataService.setDefaultMetadata(doc);
|
|
removePassword(doc);
|
|
}
|
|
|
|
private PDDocument loadFromFile(File file, long size, StreamCacheCreateFunction cache)
|
|
throws IOException {
|
|
return Loader.loadPDF(new DeletingRandomAccessFile(file), "", null, null, cache);
|
|
}
|
|
|
|
private PDDocument loadFromBytes(byte[] bytes, long size, StreamCacheCreateFunction cache)
|
|
throws IOException {
|
|
if (size >= SMALL_FILE_THRESHOLD) {
|
|
log.debug("Writing large byte array to temp file");
|
|
Path tempFile = createTempFile("pdf-bytes-");
|
|
|
|
Files.write(tempFile, bytes);
|
|
return loadFromFile(tempFile.toFile(), size, cache);
|
|
}
|
|
return Loader.loadPDF(bytes, "", null, null, cache);
|
|
}
|
|
|
|
public PDDocument createNewDocument(MemoryUsageSetting settings) throws IOException {
|
|
PDDocument doc = new PDDocument(createScratchFileCacheFunction(settings));
|
|
pdfMetadataService.setDefaultMetadata(doc);
|
|
return doc;
|
|
}
|
|
|
|
public PDDocument createNewDocument() throws IOException {
|
|
return createNewDocument(MemoryUsageSetting.setupTempFileOnly());
|
|
}
|
|
|
|
public byte[] saveToBytes(PDDocument document) throws IOException {
|
|
if (document.getNumberOfPages() < 10) { // Simple heuristic
|
|
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
|
document.save(baos);
|
|
return baos.toByteArray();
|
|
}
|
|
} else {
|
|
Path tempFile = createTempFile("pdf-save-");
|
|
|
|
document.save(tempFile.toFile());
|
|
return Files.readAllBytes(tempFile);
|
|
}
|
|
}
|
|
|
|
// Improved password handling
|
|
private void removePassword(PDDocument document) throws IOException {
|
|
if (document.isEncrypted()) {
|
|
try {
|
|
document.setAllSecurityToBeRemoved(true);
|
|
} catch (Exception e) {
|
|
log.error("Decryption failed", e);
|
|
throw new IOException("PDF decryption failed", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Temp file handling with enhanced logging
|
|
private Path createTempFile(String prefix) throws IOException {
|
|
Path file = Files.createTempFile(prefix + tempCounter.incrementAndGet() + "-", ".tmp");
|
|
log.debug("Created temp file: {}", file);
|
|
return file;
|
|
}
|
|
|
|
/** Create a uniquely named temporary directory */
|
|
private Path createTempDirectory(String prefix) throws IOException {
|
|
return Files.createTempDirectory(prefix + tempCounter.incrementAndGet() + "-");
|
|
}
|
|
|
|
/** Create new document bytes based on an existing document */
|
|
public byte[] createNewBytesBasedOnOldDocument(byte[] oldDocument) throws IOException {
|
|
try (PDDocument document = load(oldDocument)) {
|
|
return saveToBytes(document);
|
|
}
|
|
}
|
|
|
|
/** Create new document bytes based on an existing document file */
|
|
public byte[] createNewBytesBasedOnOldDocument(File oldDocument) throws IOException {
|
|
try (PDDocument document = load(oldDocument)) {
|
|
return saveToBytes(document);
|
|
}
|
|
}
|
|
|
|
/** Create new document bytes based on an existing PDDocument */
|
|
public byte[] createNewBytesBasedOnOldDocument(PDDocument oldDocument) throws IOException {
|
|
pdfMetadataService.setMetadataToPdf(
|
|
oldDocument, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
|
|
return saveToBytes(oldDocument);
|
|
}
|
|
|
|
/** Create a new document based on an existing document bytes */
|
|
public PDDocument createNewDocumentBasedOnOldDocument(byte[] oldDocument) throws IOException {
|
|
try (PDDocument document = load(oldDocument)) {
|
|
return createNewDocumentBasedOnOldDocument(document);
|
|
}
|
|
}
|
|
|
|
/** Create a new document based on an existing document file */
|
|
public PDDocument createNewDocumentBasedOnOldDocument(File oldDocument) throws IOException {
|
|
try (PDDocument document = load(oldDocument)) {
|
|
return createNewDocumentBasedOnOldDocument(document);
|
|
}
|
|
}
|
|
|
|
/** Create a new document based on an existing PDDocument */
|
|
public PDDocument createNewDocumentBasedOnOldDocument(PDDocument oldDocument)
|
|
throws IOException {
|
|
PDDocument document = createNewDocument();
|
|
pdfMetadataService.setMetadataToPdf(
|
|
document, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
|
|
return document;
|
|
}
|
|
|
|
/** Load document from a file and convert it to bytes */
|
|
public byte[] loadToBytes(File file) throws IOException {
|
|
try (PDDocument document = load(file)) {
|
|
return saveToBytes(document);
|
|
}
|
|
}
|
|
|
|
/** Load document from bytes and convert it back to bytes */
|
|
public byte[] loadToBytes(byte[] bytes) throws IOException {
|
|
try (PDDocument document = load(bytes)) {
|
|
return saveToBytes(document);
|
|
}
|
|
}
|
|
}
|