Exception cleanup, resource change and OCR Defaults (#3876)

# Description of Changes

Please provide a summary of the changes, including:

- What was changed
- Why the change was made
- Any challenges encountered

Closes #(issue_number)

---

## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing)
for more details.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: a <a>
This commit is contained in:
Anthony Stirling 2025-07-05 17:33:25 +01:00 committed by GitHub
parent b4a7b5d520
commit 14a4bdfb1b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 137 additions and 42 deletions

View File

@ -75,7 +75,9 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
ocrmypdf \
py3-pip \
py3-pillow@testing \
py3-pdf2image@testing && \
py3-pdf2image@testing \
# URW Base 35 fonts for better PDF rendering
font-urw-base35 && \
python3 -m venv /opt/venv && \
/opt/venv/bin/pip install --upgrade pip setuptools && \
/opt/venv/bin/pip install --no-cache-dir --upgrade unoserver weasyprint && \
@ -84,6 +86,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
ln -s /usr/lib/libreoffice/program /opt/venv/lib/python3.12/site-packages/LibreOffice && \
mv /usr/share/tessdata /usr/share/tessdata-original && \
mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders /tmp/stirling-pdf && \
# Configure URW Base 35 fonts
ln -s /usr/share/fontconfig/conf.avail/69-urw-*.conf /etc/fonts/conf.d/ && \
fc-cache -f -v && \
chmod +x /scripts/* && \
chmod +x /scripts/init.sh && \

View File

@ -19,7 +19,7 @@ RUN apt-get update && apt-get install -y \
# settings.yml | tessdataDir: /usr/share/tesseract-ocr/5/tessdata
tesseract-ocr \
tesseract-ocr-eng \
fonts-terminus fonts-dejavu fonts-font-awesome fonts-noto fonts-noto-core fonts-noto-cjk fonts-noto-extra fonts-liberation fonts-linuxlibertine \
fonts-terminus fonts-dejavu fonts-font-awesome fonts-noto fonts-noto-core fonts-noto-cjk fonts-noto-extra fonts-liberation fonts-linuxlibertine fonts-urw-base35 \
python3-uno \
python3-venv \
# ss -tln
@ -45,6 +45,7 @@ ENV PATH="/opt/venv/bin:$PATH"
COPY . /workspace
RUN mkdir -p /tmp/stirling-pdf \
&& fc-cache -f -v \
&& adduser --disabled-password --gecos '' devuser \
&& chown -R devuser:devuser /home/devuser /workspace /tmp/stirling-pdf
RUN echo "devuser ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/devuser \

View File

@ -82,7 +82,7 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
tesseract-ocr-data-fra \
tesseract-ocr-data-por \
unpaper \
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine font-urw-base35 \
# CV
py3-opencv \
python3 \
@ -98,6 +98,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
ln -s /usr/lib/libreoffice/program /opt/venv/lib/python3.12/site-packages/LibreOffice && \
mv /usr/share/tessdata /usr/share/tessdata-original && \
mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders /tmp/stirling-pdf && \
# Configure URW Base 35 fonts
ln -s /usr/share/fontconfig/conf.avail/69-urw-*.conf /etc/fonts/conf.d/ && \
fc-cache -f -v && \
chmod +x /scripts/* && \
chmod +x /scripts/init.sh && \

View File

@ -52,4 +52,4 @@ EXPOSE 8080/tcp
# Run the application
ENTRYPOINT ["tini", "--", "/scripts/init-without-ocr.sh"]
CMD ["java", "-Dfile.encoding=UTF-8", "-Djava.io.tmpdir=/tmp/stirling-pdf", "-jar", "/app.jar"]
CMD ["java", "-Dfile.encoding=UTF-8", "-Djava.io.tmpdir=/tmp/stirling-pdf", "-jar", "/app.jar"]

View File

@ -293,9 +293,32 @@ public class CustomPDFDocumentFactory {
} else {
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
}
configureResourceCacheIfNeeded(document, contentSize);
return document;
}
/**
 * Decides whether the given document should keep its resource cache.
 *
 * <p>The cache is cleared (set to {@code null}) in two situations, both intended to reduce
 * the risk of running out of memory:
 *
 * <ul>
 *   <li>the content is larger than {@code LARGE_FILE_THRESHOLD}; or
 *   <li>the share of the maximum heap that is still unused is below {@code
 *       MIN_FREE_MEMORY_PERCENTAGE}.
 * </ul>
 *
 * Otherwise the document is left untouched.
 *
 * @param document the freshly loaded document to configure
 * @param contentSize size of the source content, compared against {@code
 *     LARGE_FILE_THRESHOLD}
 */
private void configureResourceCacheIfNeeded(PDDocument document, long contentSize) {
    // Large inputs never get a cache, regardless of current heap usage.
    if (contentSize > LARGE_FILE_THRESHOLD) {
        document.setResourceCache(null);
        return;
    }

    // For smaller inputs, look at how much of the maximum heap is still available.
    Runtime runtime = Runtime.getRuntime();
    long heapLimit = runtime.maxMemory();
    long heapInUse = runtime.totalMemory() - runtime.freeMemory();
    double headroomPercent = (double) (heapLimit - heapInUse) / heapLimit * 100;

    if (headroomPercent < MIN_FREE_MEMORY_PERCENTAGE) {
        document.setResourceCache(null);
    }
}
/** Load a PDF with password protection using adaptive loading strategies */
private PDDocument loadAdaptivelyWithPassword(Object source, long contentSize, String password)
throws IOException {
@ -314,6 +337,9 @@ public class CustomPDFDocumentFactory {
} else {
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
}
configureResourceCacheIfNeeded(document, contentSize);
return document;
}

View File

@ -153,13 +153,15 @@ public class TempFileCleanupService {
// Clean up unregistered temp files based on our cleanup strategy
boolean containerMode = isContainerMode();
int unregisteredDeletedCount = cleanupUnregisteredFiles(containerMode, true, maxAgeMillis);
if(registeredDeletedCount >0 || unregisteredDeletedCount >0 || directoriesDeletedCount >0) {
log.info(
"Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories",
registeredDeletedCount,
unregisteredDeletedCount,
directoriesDeletedCount);
if (registeredDeletedCount > 0
|| unregisteredDeletedCount > 0
|| directoriesDeletedCount > 0) {
log.info(
"Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories",
registeredDeletedCount,
unregisteredDeletedCount,
directoriesDeletedCount);
}
}

View File

@ -300,7 +300,8 @@ public class ExceptionUtils {
public static void logException(String operation, Exception e) {
if (PdfErrorUtils.isCorruptedPdfError(e)) {
log.warn("PDF corruption detected during {}: {}", operation, e.getMessage());
} else if (e instanceof IOException && (isEncryptionError((IOException) e) || isPasswordError((IOException) e))) {
} else if (e instanceof IOException
&& (isEncryptionError((IOException) e) || isPasswordError((IOException) e))) {
log.info("PDF security issue during {}: {}", operation, e.getMessage());
} else {
log.error("Unexpected error during {}", operation, e);

View File

@ -49,6 +49,7 @@ public class PdfErrorUtils {
|| message.contains("Invalid dictionary, found:")
|| message.contains("AES initialization vector not fully read")
|| message.contains("BadPaddingException")
|| message.contains("Given final block not properly padded");
|| message.contains("Given final block not properly padded")
|| message.contains("End-of-File, expected line");
}
}

View File

@ -72,7 +72,7 @@ public class SplitPDFController {
pageNumbers.add(totalPages - 1);
}
log.info(
log.debug(
"Splitting PDF into pages: {}",
pageNumbers.stream().map(String::valueOf).collect(Collectors.joining(",")));

View File

@ -70,17 +70,17 @@ public class ToSinglePageController {
float yOffset = totalHeight;
// For each page, copy its content to the new page at the correct offset
int pageIndex = 0;
for (PDPage page : sourceDocument.getPages()) {
PDFormXObject form =
layerUtility.importPageAsForm(
sourceDocument, sourceDocument.getPages().indexOf(page));
PDFormXObject form = layerUtility.importPageAsForm(sourceDocument, pageIndex);
AffineTransform af =
AffineTransform.getTranslateInstance(
0, yOffset - page.getMediaBox().getHeight());
layerUtility.wrapInSaveRestore(newPage);
String defaultLayerName = "Layer" + sourceDocument.getPages().indexOf(page);
String defaultLayerName = "Layer" + pageIndex;
layerUtility.appendFormAsLayer(newPage, form, af, defaultLayerName);
yOffset -= page.getMediaBox().getHeight();
pageIndex++;
}
ByteArrayOutputStream baos = new ByteArrayOutputStream();

View File

@ -95,9 +95,9 @@ public class ExtractImagesController {
try {
int pageCount = document.getPages().getCount();
log.debug("Document reports {} pages", pageCount);
int consecutiveFailures = 0;
for (int pgNum = 0; pgNum < pageCount; pgNum++) {
try {
PDPage page = document.getPage(pgNum);
@ -118,7 +118,10 @@ public class ExtractImagesController {
allowDuplicates);
} catch (Exception e) {
// Log the error and continue processing other pages
ExceptionUtils.logException("image extraction from page " + currentPageNum, e);
ExceptionUtils.logException(
"image extraction from page "
+ currentPageNum,
e);
}
return null; // Callable requires a return type
@ -129,7 +132,7 @@ public class ExtractImagesController {
} catch (Exception e) {
consecutiveFailures++;
ExceptionUtils.logException("page access for page " + (pgNum + 1), e);
if (consecutiveFailures >= 3) {
log.warn("Stopping page iteration after 3 consecutive failures");
break;

View File

@ -47,6 +47,11 @@ public class FakeScanController {
private final CustomPDFDocumentFactory pdfDocumentFactory;
private static final Random RANDOM = new Random();
// Size limits to prevent OutOfMemoryError when rendering pages to images.
// Before rendering, the projected pixel dimensions of a page at the requested DPI are
// compared against these limits; if any is exceeded, the DPI is scaled down by the most
// restrictive ratio, but never below 72 dpi (so very large pages can still exceed them).
// Upper bound on the rendered image width, in pixels.
private static final int MAX_IMAGE_WIDTH = 8192;
// Upper bound on the rendered image height, in pixels.
private static final int MAX_IMAGE_HEIGHT = 8192;
// Upper bound on width * height. Note this is smaller than MAX_IMAGE_WIDTH * MAX_IMAGE_HEIGHT,
// so an image cannot be at the maximum width and the maximum height at the same time.
private static final long MAX_IMAGE_PIXELS = 16_777_216; // 4096x4096
@PostMapping(value = "/fake-scan", consumes = "multipart/form-data")
@Operation(
summary = "Convert PDF to look like a scanned document",
@ -82,8 +87,46 @@ public class FakeScanController {
PDFRenderer pdfRenderer = new PDFRenderer(document);
for (int i = 0; i < document.getNumberOfPages(); i++) {
// Render page to image with specified resolution
BufferedImage image = pdfRenderer.renderImageWithDPI(i, resolution);
// Get page dimensions to calculate safe resolution
PDRectangle pageSize = document.getPage(i).getMediaBox();
float pageWidthPts = pageSize.getWidth();
float pageHeightPts = pageSize.getHeight();
// Calculate what the image dimensions would be at the requested resolution
int projectedWidth = (int) Math.ceil(pageWidthPts * resolution / 72.0);
int projectedHeight = (int) Math.ceil(pageHeightPts * resolution / 72.0);
long projectedPixels = (long) projectedWidth * projectedHeight;
// Calculate safe resolution that stays within limits
int safeResolution = resolution;
if (projectedWidth > MAX_IMAGE_WIDTH
|| projectedHeight > MAX_IMAGE_HEIGHT
|| projectedPixels > MAX_IMAGE_PIXELS) {
double widthScale = (double) MAX_IMAGE_WIDTH / projectedWidth;
double heightScale = (double) MAX_IMAGE_HEIGHT / projectedHeight;
double pixelScale = Math.sqrt((double) MAX_IMAGE_PIXELS / projectedPixels);
double minScale = Math.min(Math.min(widthScale, heightScale), pixelScale);
safeResolution = (int) Math.max(72, resolution * minScale);
log.warn(
"Page {} would be too large at {}dpi ({}x{} pixels). Reducing to {}dpi",
i + 1,
resolution,
projectedWidth,
projectedHeight,
safeResolution);
}
// Render page to image with safe resolution
BufferedImage image = pdfRenderer.renderImageWithDPI(i, safeResolution);
log.debug(
"Processing page {} with dimensions {}x{} ({} pixels) at {}dpi",
i + 1,
image.getWidth(),
image.getHeight(),
(long) image.getWidth() * image.getHeight(),
safeResolution);
// 1. Convert to grayscale or keep color
BufferedImage processed;

View File

@ -153,15 +153,19 @@ public class PipelineProcessor {
String filename = file.getFilename();
String providedExtension = "no extension";
if (filename != null && filename.contains(".")) {
providedExtension = filename.substring(filename.lastIndexOf(".")).toLowerCase();
providedExtension =
filename.substring(filename.lastIndexOf(".")).toLowerCase();
}
logPrintStream.println(
"No files with extension "
+ String.join(", ", inputFileTypes)
+ " found for operation "
+ operation
+ ". Provided file '" + filename + "' has extension: " + providedExtension);
+ ". Provided file '"
+ filename
+ "' has extension: "
+ providedExtension);
hasErrors = true;
}
}
@ -211,17 +215,21 @@ public class PipelineProcessor {
}
} else {
// Get details about what files were actually provided
List<String> providedExtensions = outputFiles.stream()
.map(file -> {
String filename = file.getFilename();
if (filename != null && filename.contains(".")) {
return filename.substring(filename.lastIndexOf(".")).toLowerCase();
}
return "no extension";
})
.distinct()
.toList();
List<String> providedExtensions =
outputFiles.stream()
.map(
file -> {
String filename = file.getFilename();
if (filename != null && filename.contains(".")) {
return filename.substring(
filename.lastIndexOf("."))
.toLowerCase();
}
return "no extension";
})
.distinct()
.toList();
logPrintStream.println(
"No files with extension "
+ String.join(", ", inputFileTypes)
@ -229,7 +237,9 @@ public class PipelineProcessor {
+ operation
+ ". Provided files have extensions: "
+ String.join(", ", providedExtensions)
+ " (total files: " + outputFiles.size() + ")");
+ " (total files: "
+ outputFiles.size()
+ ")");
hasErrors = true;
}
}

View File

@ -65,6 +65,7 @@ import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.github.pixee.security.Filenames;
import io.micrometer.common.util.StringUtils;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
@ -166,7 +167,7 @@ public class CertSignController {
Integer pageNumber = request.getPageNumber() != null ? (request.getPageNumber() - 1) : null;
Boolean showLogo = request.getShowLogo();
if (certType == null) {
if (StringUtils.isBlank(certType)) {
throw ExceptionUtils.createIllegalArgumentException(
"error.optionsNotSpecified",
"{0} options are not specified",

View File

@ -226,7 +226,8 @@ function setupFileInput(chooser) {
try {
const { isEncrypted, requiresPassword } = await decryptFile.checkFileEncrypted(file);
if (file.type === 'application/pdf' && isEncrypted) {
if (file.type === 'application/pdf' && isEncrypted &&
!window.location.pathname.includes('remove-password')) {
decryptedFile = await decryptFile.decryptFile(file, requiresPassword);
if (!decryptedFile) throw new Error('File decryption failed.');
}

View File

@ -65,7 +65,7 @@
<label th:text="#{ocr.selectText.10}"></label>
<select class="form-control" name="ocrType">
<option value="skip-text" th:text="#{ocr.selectText.6}"></option>
<option value="force-ocr" th:text="#{ocr.selectText.7}"></option>
<option selected value="force-ocr" th:text="#{ocr.selectText.7}"></option>
<option value="Normal" th:text="#{ocr.selectText.8}"></option>
</select>
</div>