Exception cleanup, resource change and OCR Defaults

This commit is contained in:
Anthony Stirling 2025-07-04 21:32:56 +01:00
parent 25c9504fed
commit 41dd04a4cc
13 changed files with 136 additions and 39 deletions

View File

@ -75,7 +75,9 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
ocrmypdf \
py3-pip \
py3-pillow@testing \
py3-pdf2image@testing && \
py3-pdf2image@testing \
# URW Base 35 fonts for better PDF rendering
font-urw-base35 && \
python3 -m venv /opt/venv && \
/opt/venv/bin/pip install --upgrade pip setuptools && \
/opt/venv/bin/pip install --no-cache-dir --upgrade unoserver weasyprint && \
@ -84,6 +86,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
ln -s /usr/lib/libreoffice/program /opt/venv/lib/python3.12/site-packages/LibreOffice && \
mv /usr/share/tessdata /usr/share/tessdata-original && \
mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders /tmp/stirling-pdf && \
# Configure URW Base 35 fonts
ln -s /usr/share/fontconfig/conf.avail/69-urw-*.conf /etc/fonts/conf.d/ && \
fc-cache -f -v && \
chmod +x /scripts/* && \
chmod +x /scripts/init.sh && \

View File

@ -19,7 +19,7 @@ RUN apt-get update && apt-get install -y \
# settings.yml | tessdataDir: /usr/share/tesseract-ocr/5/tessdata
tesseract-ocr \
tesseract-ocr-eng \
fonts-terminus fonts-dejavu fonts-font-awesome fonts-noto fonts-noto-core fonts-noto-cjk fonts-noto-extra fonts-liberation fonts-linuxlibertine \
fonts-terminus fonts-dejavu fonts-font-awesome fonts-noto fonts-noto-core fonts-noto-cjk fonts-noto-extra fonts-liberation fonts-linuxlibertine fonts-urw-base35 \
python3-uno \
python3-venv \
# ss -tln
@ -45,6 +45,7 @@ ENV PATH="/opt/venv/bin:$PATH"
COPY . /workspace
RUN mkdir -p /tmp/stirling-pdf \
&& fc-cache -f -v \
&& adduser --disabled-password --gecos '' devuser \
&& chown -R devuser:devuser /home/devuser /workspace /tmp/stirling-pdf
RUN echo "devuser ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/devuser \

View File

@ -82,7 +82,7 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
tesseract-ocr-data-fra \
tesseract-ocr-data-por \
unpaper \
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine font-urw-base35 \
# CV
py3-opencv \
python3 \
@ -98,6 +98,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
ln -s /usr/lib/libreoffice/program /opt/venv/lib/python3.12/site-packages/LibreOffice && \
mv /usr/share/tessdata /usr/share/tessdata-original && \
mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders /tmp/stirling-pdf && \
# Configure URW Base 35 fonts
ln -s /usr/share/fontconfig/conf.avail/69-urw-*.conf /etc/fonts/conf.d/ && \
fc-cache -f -v && \
chmod +x /scripts/* && \
chmod +x /scripts/init.sh && \

View File

@ -37,9 +37,14 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
curl \
shadow \
su-exec \
openjdk21-jre && \
openjdk21-jre \
# URW Base 35 fonts for better PDF rendering
font-urw-base35 && \
# User permissions
mkdir -p /configs /logs /customFiles /usr/share/fonts/opentype/noto /tmp/stirling-pdf && \
# Configure URW Base 35 fonts
ln -s /usr/share/fontconfig/conf.avail/69-urw-*.conf /etc/fonts/conf.d/ && \
fc-cache -f -v && \
chmod +x /scripts/*.sh && \
addgroup -S stirlingpdfgroup && adduser -S stirlingpdfuser -G stirlingpdfgroup && \
chown -R stirlingpdfuser:stirlingpdfgroup $HOME /scripts /configs /customFiles /pipeline /tmp/stirling-pdf && \

View File

@ -293,9 +293,32 @@ public class CustomPDFDocumentFactory {
} else {
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
}
configureResourceCacheIfNeeded(document, contentSize);
return document;
}
/**
 * Turns off the document's resource cache when keeping it is likely to be too costly.
 *
 * <p>The cache is cleared (set to {@code null}) in two situations: the content is larger than
 * {@code LARGE_FILE_THRESHOLD}, or the share of the JVM's maximum heap that is still available
 * has fallen below {@code MIN_FREE_MEMORY_PERCENTAGE}. In every other case the document is left
 * untouched. The intent is to reduce the risk of running out of memory.
 *
 * @param document the freshly loaded document whose resource cache may be disabled
 * @param contentSize size of the source content, compared against {@code LARGE_FILE_THRESHOLD}
 */
private void configureResourceCacheIfNeeded(PDDocument document, long contentSize) {
    // Large inputs never keep a cache, regardless of the current heap state.
    if (contentSize > LARGE_FILE_THRESHOLD) {
        document.setResourceCache(null);
        return;
    }

    // Smaller inputs keep the cache only while enough heap headroom remains.
    Runtime runtime = Runtime.getRuntime();
    long heapLimit = runtime.maxMemory();
    long heapInUse = runtime.totalMemory() - runtime.freeMemory();
    double availablePercent = (double) (heapLimit - heapInUse) / heapLimit * 100;

    if (availablePercent < MIN_FREE_MEMORY_PERCENTAGE) {
        document.setResourceCache(null);
    }
}
/** Load a PDF with password protection using adaptive loading strategies */
private PDDocument loadAdaptivelyWithPassword(Object source, long contentSize, String password)
throws IOException {
@ -314,6 +337,9 @@ public class CustomPDFDocumentFactory {
} else {
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
}
configureResourceCacheIfNeeded(document, contentSize);
return document;
}

View File

@ -154,7 +154,9 @@ public class TempFileCleanupService {
boolean containerMode = isContainerMode();
int unregisteredDeletedCount = cleanupUnregisteredFiles(containerMode, true, maxAgeMillis);
if(registeredDeletedCount >0 || unregisteredDeletedCount >0 || directoriesDeletedCount >0) {
if (registeredDeletedCount > 0
|| unregisteredDeletedCount > 0
|| directoriesDeletedCount > 0) {
log.info(
"Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories",
registeredDeletedCount,

View File

@ -300,7 +300,8 @@ public class ExceptionUtils {
public static void logException(String operation, Exception e) {
if (PdfErrorUtils.isCorruptedPdfError(e)) {
log.warn("PDF corruption detected during {}: {}", operation, e.getMessage());
} else if (e instanceof IOException && (isEncryptionError((IOException) e) || isPasswordError((IOException) e))) {
} else if (e instanceof IOException
&& (isEncryptionError((IOException) e) || isPasswordError((IOException) e))) {
log.info("PDF security issue during {}: {}", operation, e.getMessage());
} else {
log.error("Unexpected error during {}", operation, e);

View File

@ -70,17 +70,17 @@ public class ToSinglePageController {
float yOffset = totalHeight;
// For each page, copy its content to the new page at the correct offset
int pageIndex = 0;
for (PDPage page : sourceDocument.getPages()) {
PDFormXObject form =
layerUtility.importPageAsForm(
sourceDocument, sourceDocument.getPages().indexOf(page));
PDFormXObject form = layerUtility.importPageAsForm(sourceDocument, pageIndex);
AffineTransform af =
AffineTransform.getTranslateInstance(
0, yOffset - page.getMediaBox().getHeight());
layerUtility.wrapInSaveRestore(newPage);
String defaultLayerName = "Layer" + sourceDocument.getPages().indexOf(page);
String defaultLayerName = "Layer" + pageIndex;
layerUtility.appendFormAsLayer(newPage, form, af, defaultLayerName);
yOffset -= page.getMediaBox().getHeight();
pageIndex++;
}
ByteArrayOutputStream baos = new ByteArrayOutputStream();

View File

@ -118,7 +118,10 @@ public class ExtractImagesController {
allowDuplicates);
} catch (Exception e) {
// Log the error and continue processing other pages
ExceptionUtils.logException("image extraction from page " + currentPageNum, e);
ExceptionUtils.logException(
"image extraction from page "
+ currentPageNum,
e);
}
return null; // Callable requires a return type

View File

@ -47,6 +47,11 @@ public class FakeScanController {
private final CustomPDFDocumentFactory pdfDocumentFactory;
private static final Random RANDOM = new Random();
// Size limits to prevent OutOfMemoryError
private static final int MAX_IMAGE_WIDTH = 8192;
private static final int MAX_IMAGE_HEIGHT = 8192;
private static final long MAX_IMAGE_PIXELS = 16_777_216; // 4096x4096
@PostMapping(value = "/fake-scan", consumes = "multipart/form-data")
@Operation(
summary = "Convert PDF to look like a scanned document",
@ -82,8 +87,46 @@ public class FakeScanController {
PDFRenderer pdfRenderer = new PDFRenderer(document);
for (int i = 0; i < document.getNumberOfPages(); i++) {
// Render page to image with specified resolution
BufferedImage image = pdfRenderer.renderImageWithDPI(i, resolution);
// Get page dimensions to calculate safe resolution
PDRectangle pageSize = document.getPage(i).getMediaBox();
float pageWidthPts = pageSize.getWidth();
float pageHeightPts = pageSize.getHeight();
// Calculate what the image dimensions would be at the requested resolution
int projectedWidth = (int) Math.ceil(pageWidthPts * resolution / 72.0);
int projectedHeight = (int) Math.ceil(pageHeightPts * resolution / 72.0);
long projectedPixels = (long) projectedWidth * projectedHeight;
// Calculate safe resolution that stays within limits
int safeResolution = resolution;
if (projectedWidth > MAX_IMAGE_WIDTH
|| projectedHeight > MAX_IMAGE_HEIGHT
|| projectedPixels > MAX_IMAGE_PIXELS) {
double widthScale = (double) MAX_IMAGE_WIDTH / projectedWidth;
double heightScale = (double) MAX_IMAGE_HEIGHT / projectedHeight;
double pixelScale = Math.sqrt((double) MAX_IMAGE_PIXELS / projectedPixels);
double minScale = Math.min(Math.min(widthScale, heightScale), pixelScale);
safeResolution = (int) Math.max(72, resolution * minScale);
log.warn(
"Page {} would be too large at {}dpi ({}x{} pixels). Reducing to {}dpi",
i + 1,
resolution,
projectedWidth,
projectedHeight,
safeResolution);
}
// Render page to image with safe resolution
BufferedImage image = pdfRenderer.renderImageWithDPI(i, safeResolution);
log.debug(
"Processing page {} with dimensions {}x{} ({} pixels) at {}dpi",
i + 1,
image.getWidth(),
image.getHeight(),
(long) image.getWidth() * image.getHeight(),
safeResolution);
// 1. Convert to grayscale or keep color
BufferedImage processed;

View File

@ -153,7 +153,8 @@ public class PipelineProcessor {
String filename = file.getFilename();
String providedExtension = "no extension";
if (filename != null && filename.contains(".")) {
providedExtension = filename.substring(filename.lastIndexOf(".")).toLowerCase();
providedExtension =
filename.substring(filename.lastIndexOf(".")).toLowerCase();
}
logPrintStream.println(
@ -161,7 +162,10 @@ public class PipelineProcessor {
+ String.join(", ", inputFileTypes)
+ " found for operation "
+ operation
+ ". Provided file '" + filename + "' has extension: " + providedExtension);
+ ". Provided file '"
+ filename
+ "' has extension: "
+ providedExtension);
hasErrors = true;
}
}
@ -211,11 +215,15 @@ public class PipelineProcessor {
}
} else {
// Get details about what files were actually provided
List<String> providedExtensions = outputFiles.stream()
.map(file -> {
List<String> providedExtensions =
outputFiles.stream()
.map(
file -> {
String filename = file.getFilename();
if (filename != null && filename.contains(".")) {
return filename.substring(filename.lastIndexOf(".")).toLowerCase();
return filename.substring(
filename.lastIndexOf("."))
.toLowerCase();
}
return "no extension";
})
@ -229,7 +237,9 @@ public class PipelineProcessor {
+ operation
+ ". Provided files have extensions: "
+ String.join(", ", providedExtensions)
+ " (total files: " + outputFiles.size() + ")");
+ " (total files: "
+ outputFiles.size()
+ ")");
hasErrors = true;
}
}

View File

@ -166,7 +166,7 @@ public class CertSignController {
Integer pageNumber = request.getPageNumber() != null ? (request.getPageNumber() - 1) : null;
Boolean showLogo = request.getShowLogo();
if (certType == null) {
if (certType == null || certType.trim().isEmpty()) {
throw ExceptionUtils.createIllegalArgumentException(
"error.optionsNotSpecified",
"{0} options are not specified",

View File

@ -65,7 +65,7 @@
<label th:text="#{ocr.selectText.10}"></label>
<select class="form-control" name="ocrType">
<option value="skip-text" th:text="#{ocr.selectText.6}"></option>
<option value="force-ocr" th:text="#{ocr.selectText.7}"></option>
<option selected value="force-ocr" th:text="#{ocr.selectText.7}"></option>
<option value="Normal" th:text="#{ocr.selectText.8}"></option>
</select>
</div>