mirror of
https://github.com/Stirling-Tools/Stirling-PDF.git
synced 2025-07-23 05:35:23 +00:00
Exception cleanup, resource change and OCR Defaults (#3876)
# Description of Changes Please provide a summary of the changes, including: - What was changed - Why the change was made - Any challenges encountered Closes #(issue_number) --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: a <a>
This commit is contained in:
parent
b4a7b5d520
commit
14a4bdfb1b
@ -75,7 +75,9 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
|
||||
ocrmypdf \
|
||||
py3-pip \
|
||||
py3-pillow@testing \
|
||||
py3-pdf2image@testing && \
|
||||
py3-pdf2image@testing \
|
||||
# URW Base 35 fonts for better PDF rendering
|
||||
font-urw-base35 && \
|
||||
python3 -m venv /opt/venv && \
|
||||
/opt/venv/bin/pip install --upgrade pip setuptools && \
|
||||
/opt/venv/bin/pip install --no-cache-dir --upgrade unoserver weasyprint && \
|
||||
@ -84,6 +86,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
|
||||
ln -s /usr/lib/libreoffice/program /opt/venv/lib/python3.12/site-packages/LibreOffice && \
|
||||
mv /usr/share/tessdata /usr/share/tessdata-original && \
|
||||
mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders /tmp/stirling-pdf && \
|
||||
# Configure URW Base 35 fonts
|
||||
ln -s /usr/share/fontconfig/conf.avail/69-urw-*.conf /etc/fonts/conf.d/ && \
|
||||
fc-cache -f -v && \
|
||||
chmod +x /scripts/* && \
|
||||
chmod +x /scripts/init.sh && \
|
||||
|
@ -19,7 +19,7 @@ RUN apt-get update && apt-get install -y \
|
||||
# settings.yml | tessdataDir: /usr/share/tesseract-ocr/5/tessdata
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
fonts-terminus fonts-dejavu fonts-font-awesome fonts-noto fonts-noto-core fonts-noto-cjk fonts-noto-extra fonts-liberation fonts-linuxlibertine \
|
||||
fonts-terminus fonts-dejavu fonts-font-awesome fonts-noto fonts-noto-core fonts-noto-cjk fonts-noto-extra fonts-liberation fonts-linuxlibertine fonts-urw-base35 \
|
||||
python3-uno \
|
||||
python3-venv \
|
||||
# ss -tln
|
||||
@ -45,6 +45,7 @@ ENV PATH="/opt/venv/bin:$PATH"
|
||||
COPY . /workspace
|
||||
|
||||
RUN mkdir -p /tmp/stirling-pdf \
|
||||
&& fc-cache -f -v \
|
||||
&& adduser --disabled-password --gecos '' devuser \
|
||||
&& chown -R devuser:devuser /home/devuser /workspace /tmp/stirling-pdf
|
||||
RUN echo "devuser ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/devuser \
|
||||
|
@ -82,7 +82,7 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
|
||||
tesseract-ocr-data-fra \
|
||||
tesseract-ocr-data-por \
|
||||
unpaper \
|
||||
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine \
|
||||
font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra font-liberation font-linux-libertine font-urw-base35 \
|
||||
# CV
|
||||
py3-opencv \
|
||||
python3 \
|
||||
@ -98,6 +98,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a
|
||||
ln -s /usr/lib/libreoffice/program /opt/venv/lib/python3.12/site-packages/LibreOffice && \
|
||||
mv /usr/share/tessdata /usr/share/tessdata-original && \
|
||||
mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders /tmp/stirling-pdf && \
|
||||
# Configure URW Base 35 fonts
|
||||
ln -s /usr/share/fontconfig/conf.avail/69-urw-*.conf /etc/fonts/conf.d/ && \
|
||||
fc-cache -f -v && \
|
||||
chmod +x /scripts/* && \
|
||||
chmod +x /scripts/init.sh && \
|
||||
|
@ -52,4 +52,4 @@ EXPOSE 8080/tcp
|
||||
|
||||
# Run the application
|
||||
ENTRYPOINT ["tini", "--", "/scripts/init-without-ocr.sh"]
|
||||
CMD ["java", "-Dfile.encoding=UTF-8", "-Djava.io.tmpdir=/tmp/stirling-pdf", "-jar", "/app.jar"]
|
||||
CMD ["java", "-Dfile.encoding=UTF-8", "-Djava.io.tmpdir=/tmp/stirling-pdf", "-jar", "/app.jar"]
|
@ -293,9 +293,32 @@ public class CustomPDFDocumentFactory {
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
|
||||
}
|
||||
|
||||
configureResourceCacheIfNeeded(document, contentSize);
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure resource cache based on content size and memory constraints. Disables resource
|
||||
* cache for large files or when memory is low to prevent OOM errors.
*
* <p>The cache is disabled by calling {@code document.setResourceCache(null)}. When neither
* condition holds, the document's existing resource cache is left untouched.
*
* @param document the PDF document whose resource cache may be disabled
* @param contentSize size of the source content, compared against LARGE_FILE_THRESHOLD
*     (NOTE(review): presumably in bytes - confirm against the callers and the constant)
|
||||
*/
|
||||
private void configureResourceCacheIfNeeded(PDDocument document, long contentSize) {
|
||||
// Large inputs never keep a resource cache, regardless of current memory headroom.
if (contentSize > LARGE_FILE_THRESHOLD) {
|
||||
document.setResourceCache(null);
|
||||
} else {
|
||||
// Check current memory status for smaller files
|
||||
// maxMemory is the most memory the JVM will attempt to use.
long maxMemory = Runtime.getRuntime().maxMemory();
|
||||
// Used memory = memory currently held by the JVM minus the part of it that is still free.
long usedMemory =
|
||||
Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory();
|
||||
// Remaining headroom relative to maxMemory, as a percentage; the cast forces
// floating-point division.
double freeMemoryPercent = (double) (maxMemory - usedMemory) / maxMemory * 100;
|
||||
|
||||
// Below the configured minimum headroom, drop the cache for this document as well.
if (freeMemoryPercent < MIN_FREE_MEMORY_PERCENTAGE) {
|
||||
document.setResourceCache(null);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Load a PDF with password protection using adaptive loading strategies */
|
||||
private PDDocument loadAdaptivelyWithPassword(Object source, long contentSize, String password)
|
||||
throws IOException {
|
||||
@ -314,6 +337,9 @@ public class CustomPDFDocumentFactory {
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
|
||||
}
|
||||
|
||||
configureResourceCacheIfNeeded(document, contentSize);
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
|
@ -153,13 +153,15 @@ public class TempFileCleanupService {
|
||||
// Clean up unregistered temp files based on our cleanup strategy
|
||||
boolean containerMode = isContainerMode();
|
||||
int unregisteredDeletedCount = cleanupUnregisteredFiles(containerMode, true, maxAgeMillis);
|
||||
|
||||
if(registeredDeletedCount >0 || unregisteredDeletedCount >0 || directoriesDeletedCount >0) {
|
||||
log.info(
|
||||
"Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories",
|
||||
registeredDeletedCount,
|
||||
unregisteredDeletedCount,
|
||||
directoriesDeletedCount);
|
||||
|
||||
if (registeredDeletedCount > 0
|
||||
|| unregisteredDeletedCount > 0
|
||||
|| directoriesDeletedCount > 0) {
|
||||
log.info(
|
||||
"Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories",
|
||||
registeredDeletedCount,
|
||||
unregisteredDeletedCount,
|
||||
directoriesDeletedCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -300,7 +300,8 @@ public class ExceptionUtils {
|
||||
public static void logException(String operation, Exception e) {
|
||||
if (PdfErrorUtils.isCorruptedPdfError(e)) {
|
||||
log.warn("PDF corruption detected during {}: {}", operation, e.getMessage());
|
||||
} else if (e instanceof IOException && (isEncryptionError((IOException) e) || isPasswordError((IOException) e))) {
|
||||
} else if (e instanceof IOException
|
||||
&& (isEncryptionError((IOException) e) || isPasswordError((IOException) e))) {
|
||||
log.info("PDF security issue during {}: {}", operation, e.getMessage());
|
||||
} else {
|
||||
log.error("Unexpected error during {}", operation, e);
|
||||
|
@ -49,6 +49,7 @@ public class PdfErrorUtils {
|
||||
|| message.contains("Invalid dictionary, found:")
|
||||
|| message.contains("AES initialization vector not fully read")
|
||||
|| message.contains("BadPaddingException")
|
||||
|| message.contains("Given final block not properly padded");
|
||||
|| message.contains("Given final block not properly padded")
|
||||
|| message.contains("End-of-File, expected line");
|
||||
}
|
||||
}
|
||||
|
@ -72,7 +72,7 @@ public class SplitPDFController {
|
||||
pageNumbers.add(totalPages - 1);
|
||||
}
|
||||
|
||||
log.info(
|
||||
log.debug(
|
||||
"Splitting PDF into pages: {}",
|
||||
pageNumbers.stream().map(String::valueOf).collect(Collectors.joining(",")));
|
||||
|
||||
|
@ -70,17 +70,17 @@ public class ToSinglePageController {
|
||||
float yOffset = totalHeight;
|
||||
|
||||
// For each page, copy its content to the new page at the correct offset
|
||||
int pageIndex = 0;
|
||||
for (PDPage page : sourceDocument.getPages()) {
|
||||
PDFormXObject form =
|
||||
layerUtility.importPageAsForm(
|
||||
sourceDocument, sourceDocument.getPages().indexOf(page));
|
||||
PDFormXObject form = layerUtility.importPageAsForm(sourceDocument, pageIndex);
|
||||
AffineTransform af =
|
||||
AffineTransform.getTranslateInstance(
|
||||
0, yOffset - page.getMediaBox().getHeight());
|
||||
layerUtility.wrapInSaveRestore(newPage);
|
||||
String defaultLayerName = "Layer" + sourceDocument.getPages().indexOf(page);
|
||||
String defaultLayerName = "Layer" + pageIndex;
|
||||
layerUtility.appendFormAsLayer(newPage, form, af, defaultLayerName);
|
||||
yOffset -= page.getMediaBox().getHeight();
|
||||
pageIndex++;
|
||||
}
|
||||
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
|
@ -95,9 +95,9 @@ public class ExtractImagesController {
|
||||
try {
|
||||
int pageCount = document.getPages().getCount();
|
||||
log.debug("Document reports {} pages", pageCount);
|
||||
|
||||
|
||||
int consecutiveFailures = 0;
|
||||
|
||||
|
||||
for (int pgNum = 0; pgNum < pageCount; pgNum++) {
|
||||
try {
|
||||
PDPage page = document.getPage(pgNum);
|
||||
@ -118,7 +118,10 @@ public class ExtractImagesController {
|
||||
allowDuplicates);
|
||||
} catch (Exception e) {
|
||||
// Log the error and continue processing other pages
|
||||
ExceptionUtils.logException("image extraction from page " + currentPageNum, e);
|
||||
ExceptionUtils.logException(
|
||||
"image extraction from page "
|
||||
+ currentPageNum,
|
||||
e);
|
||||
}
|
||||
|
||||
return null; // Callable requires a return type
|
||||
@ -129,7 +132,7 @@ public class ExtractImagesController {
|
||||
} catch (Exception e) {
|
||||
consecutiveFailures++;
|
||||
ExceptionUtils.logException("page access for page " + (pgNum + 1), e);
|
||||
|
||||
|
||||
if (consecutiveFailures >= 3) {
|
||||
log.warn("Stopping page iteration after 3 consecutive failures");
|
||||
break;
|
||||
|
@ -47,6 +47,11 @@ public class FakeScanController {
|
||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||
private static final Random RANDOM = new Random();
|
||||
|
||||
// Size limits to prevent OutOfMemoryError
|
||||
private static final int MAX_IMAGE_WIDTH = 8192;
|
||||
private static final int MAX_IMAGE_HEIGHT = 8192;
|
||||
private static final long MAX_IMAGE_PIXELS = 16_777_216; // 4096x4096
|
||||
|
||||
@PostMapping(value = "/fake-scan", consumes = "multipart/form-data")
|
||||
@Operation(
|
||||
summary = "Convert PDF to look like a scanned document",
|
||||
@ -82,8 +87,46 @@ public class FakeScanController {
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
|
||||
for (int i = 0; i < document.getNumberOfPages(); i++) {
|
||||
// Render page to image with specified resolution
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(i, resolution);
|
||||
// Get page dimensions to calculate safe resolution
|
||||
PDRectangle pageSize = document.getPage(i).getMediaBox();
|
||||
float pageWidthPts = pageSize.getWidth();
|
||||
float pageHeightPts = pageSize.getHeight();
|
||||
|
||||
// Calculate what the image dimensions would be at the requested resolution
|
||||
int projectedWidth = (int) Math.ceil(pageWidthPts * resolution / 72.0);
|
||||
int projectedHeight = (int) Math.ceil(pageHeightPts * resolution / 72.0);
|
||||
long projectedPixels = (long) projectedWidth * projectedHeight;
|
||||
|
||||
// Calculate safe resolution that stays within limits
|
||||
int safeResolution = resolution;
|
||||
if (projectedWidth > MAX_IMAGE_WIDTH
|
||||
|| projectedHeight > MAX_IMAGE_HEIGHT
|
||||
|| projectedPixels > MAX_IMAGE_PIXELS) {
|
||||
double widthScale = (double) MAX_IMAGE_WIDTH / projectedWidth;
|
||||
double heightScale = (double) MAX_IMAGE_HEIGHT / projectedHeight;
|
||||
double pixelScale = Math.sqrt((double) MAX_IMAGE_PIXELS / projectedPixels);
|
||||
double minScale = Math.min(Math.min(widthScale, heightScale), pixelScale);
|
||||
safeResolution = (int) Math.max(72, resolution * minScale);
|
||||
|
||||
log.warn(
|
||||
"Page {} would be too large at {}dpi ({}x{} pixels). Reducing to {}dpi",
|
||||
i + 1,
|
||||
resolution,
|
||||
projectedWidth,
|
||||
projectedHeight,
|
||||
safeResolution);
|
||||
}
|
||||
|
||||
// Render page to image with safe resolution
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(i, safeResolution);
|
||||
|
||||
log.debug(
|
||||
"Processing page {} with dimensions {}x{} ({} pixels) at {}dpi",
|
||||
i + 1,
|
||||
image.getWidth(),
|
||||
image.getHeight(),
|
||||
(long) image.getWidth() * image.getHeight(),
|
||||
safeResolution);
|
||||
|
||||
// 1. Convert to grayscale or keep color
|
||||
BufferedImage processed;
|
||||
|
@ -153,15 +153,19 @@ public class PipelineProcessor {
|
||||
String filename = file.getFilename();
|
||||
String providedExtension = "no extension";
|
||||
if (filename != null && filename.contains(".")) {
|
||||
providedExtension = filename.substring(filename.lastIndexOf(".")).toLowerCase();
|
||||
providedExtension =
|
||||
filename.substring(filename.lastIndexOf(".")).toLowerCase();
|
||||
}
|
||||
|
||||
|
||||
logPrintStream.println(
|
||||
"No files with extension "
|
||||
+ String.join(", ", inputFileTypes)
|
||||
+ " found for operation "
|
||||
+ operation
|
||||
+ ". Provided file '" + filename + "' has extension: " + providedExtension);
|
||||
+ ". Provided file '"
|
||||
+ filename
|
||||
+ "' has extension: "
|
||||
+ providedExtension);
|
||||
hasErrors = true;
|
||||
}
|
||||
}
|
||||
@ -211,17 +215,21 @@ public class PipelineProcessor {
|
||||
}
|
||||
} else {
|
||||
// Get details about what files were actually provided
|
||||
List<String> providedExtensions = outputFiles.stream()
|
||||
.map(file -> {
|
||||
String filename = file.getFilename();
|
||||
if (filename != null && filename.contains(".")) {
|
||||
return filename.substring(filename.lastIndexOf(".")).toLowerCase();
|
||||
}
|
||||
return "no extension";
|
||||
})
|
||||
.distinct()
|
||||
.toList();
|
||||
|
||||
List<String> providedExtensions =
|
||||
outputFiles.stream()
|
||||
.map(
|
||||
file -> {
|
||||
String filename = file.getFilename();
|
||||
if (filename != null && filename.contains(".")) {
|
||||
return filename.substring(
|
||||
filename.lastIndexOf("."))
|
||||
.toLowerCase();
|
||||
}
|
||||
return "no extension";
|
||||
})
|
||||
.distinct()
|
||||
.toList();
|
||||
|
||||
logPrintStream.println(
|
||||
"No files with extension "
|
||||
+ String.join(", ", inputFileTypes)
|
||||
@ -229,7 +237,9 @@ public class PipelineProcessor {
|
||||
+ operation
|
||||
+ ". Provided files have extensions: "
|
||||
+ String.join(", ", providedExtensions)
|
||||
+ " (total files: " + outputFiles.size() + ")");
|
||||
+ " (total files: "
|
||||
+ outputFiles.size()
|
||||
+ ")");
|
||||
hasErrors = true;
|
||||
}
|
||||
}
|
||||
|
@ -65,6 +65,7 @@ import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import io.github.pixee.security.Filenames;
|
||||
import io.micrometer.common.util.StringUtils;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
@ -166,7 +167,7 @@ public class CertSignController {
|
||||
Integer pageNumber = request.getPageNumber() != null ? (request.getPageNumber() - 1) : null;
|
||||
Boolean showLogo = request.getShowLogo();
|
||||
|
||||
if (certType == null) {
|
||||
if (StringUtils.isBlank(certType)) {
|
||||
throw ExceptionUtils.createIllegalArgumentException(
|
||||
"error.optionsNotSpecified",
|
||||
"{0} options are not specified",
|
||||
|
@ -226,7 +226,8 @@ function setupFileInput(chooser) {
|
||||
|
||||
try {
|
||||
const { isEncrypted, requiresPassword } = await decryptFile.checkFileEncrypted(file);
|
||||
if (file.type === 'application/pdf' && isEncrypted) {
|
||||
if (file.type === 'application/pdf' && isEncrypted &&
|
||||
!window.location.pathname.includes('remove-password')) {
|
||||
decryptedFile = await decryptFile.decryptFile(file, requiresPassword);
|
||||
if (!decryptedFile) throw new Error('File decryption failed.');
|
||||
}
|
||||
|
@ -65,7 +65,7 @@
|
||||
<label th:text="#{ocr.selectText.10}"></label>
|
||||
<select class="form-control" name="ocrType">
|
||||
<option value="skip-text" th:text="#{ocr.selectText.6}"></option>
|
||||
<option value="force-ocr" th:text="#{ocr.selectText.7}"></option>
|
||||
<option selected value="force-ocr" th:text="#{ocr.selectText.7}"></option>
|
||||
<option value="Normal" th:text="#{ocr.selectText.8}"></option>
|
||||
</select>
|
||||
</div>
|
||||
|
Loading…
x
Reference in New Issue
Block a user