2023-03-20 21:55:11 +00:00
|
|
|
<!DOCTYPE html>
|
2024-05-22 21:48:23 +01:00
|
|
|
<html th:lang="${#locale.language}" th:dir="#{language.direction}" th:data-language="${#locale.toString()}" xmlns:th="https://www.thymeleaf.org">
|
2024-02-16 22:49:06 +01:00
|
|
|
<head>
|
|
|
|
<th:block th:insert="~{fragments/common :: head(title=#{ocr.title}, header=#{ocr.header})}"></th:block>
|
2025-05-27 13:50:16 +01:00
|
|
|
<style>
  /* Scrollable, bordered box that holds the OCR language checkboxes. */
  #languages {
    padding: 10px;
    border: 1px solid var(--md-sys-color-surface-3);
    border-radius: 5px;
    overflow-y: auto;
    max-height: 400px;
  }
</style>
|
2023-05-22 18:28:16 +03:00
|
|
|
<script>
|
2024-02-16 22:49:06 +01:00
|
|
|
/**
 * Keeps native form validation in sync with the language checkboxes.
 *
 * At least one checkbox named "languages" has to be ticked. While none is ticked,
 * every checkbox is marked required so the browser blocks submission; as soon as
 * one is ticked, the requirement is lifted from all of them.
 */
function handleLangSelection() {
  const checkboxes = Array.from(document.getElementsByName("languages"));
  const anySelected = checkboxes.some((checkbox) => checkbox.checked);

  // `required` is a boolean attribute: its mere presence makes the field mandatory,
  // so it is toggled through the DOM property instead of being written as the text
  // "true" / "false" (the latter would still mean "required").
  checkboxes.forEach((checkbox) => {
    checkbox.required = !anySelected;
  });
}
|
2023-05-22 18:28:16 +03:00
|
|
|
</script>
|
2024-02-16 22:49:06 +01:00
|
|
|
</head>
|
2023-03-20 21:55:11 +00:00
|
|
|
|
2024-02-16 22:49:06 +01:00
|
|
|
<body>
|
2023-05-04 00:07:51 +03:00
|
|
|
<th:block th:insert="~{fragments/common :: game}"></th:block>
|
2023-03-20 21:55:11 +00:00
|
|
|
<div id="page-container">
|
2024-02-16 22:49:06 +01:00
|
|
|
<div id="content-wrap">
|
|
|
|
<th:block th:insert="~{fragments/navbar.html :: navbar}"></th:block>
|
2024-03-21 21:58:01 +01:00
|
|
|
<br><br>
|
2024-02-16 22:49:06 +01:00
|
|
|
<div class="container">
|
|
|
|
<div class="row justify-content-center">
|
2024-05-19 12:44:54 +02:00
|
|
|
<div class="col-md-6 bg-card">
|
2024-05-05 15:19:53 +04:00
|
|
|
<div class="tool-header">
  <!-- The icon name is only there to be drawn as a glyph by the icon font, so it is hidden from assistive technology. -->
  <span class="material-symbols-rounded tool-header-icon other" aria-hidden="true">quick_reference_all</span>
  <span class="tool-header-text" th:text="#{ocr.header}"></span>
</div>
|
2024-06-15 23:07:09 +02:00
|
|
|
<form th:if="${#lists.size(languages) > 0}" action="#" th:action="@{'/api/v1/misc/ocr-pdf'}" method="post" enctype="multipart/form-data" class="mb-3">
|
2024-08-23 10:17:50 +02:00
|
|
|
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multipleInputsForSingleRequest=false, accept='application/pdf')}"></div>
|
2024-02-16 22:49:06 +01:00
|
|
|
<div class="mb-3">
  <!-- #languages is a plain container, not a form control, so a label "for" reference
       cannot resolve to it. The caption names the checkbox group via aria-labelledby. -->
  <label id="languages-label" class="form-label" th:text="#{ocr.selectText.1}"></label>
  <hr>
  <div id="languages" role="group" aria-labelledby="languages-label">
    <div class="form-check" th:each="language, iterStat : ${languages}">
      <input type="checkbox" th:name="languages" th:value="${language}" required th:id="${'language-' + language}" onchange="handleLangSelection()" />
      <label th:for="${'language-' + language}" th:text="${language}"></label>
    </div>
  </div>
  <hr>
</div>
|
|
|
|
<div class="mb-3">
  <!-- Label and select are tied together through for/id so the control has an accessible name. -->
  <label for="ocrType" th:text="#{ocr.selectText.10}"></label>
  <select class="form-control" id="ocrType" name="ocrType">
    <option value="skip-text" th:text="#{ocr.selectText.6}"></option>
    <option selected value="force-ocr" th:text="#{ocr.selectText.7}"></option>
    <option value="Normal" th:text="#{ocr.selectText.8}"></option>
  </select>
</div>
|
|
|
|
<br>
|
|
|
|
<!-- NOTE(review): this label appears to caption the settings that follow rather than a
     single control, so it carries no "for" attribute (the #languages container is not
     a labelable element) - confirm the intended target. -->
<label class="form-label" th:text="#{ocr.selectText.9}"></label>
<div class="mb-3">
  <!-- Label and select are tied together through for/id so the control has an accessible name. -->
  <label for="ocrRenderType" th:text="#{ocr.selectText.12}"></label>
  <select class="form-control" id="ocrRenderType" name="ocrRenderType">
    <option value="hocr">HOCR (Latin/Roman alphabet only)</option>
    <option value="sandwich">Sandwich</option>
  </select>
</div>
|
|
|
|
<br>
|
restore OCRMyPDF and ghostscript compression (#3846)
# Description of Changes
This pull request introduces enhancements to tool dependencies, endpoint
configurations, and Dockerfile packages. The changes primarily focus on
adding support for new tools (Ghostscript and OCRmyPDF), improving
endpoint management, and refining dependency checks. Below is a summary
of the most important changes grouped by theme:
### Tool Integration and Dependency Management:
* Added support for Ghostscript and OCRmyPDF in the session limits and
timeout configurations (`ApplicationProperties.java`).
[[1]](diffhunk://#diff-0642cd5c54d57a80f1f9c26cb26677f58778dc889cb59ebfc07b7512e74a8886R548-R549)
[[2]](diffhunk://#diff-0642cd5c54d57a80f1f9c26cb26677f58778dc889cb59ebfc07b7512e74a8886R582-R589)
[[3]](diffhunk://#diff-0642cd5c54d57a80f1f9c26cb26677f58778dc889cb59ebfc07b7512e74a8886R602-R603)
[[4]](diffhunk://#diff-0642cd5c54d57a80f1f9c26cb26677f58778dc889cb59ebfc07b7512e74a8886R636-R643)
* Updated `Processes` enum and `ProcessExecutor` logic to include
Ghostscript and OCRmyPDF, enabling their session limits and timeout
handling.
[[1]](diffhunk://#diff-b0afb37bdac8b0f1a10aca87b0244b133ac9b05518088b6f4c6c2bf48f859fdaR87-R96)
[[2]](diffhunk://#diff-b0afb37bdac8b0f1a10aca87b0244b133ac9b05518088b6f4c6c2bf48f859fdaR141-R150)
[[3]](diffhunk://#diff-b0afb37bdac8b0f1a10aca87b0244b133ac9b05518088b6f4c6c2bf48f859fdaL281-R303)
* Integrated Ghostscript and OCRmyPDF into the external dependency check
mechanism (`ExternalAppDepConfig.java`).
[[1]](diffhunk://#diff-511713d04e0545670bfb44e70e84235288e91db0ef219e423628746f28e67cfcR37-R38)
[[2]](diffhunk://#diff-511713d04e0545670bfb44e70e84235288e91db0ef219e423628746f28e67cfcR114-R115)
### Endpoint Configuration Enhancements:
* Improved endpoint management by introducing functional group
overrides, tool group fallbacks, and alternative tool group handling
(`EndpointConfiguration.java`).
[[1]](diffhunk://#diff-d6f76bfe88b1a7f347c10cc6956f8e7874b66772c6ac0ac432bedd3c7e8c372dR24-R25)
[[2]](diffhunk://#diff-d6f76bfe88b1a7f347c10cc6956f8e7874b66772c6ac0ac432bedd3c7e8c372dR39-R178)
[[3]](diffhunk://#diff-d6f76bfe88b1a7f347c10cc6956f8e7874b66772c6ac0ac432bedd3c7e8c372dL264-R357)
[[4]](diffhunk://#diff-d6f76bfe88b1a7f347c10cc6956f8e7874b66772c6ac0ac432bedd3c7e8c372dR399-R437)
* Added new endpoint groups and alternatives for multi-tool endpoints
like "repair," "compress-pdf," and "ocr-pdf," supporting both
Ghostscript and OCRmyPDF.
[[1]](diffhunk://#diff-d6f76bfe88b1a7f347c10cc6956f8e7874b66772c6ac0ac432bedd3c7e8c372dL215-L216)
[[2]](diffhunk://#diff-d6f76bfe88b1a7f347c10cc6956f8e7874b66772c6ac0ac432bedd3c7e8c372dL264-R357)
### Dockerfile Updates:
* Removed `qpdf` and added `unpaper` and `ocrmypdf` to the package list
in `Dockerfile` and `Dockerfile.fat` to enable advanced OCR and PDF
manipulation features.
[[1]](diffhunk://#diff-dd2c0eb6ea5cfc6c4bd4eac30934e2d5746747af48fef6da689e85b752f39557L54)
[[2]](diffhunk://#diff-dd2c0eb6ea5cfc6c4bd4eac30934e2d5746747af48fef6da689e85b752f39557R71-R75)
[[3]](diffhunk://#diff-571631582b988e88c52c86960cc083b0b8fa63cf88f056f26e9e684195221c27L79-R89)
### Code Cleanup and Refactoring:
* Refactored `TempFileRegistry` to improve readability by simplifying
variable initialization.
[[1]](diffhunk://#diff-ed08845255ade38ea6e28a959ed967957611025fb7f8f5dcf4543b5f200a5fecL12)
[[2]](diffhunk://#diff-ed08845255ade38ea6e28a959ed967957611025fb7f8f5dcf4543b5f200a5fecL30-R29)
* Updated `CompressController` to dynamically check tool group
availability using the new endpoint configuration logic.
[[1]](diffhunk://#diff-fc8dc1845d34077a089d9265521ce8c8d6104a8b79137e1de5310e30ffb0348aR50)
[[2]](diffhunk://#diff-fc8dc1845d34077a089d9265521ce8c8d6104a8b79137e1de5310e30ffb0348aR65-R76)
These changes collectively enhance the application's capabilities for
PDF processing, improve dependency management, and streamline endpoint
handling.
---
## Checklist
### General
- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings
### Documentation
- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)
### UI Changes (if applicable)
- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)
### Testing (if applicable)
- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing)
for more details.
2025-07-01 14:50:56 +01:00
|
|
|
<div class="mb-3" role="group" aria-labelledby="ocrOptionsLabel" th:if="${@endpointConfiguration.isGroupEnabled('OCRmyPDF')}">
  <!-- Extra switches, rendered only while the OCRmyPDF group is enabled. The heading is not
       the label of any single checkbox, so it names the whole group via aria-labelledby. -->
  <label class="form-label" id="ocrOptionsLabel">OCR Options</label>
  <div class="form-check">
    <input type="checkbox" class="form-check-input" id="sidecar" name="sidecar" value="true">
    <label class="form-check-label" for="sidecar">Include OCR text in sidecar text file</label>
  </div>
  <div class="form-check">
    <input type="checkbox" class="form-check-input" id="deskew" name="deskew" value="true">
    <label class="form-check-label" for="deskew">Deskew input file</label>
  </div>
  <div class="form-check">
    <input type="checkbox" class="form-check-input" id="clean" name="clean" value="true">
    <label class="form-check-label" for="clean">Clean input file</label>
  </div>
  <div class="form-check">
    <input type="checkbox" class="form-check-input" id="cleanFinal" name="cleanFinal" value="true">
    <label class="form-check-label" for="cleanFinal">Clean final output</label>
  </div>
  <div class="form-check">
    <input type="checkbox" class="form-check-input" id="removeImagesAfter" name="removeImagesAfter" value="true">
    <label class="form-check-label" for="removeImagesAfter">Remove images from output PDF</label>
  </div>
</div>
|
|
|
|
<br>
|
2024-02-16 22:49:06 +01:00
|
|
|
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{ocr.submit}"></button>
|
|
|
|
</form>
|
2025-05-27 13:50:16 +01:00
|
|
|
<script th:inline="javascript">
|
|
|
|
// Get language translations from Thymeleaf messages
// `languageTranslations` maps a language code to the text stored under the message
// key "lang.<code>". The comment-wrapped markers below are Thymeleaf inline syntax:
// when the template is rendered, the assignment is repeated once for every entry of
// the server-side `languages` list.
|
|
|
|
const languageTranslations = {};
|
|
|
|
/*[# th:each="lang : ${languages}"]*/
|
|
|
|
languageTranslations['[(${lang})]'] = /*[[#{${'lang.' + lang}}]]*/[(${lang})];
|
|
|
|
/*[/]*/
|
|
|
|
|
|
|
|
// Lookup table: two-letter UI language code -> Tesseract language code.
// Codes that are not listed here resolve to `undefined`.
const localeToTesseract = {
  en: 'eng', fr: 'fra', de: 'deu', es: 'spa',
  it: 'ita', pt: 'por', ru: 'rus', zh: 'chi_sim',
  ja: 'jpn', ko: 'kor', ar: 'ara', hi: 'hin',
  nl: 'nld', cs: 'ces', pl: 'pol', tr: 'tur',
  uk: 'ukr', vi: 'vie', sv: 'swe', no: 'nor',
  fi: 'fin', da: 'dan', el: 'ell', he: 'heb',
  hu: 'hun', bg: 'bul', ro: 'ron', hr: 'hrv',
  sk: 'slk', id: 'ind', th: 'tha', sl: 'slv'
};
|
|
|
|
|
2025-05-27 13:50:16 +01:00
|
|
|
/**
 * Looks up the display name for a language code.
 *
 * @param {string} shortCode - key to look up in `languageTranslations`.
 * @returns {string} the stored name, or `shortCode` itself when the stored
 *   value is missing or falsy.
 */
function getTranslatedLanguageName(shortCode) {
  const translated = languageTranslations[shortCode];
  return translated ? translated : shortCode;
}
|
|
|
|
|
2025-05-27 13:50:16 +01:00
|
|
|
/**
 * Relabels and reorders the OCR language checkboxes inside `#languages`.
 *
 * 1. Every row's label text is replaced with the translated language name.
 * 2. The UI language is taken from the `data-language` attribute of `<html>`,
 *    falling back to its `lang` attribute and then to the browser language, and
 *    is mapped to a Tesseract code through `localeToTesseract`.
 * 3. Rows are re-appended with that language first, English second, and all
 *    remaining languages sorted alphabetically by their displayed name.
 *
 * Does nothing when the container or its `.form-check` rows are missing.
 */
function prioritizeLanguages() {
  const languageContainer = document.getElementById('languages');
  if (!languageContainer) return;

  const formChecks = Array.from(languageContainer.getElementsByClassName('form-check'));
  if (formChecks.length === 0) return;

  // Ids and label targets have the form "language-<code>". Only the fixed prefix is
  // stripped, so a code that itself contains a hyphen is kept intact instead of
  // being cut at its first hyphen.
  const ID_PREFIX = 'language-';
  const codeFromId = (id) => {
    const value = id || '';
    return value.startsWith(ID_PREFIX) ? value.slice(ID_PREFIX.length) : value;
  };

  // Update all labels with translated language names.
  formChecks.forEach((element) => {
    const label = element.querySelector('label');
    // A label without a `for` target carries no language code; leave its text alone.
    if (!label || !label.getAttribute('for')) return;
    label.textContent = getTranslatedLanguageName(codeFromId(label.getAttribute('for')));
  });

  // Get browser/UI language.
  const browserLanguage = document.documentElement.lang || navigator.language || navigator.userLanguage;
  const uiLanguage = document.documentElement.getAttribute('data-language') || browserLanguage;

  // Extract the primary language code ("en" out of "en_GB" or "en-GB").
  const primaryLanguageCode = (uiLanguage || '').split(/[-_]/)[0].toLowerCase();
  const tesseractPrimaryCode = localeToTesseract[primaryLanguageCode];

  // Priority order: the UI language first (when it is known), then English.
  const priorityLanguages = [];
  if (tesseractPrimaryCode) {
    priorityLanguages.push(tesseractPrimaryCode);
  }
  if (tesseractPrimaryCode !== 'eng') {
    priorityLanguages.push('eng');
  }

  // Position in the priority list; every non-priority language shares the rank just
  // past the end of the list, so ties fall through to the alphabetical comparison.
  const rankOf = (code) => {
    const index = priorityLanguages.indexOf(code);
    return index === -1 ? priorityLanguages.length : index;
  };

  // Sort the rows (in place).
  const sortedElements = formChecks.sort((a, b) => {
    const aInput = a.querySelector('input');
    const bInput = b.querySelector('input');
    if (!aInput || !bInput) return 0;

    const aLangCode = codeFromId(aInput.id);
    const bLangCode = codeFromId(bInput.id);

    const rankDifference = rankOf(aLangCode) - rankOf(bLangCode);
    if (rankDifference !== 0) return rankDifference;

    // Same rank: sort alphabetically by the displayed name.
    return getTranslatedLanguageName(aLangCode).localeCompare(getTranslatedLanguageName(bLangCode));
  });

  // Clear the container and append the rows in their new order.
  languageContainer.innerHTML = '';
  sortedElements.forEach((element) => {
    languageContainer.appendChild(element);
  });
}
|
|
|
|
|
|
|
|
// Run the language relabelling and reordering once the document has been parsed.
document.addEventListener('DOMContentLoaded', function onDomReady() {
  prioritizeLanguages();
});
|
|
|
|
</script>
|
|
|
|
<p th:text="#{ocr.help}"></p>
|
2024-12-30 15:49:28 +00:00
|
|
|
<a href="https://docs.stirlingpdf.com/Advanced%20Configuration/OCR">https://docs.stirlingpdf.com/Advanced%20Configuration/OCR</a>
|
2023-03-20 21:55:11 +00:00
|
|
|
</div>
|
2024-02-16 22:49:06 +01:00
|
|
|
</div>
|
2023-03-20 21:55:11 +00:00
|
|
|
</div>
|
2024-02-16 22:49:06 +01:00
|
|
|
</div>
|
|
|
|
<th:block th:insert="~{fragments/footer.html :: footer}"></th:block>
|
2023-03-20 21:55:11 +00:00
|
|
|
</div>
|
2024-02-16 22:49:06 +01:00
|
|
|
</body>
|
2024-03-21 21:58:01 +01:00
|
|
|
</html>
|