diff --git a/.github/workflows/PR-Demo-Comment-with-react.yml b/.github/workflows/PR-Demo-Comment-with-react.yml index 8bb7475e3..8850981bb 100644 --- a/.github/workflows/PR-Demo-Comment-with-react.yml +++ b/.github/workflows/PR-Demo-Comment-with-react.yml @@ -38,7 +38,8 @@ jobs: pr_ref: ${{ steps.get-pr-info.outputs.ref }} comment_id: ${{ github.event.comment.id }} disable_security: ${{ steps.check-security-flag.outputs.disable_security }} - + enable_pro: ${{ steps.check-pro-flag.outputs.enable_pro }} + enable_enterprise: ${{ steps.check-pro-flag.outputs.enable_enterprise }} steps: - name: Harden Runner uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1 @@ -98,6 +99,25 @@ jobs: echo "disable_security=true" >> $GITHUB_OUTPUT fi + - name: Check for pro flag + id: check-pro-flag + env: + COMMENT_BODY: ${{ github.event.comment.body }} + run: | + if [[ "$COMMENT_BODY" == *"pro"* ]] || [[ "$COMMENT_BODY" == *"premium"* ]]; then + echo "pro flags detected in comment" + echo "enable_pro=true" >> $GITHUB_OUTPUT + echo "enable_enterprise=false" >> $GITHUB_OUTPUT + elif [[ "$COMMENT_BODY" == *"enterprise"* ]]; then + echo "enterprise flags detected in comment" + echo "enable_enterprise=true" >> $GITHUB_OUTPUT + echo "enable_pro=false" >> $GITHUB_OUTPUT + else + echo "No pro or enterprise flags detected in comment" + echo "enable_pro=false" >> $GITHUB_OUTPUT + echo "enable_enterprise=false" >> $GITHUB_OUTPUT + fi + - name: Add 'in_progress' reaction to comment id: add-eyes-reaction uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 @@ -209,6 +229,21 @@ jobs: SECURITY_STATUS="Security Disabled" fi + # Set pro/enterprise settings (enterprise implies pro) + if [ "${{ needs.check-comment.outputs.enable_enterprise }}" == "true" ]; then + PREMIUM_ENABLED="true" + PREMIUM_KEY="${{ secrets.ENTERPRISE_KEY }}" + PREMIUM_PROFEATURES_AUDIT_ENABLED="true" + elif [ "${{ needs.check-comment.outputs.enable_pro }}" == "true" ]; then + PREMIUM_ENABLED="true" + PREMIUM_KEY="${{ secrets.PREMIUM_KEY }}" + PREMIUM_PROFEATURES_AUDIT_ENABLED="true" + else + PREMIUM_ENABLED="false" + PREMIUM_KEY="" + PREMIUM_PROFEATURES_AUDIT_ENABLED="false" + fi + # First create the docker-compose content locally cat > docker-compose.yml << EOF version: '3.3' @@ -232,6 +267,9 @@ jobs: SYSTEM_MAXFILESIZE: "100" METRICS_ENABLED: "true" SYSTEM_GOOGLEVISIBILITY: "false" + PREMIUM_KEY: "${PREMIUM_KEY}" + PREMIUM_ENABLED: "${PREMIUM_ENABLED}" + PREMIUM_PROFEATURES_AUDIT_ENABLED: "${PREMIUM_PROFEATURES_AUDIT_ENABLED}" restart: on-failure:5 EOF diff --git a/HowToAddNewLanguage.md b/HowToAddNewLanguage.md index cdc4729a2..94caca12a 100644 --- a/HowToAddNewLanguage.md +++ b/HowToAddNewLanguage.md @@ -61,8 +61,16 @@ Make sure to place the entry under the correct language section. This helps main #### Windows command -```ps -python .github/scripts/check_language_properties.py --reference-file src\main\resources\messages_en_GB.properties --branch "" --files src\main\resources\messages_pl_PL.properties +```powershell +python .github/scripts/check_language_properties.py --reference-file stirling-pdf\src\main\resources\messages_en_GB.properties --branch "" --files stirling-pdf\src\main\resources\messages_pl_PL.properties -python .github/scripts/check_language_properties.py --reference-file src\main\resources\messages_en_GB.properties --branch "" --check-file src\main\resources\messages_pl_PL.properties +python .github/scripts/check_language_properties.py --reference-file stirling-pdf\src\main\resources\messages_en_GB.properties --branch "" --check-file stirling-pdf\src\main\resources\messages_pl_PL.properties +``` + +#### Linux command + +```bash +python3 .github/scripts/check_language_properties.py --reference-file stirling-pdf/src/main/resources/messages_en_GB.properties --branch "" --files stirling-pdf/src/main/resources/messages_pl_PL.properties + +python3 .github/scripts/check_language_properties.py --reference-file stirling-pdf/src/main/resources/messages_en_GB.properties --branch "" --check-file stirling-pdf/src/main/resources/messages_pl_PL.properties ``` diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java index 9f8c19158..a289a5421 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java @@ -1,16 +1,61 @@ package stirling.software.SPDF.controller.api.converters; +import java.awt.Color; +import java.io.ByteArrayOutputStream; import io.github.pixee.security.Filenames; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; import java.io.File; +import java.io.IOException; +import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; +import java.util.Calendar; +import java.util.Collections; +import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.TimeZone; + import lombok.extern.slf4j.Slf4j; import org.apache.commons.io.FileUtils; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdfwriter.compress.CompressParameters; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentCatalog; +import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.common.PDMetadata; +import org.apache.pdfbox.pdmodel.common.PDStream; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; +import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; +import org.apache.pdfbox.pdmodel.font.PDType0Font; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; +import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup; +import org.apache.pdfbox.pdmodel.interactive.viewerpreferences.PDViewerPreferences; +import org.apache.xmpbox.XMPMetadata; +import org.apache.xmpbox.schema.AdobePDFSchema; +import org.apache.xmpbox.schema.DublinCoreSchema; +import org.apache.xmpbox.schema.PDFAIdentificationSchema; +import org.apache.xmpbox.schema.XMPBasicSchema; +import org.apache.xmpbox.xml.DomXmpParser; +import org.apache.xmpbox.xml.XmpSerializer; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; @@ -56,54 +101,37 @@ public class ConvertPDFToPDFA { : originalFileName; Path tempInputFile = null; - Path tempOutputDir = null; byte[] fileBytes; + Path loPdfPath = null; // Used for LibreOffice conversion output + File preProcessedFile = null; + int pdfaPart = 2; try { // Save uploaded file to temp location tempInputFile = Files.createTempFile("input_", ".pdf"); inputFile.transferTo(tempInputFile); - // Create temp output directory - tempOutputDir = Files.createTempDirectory("output_"); - - // Determine PDF/A filter based on requested format - String pdfFilter = - "pdfa".equals(outputFormat) - ? "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"2\"}}" - : "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"1\"}}"; - - // Prepare LibreOffice command - List command = - new ArrayList<>( - Arrays.asList( - "soffice", - "--headless", - "--nologo", - "--convert-to", - pdfFilter, - "--outdir", - tempOutputDir.toString(), - tempInputFile.toString())); - - ProcessExecutorResult returnCode = - ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE) - .runCommandWithOutputHandling(command); - - if (returnCode.getRc() != 0) { - log.error("PDF/A conversion failed with return code: {}", returnCode.getRc()); - throw new RuntimeException("PDF/A conversion failed"); + // Branch conversion based on desired output PDF/A format + if ("pdfa".equals(outputFormat)) { + preProcessedFile = tempInputFile.toFile(); + } else { + pdfaPart = 1; + preProcessedFile = preProcessHighlights(tempInputFile.toFile()); } - - // Get the output file - File[] outputFiles = tempOutputDir.toFile().listFiles(); - if (outputFiles == null || outputFiles.length != 1) { - throw new RuntimeException( - "Expected exactly one output file but found " - + (outputFiles == null ? "none" : outputFiles.length)); + Set missingFonts = new HashSet<>(); + boolean needImgs = false; + try (PDDocument doc = Loader.loadPDF(preProcessedFile)) { + missingFonts = findUnembeddedFontNames(doc); + needImgs = (pdfaPart == 1) && hasTransparentImages(doc); + if (!missingFonts.isEmpty() || needImgs) { + // Run LibreOffice conversion to get flattened images and embedded fonts + loPdfPath = runLibreOfficeConversion(preProcessedFile.toPath(), pdfaPart); + } } + fileBytes = + convertToPdfA( + preProcessedFile.toPath(), loPdfPath, pdfaPart, missingFonts, needImgs); - fileBytes = FileUtils.readFileToByteArray(outputFiles[0]); String outputFilename = baseFileName + "_PDFA.pdf"; return WebResponseUtils.bytesToWebResponse( @@ -114,9 +142,557 @@ public class ConvertPDFToPDFA { if (tempInputFile != null) { Files.deleteIfExists(tempInputFile); } - if (tempOutputDir != null) { - FileUtils.deleteDirectory(tempOutputDir.toFile()); + if (loPdfPath != null && loPdfPath.getParent() != null) { + FileUtils.deleteDirectory(loPdfPath.getParent().toFile()); + } + if (preProcessedFile != null) { + Files.deleteIfExists(preProcessedFile.toPath()); } } } + + /** + * Merge fonts & flattened images from loPdfPath into basePdfPath, then run the standard + * PDFBox/A pipeline. + * + * @param basePdfPath Path to the original (or highlight‐preprocessed) PDF + * @param loPdfPath Path to the LibreOffice–flattened PDF/A, or null if not used + * @param pdfaPart 1 (PDF/A-1B) or 2 (PDF/A-2B) + * @return the final PDF/A bytes + */ + private byte[] convertToPdfA( + Path basePdfPath, + Path loPdfPath, + int pdfaPart, + Set missingFonts, + boolean importImages) + throws Exception { + try (PDDocument baseDoc = Loader.loadPDF(basePdfPath.toFile())) { + + if (loPdfPath != null) { + try (PDDocument loDoc = Loader.loadPDF(loPdfPath.toFile())) { + if (!missingFonts.isEmpty()) { + embedMissingFonts(loDoc, baseDoc, missingFonts); + } + if (importImages) { + importFlattenedImages(loDoc, baseDoc); + } + } + } + return processWithPDFBox(baseDoc, pdfaPart); + } + } + + private byte[] processWithPDFBox(PDDocument document, int pdfaPart) throws Exception { + + removeElementsForPdfA(document, pdfaPart); + + mergeAndAddXmpMetadata(document, pdfaPart); + + addICCProfileIfNotPresent(document); + + // Mark the document as PDF/A + PDDocumentCatalog catalog = document.getDocumentCatalog(); + catalog.setMetadata( + document.getDocumentCatalog().getMetadata()); // Ensure metadata is linked + catalog.setViewerPreferences( + new PDViewerPreferences(catalog.getCOSObject())); // PDF/A best practice + document.getDocument().setVersion(pdfaPart == 1 ? 1.4f : 1.7f); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + if (pdfaPart == 1) { + document.save(baos, CompressParameters.NO_COMPRESSION); + } else { + document.save(baos); + } + + return baos.toByteArray(); + } + + private Path runLibreOfficeConversion(Path tempInputFile, int pdfaPart) throws Exception { + // Create temp output directory + Path tempOutputDir = Files.createTempDirectory("output_"); + + // Determine PDF/A filter based on requested format + String pdfFilter = + pdfaPart == 2 + ? "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"2\"}}" + : "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"1\"}}"; + + // Prepare LibreOffice command + List command = + new ArrayList<>( + Arrays.asList( + "soffice", + "--headless", + "--nologo", + "--convert-to", + pdfFilter, + "--outdir", + tempOutputDir.toString(), + tempInputFile.toString())); + + ProcessExecutorResult returnCode = + ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE) + .runCommandWithOutputHandling(command); + + if (returnCode.getRc() != 0) { + log.error("PDF/A conversion failed with return code: {}", returnCode.getRc()); + throw new RuntimeException("PDF/A conversion failed"); + } + + // Get the output file + File[] outputFiles = tempOutputDir.toFile().listFiles(); + if (outputFiles == null || outputFiles.length != 1) { + throw new RuntimeException( + "Expected one output PDF, found " + + (outputFiles == null ? "none" : outputFiles.length)); + } + return outputFiles[0].toPath(); + } + + private void embedMissingFonts(PDDocument loDoc, PDDocument baseDoc, Set missingFonts) + throws IOException { + List loPages = new ArrayList<>(); + loDoc.getPages().forEach(loPages::add); + List basePages = new ArrayList<>(); + baseDoc.getPages().forEach(basePages::add); + + for (int i = 0; i < loPages.size(); i++) { + PDResources loRes = loPages.get(i).getResources(); + PDResources baseRes = basePages.get(i).getResources(); + + for (COSName fontKey : loRes.getFontNames()) { + PDFont loFont = loRes.getFont(fontKey); + if (loFont == null) continue; + + String psName = loFont.getName(); + if (!missingFonts.contains(psName)) continue; + + PDFontDescriptor desc = loFont.getFontDescriptor(); + if (desc == null) continue; + + PDStream fontStream = null; + if (desc.getFontFile() != null) { + fontStream = desc.getFontFile(); + } else if (desc.getFontFile2() != null) { + fontStream = desc.getFontFile2(); + } else if (desc.getFontFile3() != null) { + fontStream = desc.getFontFile3(); + } + if (fontStream == null) continue; + + try (InputStream in = fontStream.createInputStream()) { + PDFont newFont = null; + try { + newFont = PDType0Font.load(baseDoc, in, false); + } catch (IOException e1) { + try { + newFont = PDTrueTypeFont.load(baseDoc, in, null); + } catch (IOException | IllegalArgumentException e2) { + log.error("Could not embed font {}: {}", psName, e2.getMessage()); + continue; + } + } + if (newFont != null) { + baseRes.put(fontKey, newFont); + } + } + } + } + } + + private Set findUnembeddedFontNames(PDDocument doc) throws IOException { + Set missing = new HashSet<>(); + for (PDPage page : doc.getPages()) { + PDResources res = page.getResources(); + for (COSName name : res.getFontNames()) { + PDFont font = res.getFont(name); + if (font != null && !font.isEmbedded()) { + missing.add(font.getName()); + } + } + } + return missing; + } + + private void importFlattenedImages(PDDocument loDoc, PDDocument baseDoc) throws IOException { + List loPages = new ArrayList<>(); + loDoc.getPages().forEach(loPages::add); + List basePages = new ArrayList<>(); + baseDoc.getPages().forEach(basePages::add); + + for (int i = 0; i < loPages.size(); i++) { + PDPage loPage = loPages.get(i); + PDPage basePage = basePages.get(i); + + PDResources loRes = loPage.getResources(); + PDResources baseRes = basePage.getResources(); + Set toReplace = detectTransparentXObjects(basePage); + + for (COSName name : toReplace) { + PDXObject loXo = loRes.getXObject(name); + if (!(loXo instanceof PDImageXObject img)) continue; + + PDImageXObject newImg = LosslessFactory.createFromImage(baseDoc, img.getImage()); + + // replace the resource under the same name + baseRes.put(name, newImg); + } + } + } + + private Set detectTransparentXObjects(PDPage page) { + Set transparentObjects = new HashSet<>(); + PDResources res = page.getResources(); + if (res == null) return transparentObjects; + + for (COSName name : res.getXObjectNames()) { + try { + PDXObject xo = res.getXObject(name); + if (xo instanceof PDImageXObject img) { + COSDictionary d = img.getCOSObject(); + if (d.containsKey(COSName.SMASK) + || isTransparencyGroup(d) + || d.getBoolean(COSName.INTERPOLATE, false)) { + transparentObjects.add(name); + } + } + } catch (IOException ioe) { + log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage()); + } + } + return transparentObjects; + } + + private boolean isTransparencyGroup(COSDictionary dict) { + COSBase g = dict.getDictionaryObject(COSName.GROUP); + return g instanceof COSDictionary gd + && COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S)); + } + + private boolean hasTransparentImages(PDDocument doc) { + for (PDPage page : doc.getPages()) { + PDResources res = page.getResources(); + if (res == null) continue; + for (COSName name : res.getXObjectNames()) { + try { + PDXObject xo = res.getXObject(name); + if (xo instanceof PDImageXObject img) { + COSDictionary dict = img.getCOSObject(); + if (dict.containsKey(COSName.SMASK)) return true; + COSBase g = dict.getDictionaryObject(COSName.GROUP); + if (g instanceof COSDictionary gd + && COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S))) { + return true; + } + if (dict.getBoolean(COSName.INTERPOLATE, false)) return true; + } + } catch (IOException ioe) { + log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage()); + } + } + } + return false; + } + + private void sanitizePdfA(COSBase base, PDResources resources, int pdfaPart) { + if (base instanceof COSDictionary dict) { + if (pdfaPart == 1) { + // Remove transparency-related elements + COSBase group = dict.getDictionaryObject(COSName.GROUP); + if (group instanceof COSDictionary gDict + && COSName.TRANSPARENCY.equals(gDict.getCOSName(COSName.S))) { + dict.removeItem(COSName.GROUP); + } + + dict.removeItem(COSName.SMASK); + // Transparency blending constants (/CA, /ca) — disallowed in PDF/A-1 + dict.removeItem(COSName.CA); + dict.removeItem(COSName.getPDFName("ca")); + } + + // Interpolation (non-deterministic image scaling) — required to be false + if (dict.containsKey(COSName.INTERPOLATE) + && dict.getBoolean(COSName.INTERPOLATE, true)) { + dict.setBoolean(COSName.INTERPOLATE, false); + } + + // Remove common forbidden features (for PDF/A 1 and 2) + dict.removeItem(COSName.JAVA_SCRIPT); + dict.removeItem(COSName.getPDFName("JS")); + dict.removeItem(COSName.getPDFName("RichMedia")); + dict.removeItem(COSName.getPDFName("Movie")); + dict.removeItem(COSName.getPDFName("Sound")); + dict.removeItem(COSName.getPDFName("Launch")); + dict.removeItem(COSName.URI); + dict.removeItem(COSName.getPDFName("GoToR")); + dict.removeItem(COSName.EMBEDDED_FILES); + dict.removeItem(COSName.FILESPEC); + + // Recurse through all entries in the dictionary + for (Map.Entry entry : dict.entrySet()) { + sanitizePdfA(entry.getValue(), resources, pdfaPart); + } + + } else if (base instanceof COSArray arr) { + // Recursively sanitize each item in the array + for (COSBase item : arr) { + sanitizePdfA(item, resources, pdfaPart); + } + } + } + + private void removeElementsForPdfA(PDDocument doc, int pdfaPart) { + + if (pdfaPart == 1) { + // Remove Optional Content (Layers) - not allowed in PDF/A-1 + doc.getDocumentCatalog().getCOSObject().removeItem(COSName.getPDFName("OCProperties")); + } + + for (PDPage page : doc.getPages()) { + if (pdfaPart == 1) { + page.setAnnotations(Collections.emptyList()); + } + PDResources res = page.getResources(); + // Clean page-level dictionary + sanitizePdfA(page.getCOSObject(), res, pdfaPart); + + // sanitize each Form XObject + if (res != null) { + for (COSName name : res.getXObjectNames()) { + try { + PDXObject xo = res.getXObject(name); + if (xo instanceof PDFormXObject form) { + sanitizePdfA(form.getCOSObject(), res, pdfaPart); + } else if (xo instanceof PDImageXObject img) { + sanitizePdfA(img.getCOSObject(), res, pdfaPart); + } + } catch (IOException ioe) { + log.error("Cannot load XObject {}: {}", name.getName(), ioe.getMessage()); + } + } + } + } + } + + /** Embbeds the XMP metadata required for PDF/A compliance. */ + private void mergeAndAddXmpMetadata(PDDocument document, int pdfaPart) throws Exception { + PDMetadata existingMetadata = document.getDocumentCatalog().getMetadata(); + XMPMetadata xmp; + + // Load existing XMP if available + if (existingMetadata != null) { + try (InputStream xmpStream = existingMetadata.createInputStream()) { + DomXmpParser parser = new DomXmpParser(); + parser.setStrictParsing(false); + xmp = parser.parse(xmpStream); + } catch (Exception e) { + xmp = XMPMetadata.createXMPMetadata(); + } + } else { + xmp = XMPMetadata.createXMPMetadata(); + } + + PDDocumentInformation docInfo = document.getDocumentInformation(); + if (docInfo == null) { + docInfo = new PDDocumentInformation(); + } + + String originalCreator = Optional.ofNullable(docInfo.getCreator()).orElse("Unknown"); + String originalProducer = Optional.ofNullable(docInfo.getProducer()).orElse("Unknown"); + + // Only keep the original creator so it can match xmp creator tool for compliance + DublinCoreSchema dcSchema = xmp.getDublinCoreSchema(); + if (dcSchema != null) { + List existingCreators = dcSchema.getCreators(); + if (existingCreators != null) { + for (String creator : new ArrayList<>(existingCreators)) { + dcSchema.removeCreator(creator); + } + } + } else { + dcSchema = xmp.createAndAddDublinCoreSchema(); + } + dcSchema.addCreator(originalCreator); + + PDFAIdentificationSchema pdfaSchema = + (PDFAIdentificationSchema) xmp.getSchema(PDFAIdentificationSchema.class); + if (pdfaSchema == null) { + pdfaSchema = xmp.createAndAddPDFAIdentificationSchema(); + } + pdfaSchema.setPart(pdfaPart); + pdfaSchema.setConformance("B"); + + XMPBasicSchema xmpBasicSchema = xmp.getXMPBasicSchema(); + if (xmpBasicSchema == null) { + xmpBasicSchema = xmp.createAndAddXMPBasicSchema(); + } + + AdobePDFSchema adobePdfSchema = xmp.getAdobePDFSchema(); + if (adobePdfSchema == null) { + adobePdfSchema = xmp.createAndAddAdobePDFSchema(); + } + + docInfo.setCreator(originalCreator); + xmpBasicSchema.setCreatorTool(originalCreator); + + docInfo.setProducer(originalProducer); + adobePdfSchema.setProducer(originalProducer); + + String originalAuthor = docInfo.getAuthor(); + if (originalAuthor != null && !originalAuthor.isBlank()) { + docInfo.setAuthor(null); + // If the author is set, we keep it in the XMP metadata + if (!originalCreator.equals(originalAuthor)) { + dcSchema.addCreator(originalAuthor); + } + } + + String title = docInfo.getTitle(); + if (title != null && !title.isBlank()) { + dcSchema.setTitle(title); + } + String subject = docInfo.getSubject(); + if (subject != null && !subject.isBlank()) { + dcSchema.addSubject(subject); + } + String keywords = docInfo.getKeywords(); + if (keywords != null && !keywords.isBlank()) { + adobePdfSchema.setKeywords(keywords); + } + + // Set creation and modification dates + Calendar now = Calendar.getInstance(TimeZone.getTimeZone("UTC")); + Calendar originalCreationDate = docInfo.getCreationDate(); + if (originalCreationDate == null) { + originalCreationDate = now; + } + docInfo.setCreationDate(originalCreationDate); + xmpBasicSchema.setCreateDate(originalCreationDate); + + docInfo.setModificationDate(now); + xmpBasicSchema.setModifyDate(now); + xmpBasicSchema.setMetadataDate(now); + + // Serialize the created metadata so it can be attached to the existent metadata + ByteArrayOutputStream xmpOut = new ByteArrayOutputStream(); + new XmpSerializer().serialize(xmp, xmpOut, true); + + PDMetadata newMetadata = new PDMetadata(document); + newMetadata.importXMPMetadata(xmpOut.toByteArray()); + document.getDocumentCatalog().setMetadata(newMetadata); + } + + private void addICCProfileIfNotPresent(PDDocument document) throws Exception { + if (document.getDocumentCatalog().getOutputIntents().isEmpty()) { + try (InputStream colorProfile = getClass().getResourceAsStream("/icc/sRGB2014.icc")) { + PDOutputIntent outputIntent = new PDOutputIntent(document, colorProfile); + outputIntent.setInfo("sRGB IEC61966-2.1"); + outputIntent.setOutputCondition("sRGB IEC61966-2.1"); + outputIntent.setOutputConditionIdentifier("sRGB IEC61966-2.1"); + outputIntent.setRegistryName("http://www.color.org"); + document.getDocumentCatalog().addOutputIntent(outputIntent); + } catch (Exception e) { + log.error("Failed to load ICC profile: {}", e.getMessage()); + } + } + } + + private File preProcessHighlights(File inputPdf) throws Exception { + + try (PDDocument document = Loader.loadPDF(inputPdf)) { + + for (PDPage page : document.getPages()) { + // Retrieve the annotations on the page. + List annotations = page.getAnnotations(); + for (PDAnnotation annot : annotations) { + // Process only highlight annotations. + if ("Highlight".equals(annot.getSubtype()) + && annot instanceof PDAnnotationTextMarkup highlight) { + // Create a new appearance stream with the same bounding box. + float[] colorComponents = + highlight.getColor() != null + ? highlight.getColor().getComponents() + : new float[] {1f, 1f, 0f}; + Color highlightColor = + new Color( + colorComponents[0], colorComponents[1], colorComponents[2]); + + float[] quadPoints = highlight.getQuadPoints(); + if (quadPoints != null) { + try (PDPageContentStream cs = + new PDPageContentStream( + document, + page, + PDPageContentStream.AppendMode.PREPEND, + true, + true)) { + + cs.setStrokingColor(highlightColor); + cs.setLineWidth(0.05f); + float spacing = 2f; + // Draw diagonal lines across the highlight area to simulate + // transparency. + for (int i = 0; i < quadPoints.length; i += 8) { + float minX = + Math.min( + Math.min(quadPoints[i], quadPoints[i + 2]), + Math.min(quadPoints[i + 4], quadPoints[i + 6])); + float maxX = + Math.max( + Math.max(quadPoints[i], quadPoints[i + 2]), + Math.max(quadPoints[i + 4], quadPoints[i + 6])); + float minY = + Math.min( + Math.min(quadPoints[i + 1], quadPoints[i + 3]), + Math.min(quadPoints[i + 5], quadPoints[i + 7])); + float maxY = + Math.max( + Math.max(quadPoints[i + 1], quadPoints[i + 3]), + Math.max(quadPoints[i + 5], quadPoints[i + 7])); + + float width = maxX - minX; + float height = maxY - minY; + + for (float y = minY; y <= maxY; y += spacing) { + float len = Math.min(width, maxY - y); + cs.moveTo(minX, y); + cs.lineTo(minX + len, y + len); + } + for (float x = minX + spacing; x <= maxX; x += spacing) { + float len = Math.min(maxX - x, height); + cs.moveTo(x, minY); + cs.lineTo(x + len, minY + len); + } + } + + cs.stroke(); + } + } + + page.getAnnotations().remove(highlight); + COSDictionary pageDict = page.getCOSObject(); + + if (pageDict.containsKey(COSName.GROUP)) { + COSDictionary groupDict = + (COSDictionary) pageDict.getDictionaryObject(COSName.GROUP); + + if (groupDict != null) { + if (COSName.TRANSPARENCY + .getName() + .equalsIgnoreCase(groupDict.getNameAsString(COSName.S))) { + pageDict.removeItem(COSName.GROUP); + } + } + } + } + } + } + // Save the modified document to a temporary file. + File preProcessedFile = Files.createTempFile("preprocessed_", ".pdf").toFile(); + document.save(preProcessedFile); + return preProcessedFile; + } + } } diff --git a/stirling-pdf/src/main/resources/icc/sRGB2014.icc b/stirling-pdf/src/main/resources/icc/sRGB2014.icc new file mode 100644 index 000000000..49afbfef1 Binary files /dev/null and b/stirling-pdf/src/main/resources/icc/sRGB2014.icc differ