Merge branch 'main' into audit2

This commit is contained in:
Anthony Stirling 2025-06-17 17:41:03 +01:00 committed by GitHub
commit 911c894023
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 666 additions and 44 deletions

View File

@ -38,7 +38,8 @@ jobs:
pr_ref: ${{ steps.get-pr-info.outputs.ref }} pr_ref: ${{ steps.get-pr-info.outputs.ref }}
comment_id: ${{ github.event.comment.id }} comment_id: ${{ github.event.comment.id }}
disable_security: ${{ steps.check-security-flag.outputs.disable_security }} disable_security: ${{ steps.check-security-flag.outputs.disable_security }}
enable_pro: ${{ steps.check-pro-flag.outputs.enable_pro }}
enable_enterprise: ${{ steps.check-pro-flag.outputs.enable_enterprise }}
steps: steps:
- name: Harden Runner - name: Harden Runner
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1 uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
@ -98,6 +99,25 @@ jobs:
echo "disable_security=true" >> $GITHUB_OUTPUT echo "disable_security=true" >> $GITHUB_OUTPUT
fi fi
- name: Check for pro flag
id: check-pro-flag
env:
COMMENT_BODY: ${{ github.event.comment.body }}
run: |
if [[ "$COMMENT_BODY" == *"pro"* ]] || [[ "$COMMENT_BODY" == *"premium"* ]]; then
echo "pro flags detected in comment"
echo "enable_pro=true" >> $GITHUB_OUTPUT
echo "enable_enterprise=false" >> $GITHUB_OUTPUT
elif [[ "$COMMENT_BODY" == *"enterprise"* ]]; then
echo "enterprise flags detected in comment"
echo "enable_enterprise=true" >> $GITHUB_OUTPUT
echo "enable_pro=false" >> $GITHUB_OUTPUT
else
echo "No pro or enterprise flags detected in comment"
echo "enable_pro=false" >> $GITHUB_OUTPUT
echo "enable_enterprise=false" >> $GITHUB_OUTPUT
fi
- name: Add 'in_progress' reaction to comment - name: Add 'in_progress' reaction to comment
id: add-eyes-reaction id: add-eyes-reaction
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
@ -209,6 +229,21 @@ jobs:
SECURITY_STATUS="Security Disabled" SECURITY_STATUS="Security Disabled"
fi fi
# Set pro/enterprise settings (enterprise implies pro)
if [ "${{ needs.check-comment.outputs.enable_enterprise }}" == "true" ]; then
PREMIUM_ENABLED="true"
PREMIUM_KEY="${{ secrets.ENTERPRISE_KEY }}"
PREMIUM_PROFEATURES_AUDIT_ENABLED="true"
elif [ "${{ needs.check-comment.outputs.enable_pro }}" == "true" ]; then
PREMIUM_ENABLED="true"
PREMIUM_KEY="${{ secrets.PREMIUM_KEY }}"
PREMIUM_PROFEATURES_AUDIT_ENABLED="true"
else
PREMIUM_ENABLED="false"
PREMIUM_KEY=""
PREMIUM_PROFEATURES_AUDIT_ENABLED="false"
fi
# First create the docker-compose content locally # First create the docker-compose content locally
cat > docker-compose.yml << EOF cat > docker-compose.yml << EOF
version: '3.3' version: '3.3'
@ -232,6 +267,9 @@ jobs:
SYSTEM_MAXFILESIZE: "100" SYSTEM_MAXFILESIZE: "100"
METRICS_ENABLED: "true" METRICS_ENABLED: "true"
SYSTEM_GOOGLEVISIBILITY: "false" SYSTEM_GOOGLEVISIBILITY: "false"
PREMIUM_KEY: "${PREMIUM_KEY}"
PREMIUM_ENABLED: "${PREMIUM_ENABLED}"
PREMIUM_PROFEATURES_AUDIT_ENABLED: "${PREMIUM_PROFEATURES_AUDIT_ENABLED}"
restart: on-failure:5 restart: on-failure:5
EOF EOF

View File

@ -61,8 +61,16 @@ Make sure to place the entry under the correct language section. This helps main
#### Windows command #### Windows command
```ps ```powershell
python .github/scripts/check_language_properties.py --reference-file src\main\resources\messages_en_GB.properties --branch "" --files src\main\resources\messages_pl_PL.properties python .github/scripts/check_language_properties.py --reference-file stirling-pdf\src\main\resources\messages_en_GB.properties --branch "" --files stirling-pdf\src\main\resources\messages_pl_PL.properties
python .github/scripts/check_language_properties.py --reference-file src\main\resources\messages_en_GB.properties --branch "" --check-file src\main\resources\messages_pl_PL.properties python .github/scripts/check_language_properties.py --reference-file stirling-pdf\src\main\resources\messages_en_GB.properties --branch "" --check-file stirling-pdf\src\main\resources\messages_pl_PL.properties
```
#### Linux command
```bash
python3 .github/scripts/check_language_properties.py --reference-file stirling-pdf/src/main/resources/messages_en_GB.properties --branch "" --files stirling-pdf/src/main/resources/messages_pl_PL.properties
python3 .github/scripts/check_language_properties.py --reference-file stirling-pdf/src/main/resources/messages_en_GB.properties --branch "" --check-file stirling-pdf/src/main/resources/messages_pl_PL.properties
``` ```

View File

@ -1,16 +1,61 @@
package stirling.software.SPDF.controller.api.converters; package stirling.software.SPDF.controller.api.converters;
import java.awt.Color;
import java.io.ByteArrayOutputStream;
import io.github.pixee.security.Filenames; import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag; import io.swagger.v3.oas.annotations.tags.Tag;
import java.io.File; import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.TimeZone;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.pdmodel.interactive.viewerpreferences.PDViewerPreferences;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.schema.AdobePDFSchema;
import org.apache.xmpbox.schema.DublinCoreSchema;
import org.apache.xmpbox.schema.PDFAIdentificationSchema;
import org.apache.xmpbox.schema.XMPBasicSchema;
import org.apache.xmpbox.xml.DomXmpParser;
import org.apache.xmpbox.xml.XmpSerializer;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
@ -56,54 +101,37 @@ public class ConvertPDFToPDFA {
: originalFileName; : originalFileName;
Path tempInputFile = null; Path tempInputFile = null;
Path tempOutputDir = null;
byte[] fileBytes; byte[] fileBytes;
Path loPdfPath = null; // Used for LibreOffice conversion output
File preProcessedFile = null;
int pdfaPart = 2;
try { try {
// Save uploaded file to temp location // Save uploaded file to temp location
tempInputFile = Files.createTempFile("input_", ".pdf"); tempInputFile = Files.createTempFile("input_", ".pdf");
inputFile.transferTo(tempInputFile); inputFile.transferTo(tempInputFile);
// Create temp output directory // Branch conversion based on desired output PDF/A format
tempOutputDir = Files.createTempDirectory("output_"); if ("pdfa".equals(outputFormat)) {
preProcessedFile = tempInputFile.toFile();
// Determine PDF/A filter based on requested format } else {
String pdfFilter = pdfaPart = 1;
"pdfa".equals(outputFormat) preProcessedFile = preProcessHighlights(tempInputFile.toFile());
? "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"2\"}}"
: "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"1\"}}";
// Prepare LibreOffice command
List<String> command =
new ArrayList<>(
Arrays.asList(
"soffice",
"--headless",
"--nologo",
"--convert-to",
pdfFilter,
"--outdir",
tempOutputDir.toString(),
tempInputFile.toString()));
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(command);
if (returnCode.getRc() != 0) {
log.error("PDF/A conversion failed with return code: {}", returnCode.getRc());
throw new RuntimeException("PDF/A conversion failed");
} }
Set<String> missingFonts = new HashSet<>();
// Get the output file boolean needImgs = false;
File[] outputFiles = tempOutputDir.toFile().listFiles(); try (PDDocument doc = Loader.loadPDF(preProcessedFile)) {
if (outputFiles == null || outputFiles.length != 1) { missingFonts = findUnembeddedFontNames(doc);
throw new RuntimeException( needImgs = (pdfaPart == 1) && hasTransparentImages(doc);
"Expected exactly one output file but found " if (!missingFonts.isEmpty() || needImgs) {
+ (outputFiles == null ? "none" : outputFiles.length)); // Run LibreOffice conversion to get flattened images and embedded fonts
loPdfPath = runLibreOfficeConversion(preProcessedFile.toPath(), pdfaPart);
}
} }
fileBytes =
convertToPdfA(
preProcessedFile.toPath(), loPdfPath, pdfaPart, missingFonts, needImgs);
fileBytes = FileUtils.readFileToByteArray(outputFiles[0]);
String outputFilename = baseFileName + "_PDFA.pdf"; String outputFilename = baseFileName + "_PDFA.pdf";
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(
@ -114,9 +142,557 @@ public class ConvertPDFToPDFA {
if (tempInputFile != null) { if (tempInputFile != null) {
Files.deleteIfExists(tempInputFile); Files.deleteIfExists(tempInputFile);
} }
if (tempOutputDir != null) { if (loPdfPath != null && loPdfPath.getParent() != null) {
FileUtils.deleteDirectory(tempOutputDir.toFile()); FileUtils.deleteDirectory(loPdfPath.getParent().toFile());
}
if (preProcessedFile != null) {
Files.deleteIfExists(preProcessedFile.toPath());
} }
} }
} }
    /**
     * Merge fonts & flattened images from loPdfPath into basePdfPath, then run the standard
     * PDFBox PDF/A pipeline (sanitize, XMP metadata, ICC output intent).
     *
     * @param basePdfPath Path to the original (or highlight-preprocessed) PDF
     * @param loPdfPath Path to the LibreOffice-flattened PDF/A, or null if not used
     * @param pdfaPart 1 (PDF/A-1B) or 2 (PDF/A-2B)
     * @param missingFonts PostScript names of fonts the base document references but does not
     *     embed; when non-empty and loPdfPath is set, their programs are copied from the LO doc
     * @param importImages when true, transparent images in the base doc are replaced with the
     *     flattened versions rendered by LibreOffice
     * @return the final PDF/A bytes
     */
    private byte[] convertToPdfA(
            Path basePdfPath,
            Path loPdfPath,
            int pdfaPart,
            Set<String> missingFonts,
            boolean importImages)
            throws Exception {
        try (PDDocument baseDoc = Loader.loadPDF(basePdfPath.toFile())) {
            if (loPdfPath != null) {
                try (PDDocument loDoc = Loader.loadPDF(loPdfPath.toFile())) {
                    if (!missingFonts.isEmpty()) {
                        embedMissingFonts(loDoc, baseDoc, missingFonts);
                    }
                    if (importImages) {
                        importFlattenedImages(loDoc, baseDoc);
                    }
                }
            }
            return processWithPDFBox(baseDoc, pdfaPart);
        }
    }
    /**
     * Applies the in-process PDF/A steps to an open document and serializes it: removes
     * forbidden elements, writes PDF/A identification XMP, adds an sRGB output intent if
     * absent, then saves with the PDF version matching the requested conformance part.
     *
     * @param document the document to convert (mutated in place)
     * @param pdfaPart 1 (saved as PDF 1.4, uncompressed xref) or 2 (saved as PDF 1.7)
     * @return the serialized PDF/A bytes
     */
    private byte[] processWithPDFBox(PDDocument document, int pdfaPart) throws Exception {
        removeElementsForPdfA(document, pdfaPart);
        mergeAndAddXmpMetadata(document, pdfaPart);
        addICCProfileIfNotPresent(document);
        // Mark the document as PDF/A
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        catalog.setMetadata(
                document.getDocumentCatalog().getMetadata()); // Ensure metadata is linked
        catalog.setViewerPreferences(
                new PDViewerPreferences(catalog.getCOSObject())); // PDF/A best practice
        document.getDocument().setVersion(pdfaPart == 1 ? 1.4f : 1.7f);
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        if (pdfaPart == 1) {
            // PDF/A-1 is based on PDF 1.4, which predates object/xref streams, so save
            // without compressed structures.
            document.save(baos, CompressParameters.NO_COMPRESSION);
        } else {
            document.save(baos);
        }
        return baos.toByteArray();
    }
    /**
     * Converts a PDF to PDF/A with a headless LibreOffice process.
     *
     * @param tempInputFile the PDF to convert
     * @param pdfaPart 2 selects PDF/A-2, anything else PDF/A-1 (via LibreOffice's
     *     {@code SelectPdfVersion} export option)
     * @return path to the single converted PDF inside a freshly created temp directory;
     *     the caller is responsible for deleting that directory
     * @throws RuntimeException if LibreOffice exits non-zero or does not produce exactly one file
     */
    private Path runLibreOfficeConversion(Path tempInputFile, int pdfaPart) throws Exception {
        // Create temp output directory
        Path tempOutputDir = Files.createTempDirectory("output_");

        // Determine PDF/A filter based on requested format
        String pdfFilter =
                pdfaPart == 2
                        ? "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"2\"}}"
                        : "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"1\"}}";

        // Prepare LibreOffice command
        List<String> command =
                new ArrayList<>(
                        Arrays.asList(
                                "soffice",
                                "--headless",
                                "--nologo",
                                "--convert-to",
                                pdfFilter,
                                "--outdir",
                                tempOutputDir.toString(),
                                tempInputFile.toString()));

        ProcessExecutorResult returnCode =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
                        .runCommandWithOutputHandling(command);

        if (returnCode.getRc() != 0) {
            log.error("PDF/A conversion failed with return code: {}", returnCode.getRc());
            throw new RuntimeException("PDF/A conversion failed");
        }

        // Get the output file; LibreOffice writes exactly one PDF into the fresh directory
        File[] outputFiles = tempOutputDir.toFile().listFiles();
        if (outputFiles == null || outputFiles.length != 1) {
            throw new RuntimeException(
                    "Expected one output PDF, found "
                            + (outputFiles == null ? "none" : outputFiles.length));
        }
        return outputFiles[0].toPath();
    }
private void embedMissingFonts(PDDocument loDoc, PDDocument baseDoc, Set<String> missingFonts)
throws IOException {
List<PDPage> loPages = new ArrayList<>();
loDoc.getPages().forEach(loPages::add);
List<PDPage> basePages = new ArrayList<>();
baseDoc.getPages().forEach(basePages::add);
for (int i = 0; i < loPages.size(); i++) {
PDResources loRes = loPages.get(i).getResources();
PDResources baseRes = basePages.get(i).getResources();
for (COSName fontKey : loRes.getFontNames()) {
PDFont loFont = loRes.getFont(fontKey);
if (loFont == null) continue;
String psName = loFont.getName();
if (!missingFonts.contains(psName)) continue;
PDFontDescriptor desc = loFont.getFontDescriptor();
if (desc == null) continue;
PDStream fontStream = null;
if (desc.getFontFile() != null) {
fontStream = desc.getFontFile();
} else if (desc.getFontFile2() != null) {
fontStream = desc.getFontFile2();
} else if (desc.getFontFile3() != null) {
fontStream = desc.getFontFile3();
}
if (fontStream == null) continue;
try (InputStream in = fontStream.createInputStream()) {
PDFont newFont = null;
try {
newFont = PDType0Font.load(baseDoc, in, false);
} catch (IOException e1) {
try {
newFont = PDTrueTypeFont.load(baseDoc, in, null);
} catch (IOException | IllegalArgumentException e2) {
log.error("Could not embed font {}: {}", psName, e2.getMessage());
continue;
}
}
if (newFont != null) {
baseRes.put(fontKey, newFont);
}
}
}
}
}
private Set<String> findUnembeddedFontNames(PDDocument doc) throws IOException {
Set<String> missing = new HashSet<>();
for (PDPage page : doc.getPages()) {
PDResources res = page.getResources();
for (COSName name : res.getFontNames()) {
PDFont font = res.getFont(name);
if (font != null && !font.isEmbedded()) {
missing.add(font.getName());
}
}
}
return missing;
}
private void importFlattenedImages(PDDocument loDoc, PDDocument baseDoc) throws IOException {
List<PDPage> loPages = new ArrayList<>();
loDoc.getPages().forEach(loPages::add);
List<PDPage> basePages = new ArrayList<>();
baseDoc.getPages().forEach(basePages::add);
for (int i = 0; i < loPages.size(); i++) {
PDPage loPage = loPages.get(i);
PDPage basePage = basePages.get(i);
PDResources loRes = loPage.getResources();
PDResources baseRes = basePage.getResources();
Set<COSName> toReplace = detectTransparentXObjects(basePage);
for (COSName name : toReplace) {
PDXObject loXo = loRes.getXObject(name);
if (!(loXo instanceof PDImageXObject img)) continue;
PDImageXObject newImg = LosslessFactory.createFromImage(baseDoc, img.getImage());
// replace the resource under the same name
baseRes.put(name, newImg);
}
}
}
private Set<COSName> detectTransparentXObjects(PDPage page) {
Set<COSName> transparentObjects = new HashSet<>();
PDResources res = page.getResources();
if (res == null) return transparentObjects;
for (COSName name : res.getXObjectNames()) {
try {
PDXObject xo = res.getXObject(name);
if (xo instanceof PDImageXObject img) {
COSDictionary d = img.getCOSObject();
if (d.containsKey(COSName.SMASK)
|| isTransparencyGroup(d)
|| d.getBoolean(COSName.INTERPOLATE, false)) {
transparentObjects.add(name);
}
}
} catch (IOException ioe) {
log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage());
}
}
return transparentObjects;
}
private boolean isTransparencyGroup(COSDictionary dict) {
COSBase g = dict.getDictionaryObject(COSName.GROUP);
return g instanceof COSDictionary gd
&& COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S));
}
private boolean hasTransparentImages(PDDocument doc) {
for (PDPage page : doc.getPages()) {
PDResources res = page.getResources();
if (res == null) continue;
for (COSName name : res.getXObjectNames()) {
try {
PDXObject xo = res.getXObject(name);
if (xo instanceof PDImageXObject img) {
COSDictionary dict = img.getCOSObject();
if (dict.containsKey(COSName.SMASK)) return true;
COSBase g = dict.getDictionaryObject(COSName.GROUP);
if (g instanceof COSDictionary gd
&& COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S))) {
return true;
}
if (dict.getBoolean(COSName.INTERPOLATE, false)) return true;
}
} catch (IOException ioe) {
log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage());
}
}
}
return false;
}
    /**
     * Recursively strips PDF constructs forbidden by PDF/A from a COS object tree:
     * transparency entries (PDF/A-1 only), forced interpolation, and action/embedding
     * features such as JavaScript, multimedia, Launch/GoToR actions and embedded files.
     *
     * @param base the COS object to sanitize; dictionaries and arrays are walked recursively,
     *     other node types are left untouched
     * @param resources the page's resources — NOTE(review): currently unused by this method;
     *     kept so existing call sites compile unchanged
     * @param pdfaPart 1 additionally removes transparency (/Group, /SMask, /CA, /ca)
     */
    private void sanitizePdfA(COSBase base, PDResources resources, int pdfaPart) {
        if (base instanceof COSDictionary dict) {
            if (pdfaPart == 1) {
                // Remove transparency-related elements
                COSBase group = dict.getDictionaryObject(COSName.GROUP);
                if (group instanceof COSDictionary gDict
                        && COSName.TRANSPARENCY.equals(gDict.getCOSName(COSName.S))) {
                    dict.removeItem(COSName.GROUP);
                }
                dict.removeItem(COSName.SMASK);
                // Transparency blending constants (/CA, /ca) disallowed in PDF/A-1
                dict.removeItem(COSName.CA);
                dict.removeItem(COSName.getPDFName("ca"));
            }
            // Interpolation (non-deterministic image scaling) required to be false
            if (dict.containsKey(COSName.INTERPOLATE)
                    && dict.getBoolean(COSName.INTERPOLATE, true)) {
                dict.setBoolean(COSName.INTERPOLATE, false);
            }
            // Remove common forbidden features (for PDF/A 1 and 2)
            dict.removeItem(COSName.JAVA_SCRIPT);
            dict.removeItem(COSName.getPDFName("JS"));
            dict.removeItem(COSName.getPDFName("RichMedia"));
            dict.removeItem(COSName.getPDFName("Movie"));
            dict.removeItem(COSName.getPDFName("Sound"));
            dict.removeItem(COSName.getPDFName("Launch"));
            dict.removeItem(COSName.URI);
            dict.removeItem(COSName.getPDFName("GoToR"));
            dict.removeItem(COSName.EMBEDDED_FILES);
            dict.removeItem(COSName.FILESPEC);
            // Recurse through all entries in the dictionary
            for (Map.Entry<COSName, COSBase> entry : dict.entrySet()) {
                sanitizePdfA(entry.getValue(), resources, pdfaPart);
            }
        } else if (base instanceof COSArray arr) {
            // Recursively sanitize each item in the array
            for (COSBase item : arr) {
                sanitizePdfA(item, resources, pdfaPart);
            }
        }
    }
    /**
     * Removes document- and page-level features that violate PDF/A, then sanitizes each
     * page's dictionary and its form/image XObjects via {@link #sanitizePdfA}.
     *
     * <p>For PDF/A-1 this also drops optional content (layers) from the catalog and clears
     * all annotations from every page.
     *
     * @param doc the document to clean (mutated in place)
     * @param pdfaPart 1 applies the stricter PDF/A-1 removals; 2 keeps annotations/layers
     */
    private void removeElementsForPdfA(PDDocument doc, int pdfaPart) {
        if (pdfaPart == 1) {
            // Remove Optional Content (Layers) - not allowed in PDF/A-1
            doc.getDocumentCatalog().getCOSObject().removeItem(COSName.getPDFName("OCProperties"));
        }

        for (PDPage page : doc.getPages()) {
            if (pdfaPart == 1) {
                page.setAnnotations(Collections.emptyList());
            }
            PDResources res = page.getResources();

            // Clean page-level dictionary
            sanitizePdfA(page.getCOSObject(), res, pdfaPart);

            // sanitize each Form XObject
            if (res != null) {
                for (COSName name : res.getXObjectNames()) {
                    try {
                        PDXObject xo = res.getXObject(name);
                        if (xo instanceof PDFormXObject form) {
                            sanitizePdfA(form.getCOSObject(), res, pdfaPart);
                        } else if (xo instanceof PDImageXObject img) {
                            sanitizePdfA(img.getCOSObject(), res, pdfaPart);
                        }
                    } catch (IOException ioe) {
                        // Skip unreadable XObjects rather than aborting the whole conversion
                        log.error("Cannot load XObject {}: {}", name.getName(), ioe.getMessage());
                    }
                }
            }
        }
    }
    /**
     * Embeds the XMP metadata required for PDF/A compliance, merging any existing XMP
     * packet with the document information dictionary so the two stay consistent (a PDF/A
     * requirement), and stamps the PDF/A identification schema (part + conformance "B").
     *
     * @param document the document whose catalog metadata is replaced (mutated in place)
     * @param pdfaPart the PDF/A part (1 or 2) written into the identification schema
     */
    private void mergeAndAddXmpMetadata(PDDocument document, int pdfaPart) throws Exception {
        PDMetadata existingMetadata = document.getDocumentCatalog().getMetadata();
        XMPMetadata xmp;

        // Load existing XMP if available; fall back to a fresh packet on any parse failure
        if (existingMetadata != null) {
            try (InputStream xmpStream = existingMetadata.createInputStream()) {
                DomXmpParser parser = new DomXmpParser();
                parser.setStrictParsing(false);
                xmp = parser.parse(xmpStream);
            } catch (Exception e) {
                xmp = XMPMetadata.createXMPMetadata();
            }
        } else {
            xmp = XMPMetadata.createXMPMetadata();
        }

        PDDocumentInformation docInfo = document.getDocumentInformation();
        if (docInfo == null) {
            docInfo = new PDDocumentInformation();
        }
        String originalCreator = Optional.ofNullable(docInfo.getCreator()).orElse("Unknown");
        String originalProducer = Optional.ofNullable(docInfo.getProducer()).orElse("Unknown");

        // Only keep the original creator so it can match xmp creator tool for compliance
        DublinCoreSchema dcSchema = xmp.getDublinCoreSchema();
        if (dcSchema != null) {
            List<String> existingCreators = dcSchema.getCreators();
            if (existingCreators != null) {
                // Copy before removing to avoid mutating the list while iterating it
                for (String creator : new ArrayList<>(existingCreators)) {
                    dcSchema.removeCreator(creator);
                }
            }
        } else {
            dcSchema = xmp.createAndAddDublinCoreSchema();
        }
        dcSchema.addCreator(originalCreator);

        // PDF/A identification schema: part (1 or 2) + conformance level B
        PDFAIdentificationSchema pdfaSchema =
                (PDFAIdentificationSchema) xmp.getSchema(PDFAIdentificationSchema.class);
        if (pdfaSchema == null) {
            pdfaSchema = xmp.createAndAddPDFAIdentificationSchema();
        }
        pdfaSchema.setPart(pdfaPart);
        pdfaSchema.setConformance("B");

        XMPBasicSchema xmpBasicSchema = xmp.getXMPBasicSchema();
        if (xmpBasicSchema == null) {
            xmpBasicSchema = xmp.createAndAddXMPBasicSchema();
        }
        AdobePDFSchema adobePdfSchema = xmp.getAdobePDFSchema();
        if (adobePdfSchema == null) {
            adobePdfSchema = xmp.createAndAddAdobePDFSchema();
        }

        // Mirror creator/producer in both the info dictionary and XMP so they match
        docInfo.setCreator(originalCreator);
        xmpBasicSchema.setCreatorTool(originalCreator);
        docInfo.setProducer(originalProducer);
        adobePdfSchema.setProducer(originalProducer);

        String originalAuthor = docInfo.getAuthor();
        if (originalAuthor != null && !originalAuthor.isBlank()) {
            docInfo.setAuthor(null);
            // If the author is set, we keep it in the XMP metadata
            if (!originalCreator.equals(originalAuthor)) {
                dcSchema.addCreator(originalAuthor);
            }
        }
        String title = docInfo.getTitle();
        if (title != null && !title.isBlank()) {
            dcSchema.setTitle(title);
        }
        String subject = docInfo.getSubject();
        if (subject != null && !subject.isBlank()) {
            dcSchema.addSubject(subject);
        }
        String keywords = docInfo.getKeywords();
        if (keywords != null && !keywords.isBlank()) {
            adobePdfSchema.setKeywords(keywords);
        }

        // Set creation and modification dates; keep the original creation date if present
        Calendar now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
        Calendar originalCreationDate = docInfo.getCreationDate();
        if (originalCreationDate == null) {
            originalCreationDate = now;
        }
        docInfo.setCreationDate(originalCreationDate);
        xmpBasicSchema.setCreateDate(originalCreationDate);
        docInfo.setModificationDate(now);
        xmpBasicSchema.setModifyDate(now);
        xmpBasicSchema.setMetadataDate(now);

        // Serialize the created metadata so it can be attached to the existent metadata
        ByteArrayOutputStream xmpOut = new ByteArrayOutputStream();
        new XmpSerializer().serialize(xmp, xmpOut, true);
        PDMetadata newMetadata = new PDMetadata(document);
        newMetadata.importXMPMetadata(xmpOut.toByteArray());
        document.getDocumentCatalog().setMetadata(newMetadata);
    }
private void addICCProfileIfNotPresent(PDDocument document) throws Exception {
if (document.getDocumentCatalog().getOutputIntents().isEmpty()) {
try (InputStream colorProfile = getClass().getResourceAsStream("/icc/sRGB2014.icc")) {
PDOutputIntent outputIntent = new PDOutputIntent(document, colorProfile);
outputIntent.setInfo("sRGB IEC61966-2.1");
outputIntent.setOutputCondition("sRGB IEC61966-2.1");
outputIntent.setOutputConditionIdentifier("sRGB IEC61966-2.1");
outputIntent.setRegistryName("http://www.color.org");
document.getDocumentCatalog().addOutputIntent(outputIntent);
} catch (Exception e) {
log.error("Failed to load ICC profile: {}", e.getMessage());
}
}
}
private File preProcessHighlights(File inputPdf) throws Exception {
try (PDDocument document = Loader.loadPDF(inputPdf)) {
for (PDPage page : document.getPages()) {
// Retrieve the annotations on the page.
List<PDAnnotation> annotations = page.getAnnotations();
for (PDAnnotation annot : annotations) {
// Process only highlight annotations.
if ("Highlight".equals(annot.getSubtype())
&& annot instanceof PDAnnotationTextMarkup highlight) {
// Create a new appearance stream with the same bounding box.
float[] colorComponents =
highlight.getColor() != null
? highlight.getColor().getComponents()
: new float[] {1f, 1f, 0f};
Color highlightColor =
new Color(
colorComponents[0], colorComponents[1], colorComponents[2]);
float[] quadPoints = highlight.getQuadPoints();
if (quadPoints != null) {
try (PDPageContentStream cs =
new PDPageContentStream(
document,
page,
PDPageContentStream.AppendMode.PREPEND,
true,
true)) {
cs.setStrokingColor(highlightColor);
cs.setLineWidth(0.05f);
float spacing = 2f;
// Draw diagonal lines across the highlight area to simulate
// transparency.
for (int i = 0; i < quadPoints.length; i += 8) {
float minX =
Math.min(
Math.min(quadPoints[i], quadPoints[i + 2]),
Math.min(quadPoints[i + 4], quadPoints[i + 6]));
float maxX =
Math.max(
Math.max(quadPoints[i], quadPoints[i + 2]),
Math.max(quadPoints[i + 4], quadPoints[i + 6]));
float minY =
Math.min(
Math.min(quadPoints[i + 1], quadPoints[i + 3]),
Math.min(quadPoints[i + 5], quadPoints[i + 7]));
float maxY =
Math.max(
Math.max(quadPoints[i + 1], quadPoints[i + 3]),
Math.max(quadPoints[i + 5], quadPoints[i + 7]));
float width = maxX - minX;
float height = maxY - minY;
for (float y = minY; y <= maxY; y += spacing) {
float len = Math.min(width, maxY - y);
cs.moveTo(minX, y);
cs.lineTo(minX + len, y + len);
}
for (float x = minX + spacing; x <= maxX; x += spacing) {
float len = Math.min(maxX - x, height);
cs.moveTo(x, minY);
cs.lineTo(x + len, minY + len);
}
}
cs.stroke();
}
}
page.getAnnotations().remove(highlight);
COSDictionary pageDict = page.getCOSObject();
if (pageDict.containsKey(COSName.GROUP)) {
COSDictionary groupDict =
(COSDictionary) pageDict.getDictionaryObject(COSName.GROUP);
if (groupDict != null) {
if (COSName.TRANSPARENCY
.getName()
.equalsIgnoreCase(groupDict.getNameAsString(COSName.S))) {
pageDict.removeItem(COSName.GROUP);
}
}
}
}
}
}
// Save the modified document to a temporary file.
File preProcessedFile = Files.createTempFile("preprocessed_", ".pdf").toFile();
document.save(preProcessedFile);
return preProcessedFile;
}
}
} }

Binary file not shown.