From f82662aaafb123c538fdc84caabbf0537ebb231d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Bernardino?= Date: Tue, 17 Jun 2025 16:03:38 +0100 Subject: [PATCH] PDF-A conversion removes highlight opacity fix (#3106) (#3695) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description of Changes Previously, highlight annotations in PDF documents converted to PDF/A format lost their opacity, resulting in highlights appearing completely opaque. This issue stemmed from LibreOffice’s PDF to PDF/A conversion process and affected both supported PDF/A variants (PDF/A-1B and PDF/A-2B). To resolve this, a new conversion method was implemented. Because PDF/A-1B does not support transparency, unlike PDF/A-2B, the input PDF must be preprocessed to flatten existing transparent objects when targeting PDF/A-1B. Changes: - Preprocess the PDF to handle highlight transparency when converting to PDF/A-1B; - LibreOffice's PDF to PDF/A conversion is now only used if fonts are not embedded or images require flattening; - If needed, missing fonts and flattened images are imported from the LibreOffice-converted file; - The document is traversed to remove elements non-compliant with PDF/A standards; - Updated metadata, including all metadata schemas, to ensure full compliance; - Added an ICC Profile if one was not already present. Any challenges encountered: - Since PDF/A-1B does not support transparency, the best workaround I found in other conversion tools was to draw closely spaced diagonal lines in the highlight color to simulate transparency, as seen in the example below. 
Closes #3106 Example from the issue: Original: ![Screenshot from 2025-06-13 19-28-38](https://github.com/user-attachments/assets/f0065101-8266-439b-9761-7ee85210b938) PDF/A-1B: ![Screenshot from 2025-06-13 19-28-47](https://github.com/user-attachments/assets/188a0c6a-4386-4a3b-901d-4533e26c14be) PDF/A-2B: ![Screenshot from 2025-06-13 19-28-43](https://github.com/user-attachments/assets/6d167d9b-a99e-4b6e-ad9c-6d11872cb45a) --- ## Checklist ### General - [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [x] I have performed a self-review of my own code - [x] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [x] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details. I tested the changes using a variety of PDF files and verified PDF/A compliance of the outputs using VeraPDF. While these tests covered different scenarios and document types, PDF files can vary significantly in structure and complexity. 
As a result, the testing was not exhaustive, and while the results so far have been compliant, full compliance in all edge cases cannot be guaranteed. --------- Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> --- .../api/converters/ConvertPDFToPDFA.java | 656 ++++++++++++++++-- .../src/main/resources/icc/sRGB2014.icc | Bin 0 -> 3024 bytes 2 files changed, 616 insertions(+), 40 deletions(-) create mode 100644 stirling-pdf/src/main/resources/icc/sRGB2014.icc diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java index 9f8c19158..a289a5421 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java @@ -1,16 +1,61 @@ package stirling.software.SPDF.controller.api.converters; +import java.awt.Color; +import java.io.ByteArrayOutputStream; import io.github.pixee.security.Filenames; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; import java.io.File; +import java.io.IOException; +import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; +import java.util.Calendar; +import java.util.Collections; +import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.TimeZone; + import lombok.extern.slf4j.Slf4j; import org.apache.commons.io.FileUtils; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdfwriter.compress.CompressParameters; +import 
org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentCatalog; +import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.common.PDMetadata; +import org.apache.pdfbox.pdmodel.common.PDStream; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; +import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; +import org.apache.pdfbox.pdmodel.font.PDType0Font; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; +import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup; +import org.apache.pdfbox.pdmodel.interactive.viewerpreferences.PDViewerPreferences; +import org.apache.xmpbox.XMPMetadata; +import org.apache.xmpbox.schema.AdobePDFSchema; +import org.apache.xmpbox.schema.DublinCoreSchema; +import org.apache.xmpbox.schema.PDFAIdentificationSchema; +import org.apache.xmpbox.schema.XMPBasicSchema; +import org.apache.xmpbox.xml.DomXmpParser; +import org.apache.xmpbox.xml.XmpSerializer; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; @@ -56,54 +101,37 @@ public class ConvertPDFToPDFA { : originalFileName; Path tempInputFile = null; - Path tempOutputDir = null; byte[] fileBytes; + Path loPdfPath = null; // Used for LibreOffice conversion output + File preProcessedFile = null; + int pdfaPart = 2; try { // Save uploaded file to temp location tempInputFile = 
Files.createTempFile("input_", ".pdf"); inputFile.transferTo(tempInputFile); - // Create temp output directory - tempOutputDir = Files.createTempDirectory("output_"); - - // Determine PDF/A filter based on requested format - String pdfFilter = - "pdfa".equals(outputFormat) - ? "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"2\"}}" - : "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"1\"}}"; - - // Prepare LibreOffice command - List command = - new ArrayList<>( - Arrays.asList( - "soffice", - "--headless", - "--nologo", - "--convert-to", - pdfFilter, - "--outdir", - tempOutputDir.toString(), - tempInputFile.toString())); - - ProcessExecutorResult returnCode = - ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE) - .runCommandWithOutputHandling(command); - - if (returnCode.getRc() != 0) { - log.error("PDF/A conversion failed with return code: {}", returnCode.getRc()); - throw new RuntimeException("PDF/A conversion failed"); + // Branch conversion based on desired output PDF/A format + if ("pdfa".equals(outputFormat)) { + preProcessedFile = tempInputFile.toFile(); + } else { + pdfaPart = 1; + preProcessedFile = preProcessHighlights(tempInputFile.toFile()); } - - // Get the output file - File[] outputFiles = tempOutputDir.toFile().listFiles(); - if (outputFiles == null || outputFiles.length != 1) { - throw new RuntimeException( - "Expected exactly one output file but found " - + (outputFiles == null ? 
"none" : outputFiles.length)); + Set missingFonts = new HashSet<>(); + boolean needImgs = false; + try (PDDocument doc = Loader.loadPDF(preProcessedFile)) { + missingFonts = findUnembeddedFontNames(doc); + needImgs = (pdfaPart == 1) && hasTransparentImages(doc); + if (!missingFonts.isEmpty() || needImgs) { + // Run LibreOffice conversion to get flattened images and embedded fonts + loPdfPath = runLibreOfficeConversion(preProcessedFile.toPath(), pdfaPart); + } } + fileBytes = + convertToPdfA( + preProcessedFile.toPath(), loPdfPath, pdfaPart, missingFonts, needImgs); - fileBytes = FileUtils.readFileToByteArray(outputFiles[0]); String outputFilename = baseFileName + "_PDFA.pdf"; return WebResponseUtils.bytesToWebResponse( @@ -114,9 +142,557 @@ public class ConvertPDFToPDFA { if (tempInputFile != null) { Files.deleteIfExists(tempInputFile); } - if (tempOutputDir != null) { - FileUtils.deleteDirectory(tempOutputDir.toFile()); + if (loPdfPath != null && loPdfPath.getParent() != null) { + FileUtils.deleteDirectory(loPdfPath.getParent().toFile()); + } + if (preProcessedFile != null) { + Files.deleteIfExists(preProcessedFile.toPath()); } } } + + /** + * Merge fonts & flattened images from loPdfPath into basePdfPath, then run the standard + * PDFBox/A pipeline. 
+ * + * @param basePdfPath Path to the original (or highlight‐preprocessed) PDF + * @param loPdfPath Path to the LibreOffice–flattened PDF/A, or null if not used + * @param pdfaPart 1 (PDF/A-1B) or 2 (PDF/A-2B) + * @return the final PDF/A bytes + */ + private byte[] convertToPdfA( + Path basePdfPath, + Path loPdfPath, + int pdfaPart, + Set missingFonts, + boolean importImages) + throws Exception { + try (PDDocument baseDoc = Loader.loadPDF(basePdfPath.toFile())) { + + if (loPdfPath != null) { + try (PDDocument loDoc = Loader.loadPDF(loPdfPath.toFile())) { + if (!missingFonts.isEmpty()) { + embedMissingFonts(loDoc, baseDoc, missingFonts); + } + if (importImages) { + importFlattenedImages(loDoc, baseDoc); + } + } + } + return processWithPDFBox(baseDoc, pdfaPart); + } + } + + private byte[] processWithPDFBox(PDDocument document, int pdfaPart) throws Exception { + + removeElementsForPdfA(document, pdfaPart); + + mergeAndAddXmpMetadata(document, pdfaPart); + + addICCProfileIfNotPresent(document); + + // Mark the document as PDF/A + PDDocumentCatalog catalog = document.getDocumentCatalog(); + catalog.setMetadata( + document.getDocumentCatalog().getMetadata()); // Ensure metadata is linked + catalog.setViewerPreferences( + new PDViewerPreferences(catalog.getCOSObject())); // PDF/A best practice + document.getDocument().setVersion(pdfaPart == 1 ? 1.4f : 1.7f); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + if (pdfaPart == 1) { + document.save(baos, CompressParameters.NO_COMPRESSION); + } else { + document.save(baos); + } + + return baos.toByteArray(); + } + + private Path runLibreOfficeConversion(Path tempInputFile, int pdfaPart) throws Exception { + // Create temp output directory + Path tempOutputDir = Files.createTempDirectory("output_"); + + // Determine PDF/A filter based on requested format + String pdfFilter = + pdfaPart == 2 + ? 
"pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"2\"}}" + : "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"1\"}}"; + + // Prepare LibreOffice command + List command = + new ArrayList<>( + Arrays.asList( + "soffice", + "--headless", + "--nologo", + "--convert-to", + pdfFilter, + "--outdir", + tempOutputDir.toString(), + tempInputFile.toString())); + + ProcessExecutorResult returnCode = + ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE) + .runCommandWithOutputHandling(command); + + if (returnCode.getRc() != 0) { + log.error("PDF/A conversion failed with return code: {}", returnCode.getRc()); + throw new RuntimeException("PDF/A conversion failed"); + } + + // Get the output file + File[] outputFiles = tempOutputDir.toFile().listFiles(); + if (outputFiles == null || outputFiles.length != 1) { + throw new RuntimeException( + "Expected one output PDF, found " + + (outputFiles == null ? "none" : outputFiles.length)); + } + return outputFiles[0].toPath(); + } + + private void embedMissingFonts(PDDocument loDoc, PDDocument baseDoc, Set missingFonts) + throws IOException { + List loPages = new ArrayList<>(); + loDoc.getPages().forEach(loPages::add); + List basePages = new ArrayList<>(); + baseDoc.getPages().forEach(basePages::add); + + for (int i = 0; i < loPages.size(); i++) { + PDResources loRes = loPages.get(i).getResources(); + PDResources baseRes = basePages.get(i).getResources(); + + for (COSName fontKey : loRes.getFontNames()) { + PDFont loFont = loRes.getFont(fontKey); + if (loFont == null) continue; + + String psName = loFont.getName(); + if (!missingFonts.contains(psName)) continue; + + PDFontDescriptor desc = loFont.getFontDescriptor(); + if (desc == null) continue; + + PDStream fontStream = null; + if (desc.getFontFile() != null) { + fontStream = desc.getFontFile(); + } else if (desc.getFontFile2() != null) { + fontStream = desc.getFontFile2(); + } else if (desc.getFontFile3() != null) { + 
fontStream = desc.getFontFile3(); + } + if (fontStream == null) continue; + + try (InputStream in = fontStream.createInputStream()) { + PDFont newFont = null; + try { + newFont = PDType0Font.load(baseDoc, in, false); + } catch (IOException e1) { + try { + newFont = PDTrueTypeFont.load(baseDoc, in, null); + } catch (IOException | IllegalArgumentException e2) { + log.error("Could not embed font {}: {}", psName, e2.getMessage()); + continue; + } + } + if (newFont != null) { + baseRes.put(fontKey, newFont); + } + } + } + } + } + + private Set findUnembeddedFontNames(PDDocument doc) throws IOException { + Set missing = new HashSet<>(); + for (PDPage page : doc.getPages()) { + PDResources res = page.getResources(); + for (COSName name : res.getFontNames()) { + PDFont font = res.getFont(name); + if (font != null && !font.isEmbedded()) { + missing.add(font.getName()); + } + } + } + return missing; + } + + private void importFlattenedImages(PDDocument loDoc, PDDocument baseDoc) throws IOException { + List loPages = new ArrayList<>(); + loDoc.getPages().forEach(loPages::add); + List basePages = new ArrayList<>(); + baseDoc.getPages().forEach(basePages::add); + + for (int i = 0; i < loPages.size(); i++) { + PDPage loPage = loPages.get(i); + PDPage basePage = basePages.get(i); + + PDResources loRes = loPage.getResources(); + PDResources baseRes = basePage.getResources(); + Set toReplace = detectTransparentXObjects(basePage); + + for (COSName name : toReplace) { + PDXObject loXo = loRes.getXObject(name); + if (!(loXo instanceof PDImageXObject img)) continue; + + PDImageXObject newImg = LosslessFactory.createFromImage(baseDoc, img.getImage()); + + // replace the resource under the same name + baseRes.put(name, newImg); + } + } + } + + private Set detectTransparentXObjects(PDPage page) { + Set transparentObjects = new HashSet<>(); + PDResources res = page.getResources(); + if (res == null) return transparentObjects; + + for (COSName name : res.getXObjectNames()) { + try { + 
PDXObject xo = res.getXObject(name); + if (xo instanceof PDImageXObject img) { + COSDictionary d = img.getCOSObject(); + if (d.containsKey(COSName.SMASK) + || isTransparencyGroup(d) + || d.getBoolean(COSName.INTERPOLATE, false)) { + transparentObjects.add(name); + } + } + } catch (IOException ioe) { + log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage()); + } + } + return transparentObjects; + } + + private boolean isTransparencyGroup(COSDictionary dict) { + COSBase g = dict.getDictionaryObject(COSName.GROUP); + return g instanceof COSDictionary gd + && COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S)); + } + + private boolean hasTransparentImages(PDDocument doc) { + for (PDPage page : doc.getPages()) { + PDResources res = page.getResources(); + if (res == null) continue; + for (COSName name : res.getXObjectNames()) { + try { + PDXObject xo = res.getXObject(name); + if (xo instanceof PDImageXObject img) { + COSDictionary dict = img.getCOSObject(); + if (dict.containsKey(COSName.SMASK)) return true; + COSBase g = dict.getDictionaryObject(COSName.GROUP); + if (g instanceof COSDictionary gd + && COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S))) { + return true; + } + if (dict.getBoolean(COSName.INTERPOLATE, false)) return true; + } + } catch (IOException ioe) { + log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage()); + } + } + } + return false; + } + + private void sanitizePdfA(COSBase base, PDResources resources, int pdfaPart) { + if (base instanceof COSDictionary dict) { + if (pdfaPart == 1) { + // Remove transparency-related elements + COSBase group = dict.getDictionaryObject(COSName.GROUP); + if (group instanceof COSDictionary gDict + && COSName.TRANSPARENCY.equals(gDict.getCOSName(COSName.S))) { + dict.removeItem(COSName.GROUP); + } + + dict.removeItem(COSName.SMASK); + // Transparency blending constants (/CA, /ca) — disallowed in PDF/A-1 + dict.removeItem(COSName.CA); + 
dict.removeItem(COSName.getPDFName("ca")); + } + + // Interpolation (non-deterministic image scaling) — required to be false + if (dict.containsKey(COSName.INTERPOLATE) + && dict.getBoolean(COSName.INTERPOLATE, true)) { + dict.setBoolean(COSName.INTERPOLATE, false); + } + + // Remove common forbidden features (for PDF/A 1 and 2) + dict.removeItem(COSName.JAVA_SCRIPT); + dict.removeItem(COSName.getPDFName("JS")); + dict.removeItem(COSName.getPDFName("RichMedia")); + dict.removeItem(COSName.getPDFName("Movie")); + dict.removeItem(COSName.getPDFName("Sound")); + dict.removeItem(COSName.getPDFName("Launch")); + dict.removeItem(COSName.URI); + dict.removeItem(COSName.getPDFName("GoToR")); + dict.removeItem(COSName.EMBEDDED_FILES); + dict.removeItem(COSName.FILESPEC); + + // Recurse through all entries in the dictionary + for (Map.Entry entry : dict.entrySet()) { + sanitizePdfA(entry.getValue(), resources, pdfaPart); + } + + } else if (base instanceof COSArray arr) { + // Recursively sanitize each item in the array + for (COSBase item : arr) { + sanitizePdfA(item, resources, pdfaPart); + } + } + } + + private void removeElementsForPdfA(PDDocument doc, int pdfaPart) { + + if (pdfaPart == 1) { + // Remove Optional Content (Layers) - not allowed in PDF/A-1 + doc.getDocumentCatalog().getCOSObject().removeItem(COSName.getPDFName("OCProperties")); + } + + for (PDPage page : doc.getPages()) { + if (pdfaPart == 1) { + page.setAnnotations(Collections.emptyList()); + } + PDResources res = page.getResources(); + // Clean page-level dictionary + sanitizePdfA(page.getCOSObject(), res, pdfaPart); + + // sanitize each Form XObject + if (res != null) { + for (COSName name : res.getXObjectNames()) { + try { + PDXObject xo = res.getXObject(name); + if (xo instanceof PDFormXObject form) { + sanitizePdfA(form.getCOSObject(), res, pdfaPart); + } else if (xo instanceof PDImageXObject img) { + sanitizePdfA(img.getCOSObject(), res, pdfaPart); + } + } catch (IOException ioe) { + 
log.error("Cannot load XObject {}: {}", name.getName(), ioe.getMessage()); + } + } + } + } + } + + /** Embbeds the XMP metadata required for PDF/A compliance. */ + private void mergeAndAddXmpMetadata(PDDocument document, int pdfaPart) throws Exception { + PDMetadata existingMetadata = document.getDocumentCatalog().getMetadata(); + XMPMetadata xmp; + + // Load existing XMP if available + if (existingMetadata != null) { + try (InputStream xmpStream = existingMetadata.createInputStream()) { + DomXmpParser parser = new DomXmpParser(); + parser.setStrictParsing(false); + xmp = parser.parse(xmpStream); + } catch (Exception e) { + xmp = XMPMetadata.createXMPMetadata(); + } + } else { + xmp = XMPMetadata.createXMPMetadata(); + } + + PDDocumentInformation docInfo = document.getDocumentInformation(); + if (docInfo == null) { + docInfo = new PDDocumentInformation(); + } + + String originalCreator = Optional.ofNullable(docInfo.getCreator()).orElse("Unknown"); + String originalProducer = Optional.ofNullable(docInfo.getProducer()).orElse("Unknown"); + + // Only keep the original creator so it can match xmp creator tool for compliance + DublinCoreSchema dcSchema = xmp.getDublinCoreSchema(); + if (dcSchema != null) { + List existingCreators = dcSchema.getCreators(); + if (existingCreators != null) { + for (String creator : new ArrayList<>(existingCreators)) { + dcSchema.removeCreator(creator); + } + } + } else { + dcSchema = xmp.createAndAddDublinCoreSchema(); + } + dcSchema.addCreator(originalCreator); + + PDFAIdentificationSchema pdfaSchema = + (PDFAIdentificationSchema) xmp.getSchema(PDFAIdentificationSchema.class); + if (pdfaSchema == null) { + pdfaSchema = xmp.createAndAddPDFAIdentificationSchema(); + } + pdfaSchema.setPart(pdfaPart); + pdfaSchema.setConformance("B"); + + XMPBasicSchema xmpBasicSchema = xmp.getXMPBasicSchema(); + if (xmpBasicSchema == null) { + xmpBasicSchema = xmp.createAndAddXMPBasicSchema(); + } + + AdobePDFSchema adobePdfSchema = xmp.getAdobePDFSchema(); 
+ if (adobePdfSchema == null) { + adobePdfSchema = xmp.createAndAddAdobePDFSchema(); + } + + docInfo.setCreator(originalCreator); + xmpBasicSchema.setCreatorTool(originalCreator); + + docInfo.setProducer(originalProducer); + adobePdfSchema.setProducer(originalProducer); + + String originalAuthor = docInfo.getAuthor(); + if (originalAuthor != null && !originalAuthor.isBlank()) { + docInfo.setAuthor(null); + // If the author is set, we keep it in the XMP metadata + if (!originalCreator.equals(originalAuthor)) { + dcSchema.addCreator(originalAuthor); + } + } + + String title = docInfo.getTitle(); + if (title != null && !title.isBlank()) { + dcSchema.setTitle(title); + } + String subject = docInfo.getSubject(); + if (subject != null && !subject.isBlank()) { + dcSchema.addSubject(subject); + } + String keywords = docInfo.getKeywords(); + if (keywords != null && !keywords.isBlank()) { + adobePdfSchema.setKeywords(keywords); + } + + // Set creation and modification dates + Calendar now = Calendar.getInstance(TimeZone.getTimeZone("UTC")); + Calendar originalCreationDate = docInfo.getCreationDate(); + if (originalCreationDate == null) { + originalCreationDate = now; + } + docInfo.setCreationDate(originalCreationDate); + xmpBasicSchema.setCreateDate(originalCreationDate); + + docInfo.setModificationDate(now); + xmpBasicSchema.setModifyDate(now); + xmpBasicSchema.setMetadataDate(now); + + // Serialize the created metadata so it can be attached to the existent metadata + ByteArrayOutputStream xmpOut = new ByteArrayOutputStream(); + new XmpSerializer().serialize(xmp, xmpOut, true); + + PDMetadata newMetadata = new PDMetadata(document); + newMetadata.importXMPMetadata(xmpOut.toByteArray()); + document.getDocumentCatalog().setMetadata(newMetadata); + } + + private void addICCProfileIfNotPresent(PDDocument document) throws Exception { + if (document.getDocumentCatalog().getOutputIntents().isEmpty()) { + try (InputStream colorProfile = 
getClass().getResourceAsStream("/icc/sRGB2014.icc")) { + PDOutputIntent outputIntent = new PDOutputIntent(document, colorProfile); + outputIntent.setInfo("sRGB IEC61966-2.1"); + outputIntent.setOutputCondition("sRGB IEC61966-2.1"); + outputIntent.setOutputConditionIdentifier("sRGB IEC61966-2.1"); + outputIntent.setRegistryName("http://www.color.org"); + document.getDocumentCatalog().addOutputIntent(outputIntent); + } catch (Exception e) { + log.error("Failed to load ICC profile: {}", e.getMessage()); + } + } + } + + private File preProcessHighlights(File inputPdf) throws Exception { + + try (PDDocument document = Loader.loadPDF(inputPdf)) { + + for (PDPage page : document.getPages()) { + // Retrieve the annotations on the page. + List annotations = page.getAnnotations(); + for (PDAnnotation annot : annotations) { + // Process only highlight annotations. + if ("Highlight".equals(annot.getSubtype()) + && annot instanceof PDAnnotationTextMarkup highlight) { + // Create a new appearance stream with the same bounding box. + float[] colorComponents = + highlight.getColor() != null + ? highlight.getColor().getComponents() + : new float[] {1f, 1f, 0f}; + Color highlightColor = + new Color( + colorComponents[0], colorComponents[1], colorComponents[2]); + + float[] quadPoints = highlight.getQuadPoints(); + if (quadPoints != null) { + try (PDPageContentStream cs = + new PDPageContentStream( + document, + page, + PDPageContentStream.AppendMode.PREPEND, + true, + true)) { + + cs.setStrokingColor(highlightColor); + cs.setLineWidth(0.05f); + float spacing = 2f; + // Draw diagonal lines across the highlight area to simulate + // transparency. 
+ for (int i = 0; i < quadPoints.length; i += 8) { + float minX = + Math.min( + Math.min(quadPoints[i], quadPoints[i + 2]), + Math.min(quadPoints[i + 4], quadPoints[i + 6])); + float maxX = + Math.max( + Math.max(quadPoints[i], quadPoints[i + 2]), + Math.max(quadPoints[i + 4], quadPoints[i + 6])); + float minY = + Math.min( + Math.min(quadPoints[i + 1], quadPoints[i + 3]), + Math.min(quadPoints[i + 5], quadPoints[i + 7])); + float maxY = + Math.max( + Math.max(quadPoints[i + 1], quadPoints[i + 3]), + Math.max(quadPoints[i + 5], quadPoints[i + 7])); + + float width = maxX - minX; + float height = maxY - minY; + + for (float y = minY; y <= maxY; y += spacing) { + float len = Math.min(width, maxY - y); + cs.moveTo(minX, y); + cs.lineTo(minX + len, y + len); + } + for (float x = minX + spacing; x <= maxX; x += spacing) { + float len = Math.min(maxX - x, height); + cs.moveTo(x, minY); + cs.lineTo(x + len, minY + len); + } + } + + cs.stroke(); + } + } + + page.getAnnotations().remove(highlight); + COSDictionary pageDict = page.getCOSObject(); + + if (pageDict.containsKey(COSName.GROUP)) { + COSDictionary groupDict = + (COSDictionary) pageDict.getDictionaryObject(COSName.GROUP); + + if (groupDict != null) { + if (COSName.TRANSPARENCY + .getName() + .equalsIgnoreCase(groupDict.getNameAsString(COSName.S))) { + pageDict.removeItem(COSName.GROUP); + } + } + } + } + } + } + // Save the modified document to a temporary file. 
+ File preProcessedFile = Files.createTempFile("preprocessed_", ".pdf").toFile(); + document.save(preProcessedFile); + return preProcessedFile; + } + } } diff --git a/stirling-pdf/src/main/resources/icc/sRGB2014.icc b/stirling-pdf/src/main/resources/icc/sRGB2014.icc new file mode 100644 index 0000000000000000000000000000000000000000..49afbfef10f22a1832590b68369d2f248ea553b9 GIT binary patch literal 3024 zcmb`Jc{r5o8^@pboqe;-klom~#=Z=)?<7n1RL0C;EQ4W?v`H$Qlq6e;oU(N2=!6`p zq_j9fq0&N*O8IqkN}I~>9j@P{b6vkb&vRYx^M3C8x$pP6pZoda{Q^K51jvAqCy}2f z2yl0zhlYjIaZeGKxM&3c7CSY0nf@_DE7pfmuw>n3hyBge}>zEF^|hhw$p<`Vm43NktlHVq|Q#Wc`bi=uVbDr*Q%R@mv7f?y!Y| z^kpAf^uhola$__g2b6(2&;bl!0xW?IZ~(5r3;2RS5C%2@Hi!j@Kmam8HrNI7Kmj-i zj(`eK4eCGxXa=pI9dv;!;5xVs2Ehmz2NPf#yasdN16Y6{2nSIhDkKM~K$?&~WCAfE zJIEDU3k5)7P$U!s@gX6U4ef>spkk;3s(~7yU!e=o73d~31U-Nzp&96J=nIU3$uJF8 zg0)~nm^L%}Fw^fA^LPfRE#29trw!<1r9Va{W&VMZ|1m=9PiRtBq$wZwX0 z!?1DKt=K~BF>DL=GIj_%g`LOYaB?_(oGs25$HJxI@^Iz2Gq_8*VcazC6P|=u!JFXS z@ZoqqJ_lclZ^U=whw(4)3j_&*Cc&EEOW+W;5Q+$OgigX8!ZcxlC`r^N+7bhaal~E3 zGGa6F8u1bF9f?FzBUzFBNj%a{QW@zi=>}<%^qDM0)+0NUBgjJX0rF|`W%2{^I|_xO zMRA~nQ_?60C=HaWlqZx=VpK5$F;6j$*bcEuu{N<`u{YubaZPbY@lE1c;-%u}#P5jD zN)RNpB%CE!65AyzB`!#eNz6-9C5m zB-1K0D)VKP(kjPQ+*SKmHLn_8^-)$q);f{g-OAzz_Y;h`d|sHYg9xK;6_V!z_NlCqM!QnFIH(p9BdWf^4$ z67?SSISmyJAB}8{CXI)h1Wl%9tmaY8KFyC>+FBu6d$roNUTVu~dunHC zH)%i8q3GD_r0CS@+|$MCGIis1kLeET!FuL;v3iwycl2R>3w@scG5w*{nAKLR`KxPJ zk1@y$M@BlMi7{y)W3bjB$DrNdjiH8NxZxqgKEv-u=0*udbw=aHQpR4!ImVsFf1Bu; zuuUpW?wL|d-As3wc9_03(>LRq9XGpgPBr&2-)r7u{>{SDLSWHsF=MG=8EIK%ImV{PDV}wr}Iu9ovod>IbU``xwyOJy9~HW zxdypbxIS@HbBl3na+`BEci-xM*#qO@?QzIs%u~se?b+Zt=Vj@&&8yd&?7iN*!u#1; zy|se1oj$OSm(O9JN9#1#@z=Hc0$)$x!@iIGwEa^2e)q@v`}tS;KMybt$PVaRPhG!x zedGEMflh%%f#X3sLBgP(VDaFH;D+FjAub`sArqm7q1!@lhslTW!aBln;lbgj!sj=* zZaA`GI>J06FJg3~_QuSOH#f;|O4xL9v-oD#=5vvl$dJg!$geD4RxN8j$}_4eYL4y9 zKFWU0ap072X1KQ8V(yD*+vwuxmoc_6hht`9?PE)0XL-)N3f|i|kGSf%kMX|or{fnB zLK0dM@rjX%7x+^Acz$n^a#Ci}P_lklKhhQM>Ze1S!z~VeUx}qcyyv{ZCOXTM)|?=uNAQsBb82-EmewD`>Q@4 
z;~X14?r^-hTB*9A`pXI4iTgF~HEp$8wWTMqC(}uM0h$Hl62xH~9T@mugq#Md^!0-Nf$P?!`-4 zm*y`gU!J`Zb7iV$bIqdq~gGTRGoA)2d|5UxvdGp&}4uAE}h0aaC6}(;i zyYQXdyVLK@-uKM=%|H2&_+jB={wKLl^`Dua`@V#Hd9jf375BC5o9?&H@7~`ZEha85 z{-8k&JYAjX7RFW<77P=HG2Mk5%@QW0(M8J6IVmAYD4?%TX0f?+23;gpmIcJWHm~TE zsB!?>_W&UKaK(pgBT{F`Sk`1q_=ApIvi~>1Kja-poFc8Ycg2@f3jlK-0Mx-$UJPB7 zEaT{Co~CjhDoy^Z4|Cv`LizZ;q8ZSF~{&Hxtp1 zNS#T^TLiqA*fhE)KaDHkvqTlK5|(a9AgVDnNsz`9Ca$I)0svP6z_+5s#f6&1#cxP2P~!kx7XBBF2+<<| literal 0 HcmV?d00001