Merge branch 'main' into audit2

This commit is contained in:
Anthony Stirling 2025-06-17 17:41:03 +01:00 committed by GitHub
commit 911c894023
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 666 additions and 44 deletions

View File

@ -38,7 +38,8 @@ jobs:
pr_ref: ${{ steps.get-pr-info.outputs.ref }} pr_ref: ${{ steps.get-pr-info.outputs.ref }}
comment_id: ${{ github.event.comment.id }} comment_id: ${{ github.event.comment.id }}
disable_security: ${{ steps.check-security-flag.outputs.disable_security }} disable_security: ${{ steps.check-security-flag.outputs.disable_security }}
enable_pro: ${{ steps.check-pro-flag.outputs.enable_pro }}
enable_enterprise: ${{ steps.check-pro-flag.outputs.enable_enterprise }}
steps: steps:
- name: Harden Runner - name: Harden Runner
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1 uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
@ -98,6 +99,25 @@ jobs:
echo "disable_security=true" >> $GITHUB_OUTPUT echo "disable_security=true" >> $GITHUB_OUTPUT
fi fi
- name: Check for pro flag
id: check-pro-flag
env:
COMMENT_BODY: ${{ github.event.comment.body }}
run: |
if [[ "$COMMENT_BODY" == *"pro"* ]] || [[ "$COMMENT_BODY" == *"premium"* ]]; then
echo "pro flags detected in comment"
echo "enable_pro=true" >> $GITHUB_OUTPUT
echo "enable_enterprise=false" >> $GITHUB_OUTPUT
elif [[ "$COMMENT_BODY" == *"enterprise"* ]]; then
echo "enterprise flags detected in comment"
echo "enable_enterprise=true" >> $GITHUB_OUTPUT
echo "enable_pro=false" >> $GITHUB_OUTPUT
else
echo "No pro or enterprise flags detected in comment"
echo "enable_pro=false" >> $GITHUB_OUTPUT
echo "enable_enterprise=false" >> $GITHUB_OUTPUT
fi
- name: Add 'in_progress' reaction to comment - name: Add 'in_progress' reaction to comment
id: add-eyes-reaction id: add-eyes-reaction
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
@ -209,6 +229,21 @@ jobs:
SECURITY_STATUS="Security Disabled" SECURITY_STATUS="Security Disabled"
fi fi
# Set pro/enterprise settings (enterprise implies pro)
if [ "${{ needs.check-comment.outputs.enable_enterprise }}" == "true" ]; then
PREMIUM_ENABLED="true"
PREMIUM_KEY="${{ secrets.ENTERPRISE_KEY }}"
PREMIUM_PROFEATURES_AUDIT_ENABLED="true"
elif [ "${{ needs.check-comment.outputs.enable_pro }}" == "true" ]; then
PREMIUM_ENABLED="true"
PREMIUM_KEY="${{ secrets.PREMIUM_KEY }}"
PREMIUM_PROFEATURES_AUDIT_ENABLED="true"
else
PREMIUM_ENABLED="false"
PREMIUM_KEY=""
PREMIUM_PROFEATURES_AUDIT_ENABLED="false"
fi
# First create the docker-compose content locally # First create the docker-compose content locally
cat > docker-compose.yml << EOF cat > docker-compose.yml << EOF
version: '3.3' version: '3.3'
@ -232,6 +267,9 @@ jobs:
SYSTEM_MAXFILESIZE: "100" SYSTEM_MAXFILESIZE: "100"
METRICS_ENABLED: "true" METRICS_ENABLED: "true"
SYSTEM_GOOGLEVISIBILITY: "false" SYSTEM_GOOGLEVISIBILITY: "false"
PREMIUM_KEY: "${PREMIUM_KEY}"
PREMIUM_ENABLED: "${PREMIUM_ENABLED}"
PREMIUM_PROFEATURES_AUDIT_ENABLED: "${PREMIUM_PROFEATURES_AUDIT_ENABLED}"
restart: on-failure:5 restart: on-failure:5
EOF EOF

View File

@ -61,8 +61,16 @@ Make sure to place the entry under the correct language section. This helps main
#### Windows command #### Windows command
```ps ```powershell
python .github/scripts/check_language_properties.py --reference-file src\main\resources\messages_en_GB.properties --branch "" --files src\main\resources\messages_pl_PL.properties python .github/scripts/check_language_properties.py --reference-file stirling-pdf\src\main\resources\messages_en_GB.properties --branch "" --files stirling-pdf\src\main\resources\messages_pl_PL.properties
python .github/scripts/check_language_properties.py --reference-file src\main\resources\messages_en_GB.properties --branch "" --check-file src\main\resources\messages_pl_PL.properties python .github/scripts/check_language_properties.py --reference-file stirling-pdf\src\main\resources\messages_en_GB.properties --branch "" --check-file stirling-pdf\src\main\resources\messages_pl_PL.properties
```
#### Linux command
```bash
python3 .github/scripts/check_language_properties.py --reference-file stirling-pdf/src/main/resources/messages_en_GB.properties --branch "" --files stirling-pdf/src/main/resources/messages_pl_PL.properties
python3 .github/scripts/check_language_properties.py --reference-file stirling-pdf/src/main/resources/messages_en_GB.properties --branch "" --check-file stirling-pdf/src/main/resources/messages_pl_PL.properties
``` ```

View File

@ -1,16 +1,61 @@
package stirling.software.SPDF.controller.api.converters; package stirling.software.SPDF.controller.api.converters;
import java.awt.Color;
import java.io.ByteArrayOutputStream;
import io.github.pixee.security.Filenames; import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag; import io.swagger.v3.oas.annotations.tags.Tag;
import java.io.File; import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.TimeZone;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.pdmodel.interactive.viewerpreferences.PDViewerPreferences;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.schema.AdobePDFSchema;
import org.apache.xmpbox.schema.DublinCoreSchema;
import org.apache.xmpbox.schema.PDFAIdentificationSchema;
import org.apache.xmpbox.schema.XMPBasicSchema;
import org.apache.xmpbox.xml.DomXmpParser;
import org.apache.xmpbox.xml.XmpSerializer;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
@ -56,54 +101,37 @@ public class ConvertPDFToPDFA {
: originalFileName; : originalFileName;
Path tempInputFile = null; Path tempInputFile = null;
Path tempOutputDir = null;
byte[] fileBytes; byte[] fileBytes;
Path loPdfPath = null; // Used for LibreOffice conversion output
File preProcessedFile = null;
int pdfaPart = 2;
try { try {
// Save uploaded file to temp location // Save uploaded file to temp location
tempInputFile = Files.createTempFile("input_", ".pdf"); tempInputFile = Files.createTempFile("input_", ".pdf");
inputFile.transferTo(tempInputFile); inputFile.transferTo(tempInputFile);
// Create temp output directory // Branch conversion based on desired output PDF/A format
tempOutputDir = Files.createTempDirectory("output_"); if ("pdfa".equals(outputFormat)) {
preProcessedFile = tempInputFile.toFile();
// Determine PDF/A filter based on requested format } else {
String pdfFilter = pdfaPart = 1;
"pdfa".equals(outputFormat) preProcessedFile = preProcessHighlights(tempInputFile.toFile());
? "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"2\"}}"
: "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"1\"}}";
// Prepare LibreOffice command
List<String> command =
new ArrayList<>(
Arrays.asList(
"soffice",
"--headless",
"--nologo",
"--convert-to",
pdfFilter,
"--outdir",
tempOutputDir.toString(),
tempInputFile.toString()));
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(command);
if (returnCode.getRc() != 0) {
log.error("PDF/A conversion failed with return code: {}", returnCode.getRc());
throw new RuntimeException("PDF/A conversion failed");
} }
Set<String> missingFonts = new HashSet<>();
// Get the output file boolean needImgs = false;
File[] outputFiles = tempOutputDir.toFile().listFiles(); try (PDDocument doc = Loader.loadPDF(preProcessedFile)) {
if (outputFiles == null || outputFiles.length != 1) { missingFonts = findUnembeddedFontNames(doc);
throw new RuntimeException( needImgs = (pdfaPart == 1) && hasTransparentImages(doc);
"Expected exactly one output file but found " if (!missingFonts.isEmpty() || needImgs) {
+ (outputFiles == null ? "none" : outputFiles.length)); // Run LibreOffice conversion to get flattened images and embedded fonts
loPdfPath = runLibreOfficeConversion(preProcessedFile.toPath(), pdfaPart);
}
} }
fileBytes =
convertToPdfA(
preProcessedFile.toPath(), loPdfPath, pdfaPart, missingFonts, needImgs);
fileBytes = FileUtils.readFileToByteArray(outputFiles[0]);
String outputFilename = baseFileName + "_PDFA.pdf"; String outputFilename = baseFileName + "_PDFA.pdf";
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(
@ -114,9 +142,557 @@ public class ConvertPDFToPDFA {
if (tempInputFile != null) { if (tempInputFile != null) {
Files.deleteIfExists(tempInputFile); Files.deleteIfExists(tempInputFile);
} }
if (tempOutputDir != null) { if (loPdfPath != null && loPdfPath.getParent() != null) {
FileUtils.deleteDirectory(tempOutputDir.toFile()); FileUtils.deleteDirectory(loPdfPath.getParent().toFile());
}
if (preProcessedFile != null) {
Files.deleteIfExists(preProcessedFile.toPath());
} }
} }
} }
    /**
     * Merge fonts & flattened images from loPdfPath into basePdfPath, then run the standard
     * PDFBox PDF/A pipeline (sanitize, XMP metadata, ICC output intent).
     *
     * @param basePdfPath Path to the original (or highlight-preprocessed) PDF
     * @param loPdfPath Path to the LibreOffice-flattened PDF/A, or null if not used
     * @param pdfaPart 1 (PDF/A-1B) or 2 (PDF/A-2B)
     * @param missingFonts PostScript names of fonts the base document references but does not
     *     embed; when non-empty and loPdfPath is set, their programs are copied from the LO doc
     * @param importImages when true, transparent images in the base doc are replaced with the
     *     flattened versions rendered by LibreOffice
     * @return the final PDF/A bytes
     */
    private byte[] convertToPdfA(
            Path basePdfPath,
            Path loPdfPath,
            int pdfaPart,
            Set<String> missingFonts,
            boolean importImages)
            throws Exception {
        try (PDDocument baseDoc = Loader.loadPDF(basePdfPath.toFile())) {
            if (loPdfPath != null) {
                try (PDDocument loDoc = Loader.loadPDF(loPdfPath.toFile())) {
                    if (!missingFonts.isEmpty()) {
                        embedMissingFonts(loDoc, baseDoc, missingFonts);
                    }
                    if (importImages) {
                        importFlattenedImages(loDoc, baseDoc);
                    }
                }
            }
            return processWithPDFBox(baseDoc, pdfaPart);
        }
    }
    /**
     * Applies the in-process PDF/A steps to an open document and serializes it: removes
     * forbidden elements, writes PDF/A identification XMP, adds an sRGB output intent if
     * absent, then saves with the PDF version matching the requested conformance part.
     *
     * @param document the document to convert (mutated in place)
     * @param pdfaPart 1 (saved as PDF 1.4, uncompressed xref) or 2 (saved as PDF 1.7)
     * @return the serialized PDF/A bytes
     */
    private byte[] processWithPDFBox(PDDocument document, int pdfaPart) throws Exception {
        removeElementsForPdfA(document, pdfaPart);
        mergeAndAddXmpMetadata(document, pdfaPart);
        addICCProfileIfNotPresent(document);
        // Mark the document as PDF/A
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        catalog.setMetadata(
                document.getDocumentCatalog().getMetadata()); // Ensure metadata is linked
        catalog.setViewerPreferences(
                new PDViewerPreferences(catalog.getCOSObject())); // PDF/A best practice
        document.getDocument().setVersion(pdfaPart == 1 ? 1.4f : 1.7f);
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        if (pdfaPart == 1) {
            // PDF/A-1 is based on PDF 1.4, which predates object/xref streams, so save
            // without compressed structures.
            document.save(baos, CompressParameters.NO_COMPRESSION);
        } else {
            document.save(baos);
        }
        return baos.toByteArray();
    }
    /**
     * Converts a PDF to PDF/A with a headless LibreOffice process.
     *
     * @param tempInputFile the PDF to convert
     * @param pdfaPart 2 selects PDF/A-2, anything else PDF/A-1 (via LibreOffice's
     *     {@code SelectPdfVersion} export option)
     * @return path to the single converted PDF inside a freshly created temp directory;
     *     the caller is responsible for deleting that directory
     * @throws RuntimeException if LibreOffice exits non-zero or does not produce exactly one file
     */
    private Path runLibreOfficeConversion(Path tempInputFile, int pdfaPart) throws Exception {
        // Create temp output directory
        Path tempOutputDir = Files.createTempDirectory("output_");

        // Determine PDF/A filter based on requested format
        String pdfFilter =
                pdfaPart == 2
                        ? "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"2\"}}"
                        : "pdf:writer_pdf_Export:{\"SelectPdfVersion\":{\"type\":\"long\",\"value\":\"1\"}}";

        // Prepare LibreOffice command
        List<String> command =
                new ArrayList<>(
                        Arrays.asList(
                                "soffice",
                                "--headless",
                                "--nologo",
                                "--convert-to",
                                pdfFilter,
                                "--outdir",
                                tempOutputDir.toString(),
                                tempInputFile.toString()));

        ProcessExecutorResult returnCode =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
                        .runCommandWithOutputHandling(command);

        if (returnCode.getRc() != 0) {
            log.error("PDF/A conversion failed with return code: {}", returnCode.getRc());
            throw new RuntimeException("PDF/A conversion failed");
        }

        // Get the output file; LibreOffice writes exactly one PDF into the fresh directory
        File[] outputFiles = tempOutputDir.toFile().listFiles();
        if (outputFiles == null || outputFiles.length != 1) {
            throw new RuntimeException(
                    "Expected one output PDF, found "
                            + (outputFiles == null ? "none" : outputFiles.length));
        }
        return outputFiles[0].toPath();
    }
private void embedMissingFonts(PDDocument loDoc, PDDocument baseDoc, Set<String> missingFonts)
throws IOException {
List<PDPage> loPages = new ArrayList<>();
loDoc.getPages().forEach(loPages::add);
List<PDPage> basePages = new ArrayList<>();
baseDoc.getPages().forEach(basePages::add);
for (int i = 0; i < loPages.size(); i++) {
PDResources loRes = loPages.get(i).getResources();
PDResources baseRes = basePages.get(i).getResources();
for (COSName fontKey : loRes.getFontNames()) {
PDFont loFont = loRes.getFont(fontKey);
if (loFont == null) continue;
String psName = loFont.getName();
if (!missingFonts.contains(psName)) continue;
PDFontDescriptor desc = loFont.getFontDescriptor();
if (desc == null) continue;
PDStream fontStream = null;
if (desc.getFontFile() != null) {
fontStream = desc.getFontFile();
} else if (desc.getFontFile2() != null) {
fontStream = desc.getFontFile2();
} else if (desc.getFontFile3() != null) {
fontStream = desc.getFontFile3();
}
if (fontStream == null) continue;
try (InputStream in = fontStream.createInputStream()) {
PDFont newFont = null;
try {
newFont = PDType0Font.load(baseDoc, in, false);
} catch (IOException e1) {
try {
newFont = PDTrueTypeFont.load(baseDoc, in, null);
} catch (IOException | IllegalArgumentException e2) {
log.error("Could not embed font {}: {}", psName, e2.getMessage());
continue;
}
}
if (newFont != null) {
baseRes.put(fontKey, newFont);
}
}
}
}
}
private Set<String> findUnembeddedFontNames(PDDocument doc) throws IOException {
Set<String> missing = new HashSet<>();
for (PDPage page : doc.getPages()) {
PDResources res = page.getResources();
for (COSName name : res.getFontNames()) {
PDFont font = res.getFont(name);
if (font != null && !font.isEmbedded()) {
missing.add(font.getName());
}
}
}
return missing;
}
private void importFlattenedImages(PDDocument loDoc, PDDocument baseDoc) throws IOException {
List<PDPage> loPages = new ArrayList<>();
loDoc.getPages().forEach(loPages::add);
List<PDPage> basePages = new ArrayList<>();
baseDoc.getPages().forEach(basePages::add);
for (int i = 0; i < loPages.size(); i++) {
PDPage loPage = loPages.get(i);
PDPage basePage = basePages.get(i);
PDResources loRes = loPage.getResources();
PDResources baseRes = basePage.getResources();
Set<COSName> toReplace = detectTransparentXObjects(basePage);
for (COSName name : toReplace) {
PDXObject loXo = loRes.getXObject(name);
if (!(loXo instanceof PDImageXObject img)) continue;
PDImageXObject newImg = LosslessFactory.createFromImage(baseDoc, img.getImage());
// replace the resource under the same name
baseRes.put(name, newImg);
}
}
}
private Set<COSName> detectTransparentXObjects(PDPage page) {
Set<COSName> transparentObjects = new HashSet<>();
PDResources res = page.getResources();
if (res == null) return transparentObjects;
for (COSName name : res.getXObjectNames()) {
try {
PDXObject xo = res.getXObject(name);
if (xo instanceof PDImageXObject img) {
COSDictionary d = img.getCOSObject();
if (d.containsKey(COSName.SMASK)
|| isTransparencyGroup(d)
|| d.getBoolean(COSName.INTERPOLATE, false)) {
transparentObjects.add(name);
}
}
} catch (IOException ioe) {
log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage());
}
}
return transparentObjects;
}
private boolean isTransparencyGroup(COSDictionary dict) {
COSBase g = dict.getDictionaryObject(COSName.GROUP);
return g instanceof COSDictionary gd
&& COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S));
}
private boolean hasTransparentImages(PDDocument doc) {
for (PDPage page : doc.getPages()) {
PDResources res = page.getResources();
if (res == null) continue;
for (COSName name : res.getXObjectNames()) {
try {
PDXObject xo = res.getXObject(name);
if (xo instanceof PDImageXObject img) {
COSDictionary dict = img.getCOSObject();
if (dict.containsKey(COSName.SMASK)) return true;
COSBase g = dict.getDictionaryObject(COSName.GROUP);
if (g instanceof COSDictionary gd
&& COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S))) {
return true;
}
if (dict.getBoolean(COSName.INTERPOLATE, false)) return true;
}
} catch (IOException ioe) {
log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage());
}
}
}
return false;
}
    /**
     * Recursively strips PDF constructs forbidden by PDF/A from a COS object tree:
     * transparency entries (PDF/A-1 only), forced interpolation, and action/embedding
     * features such as JavaScript, multimedia, Launch/GoToR actions and embedded files.
     *
     * @param base the COS object to sanitize; dictionaries and arrays are walked recursively,
     *     other node types are left untouched
     * @param resources the page's resources — NOTE(review): currently unused by this method;
     *     kept so existing call sites compile unchanged
     * @param pdfaPart 1 additionally removes transparency (/Group, /SMask, /CA, /ca)
     */
    private void sanitizePdfA(COSBase base, PDResources resources, int pdfaPart) {
        if (base instanceof COSDictionary dict) {
            if (pdfaPart == 1) {
                // Remove transparency-related elements
                COSBase group = dict.getDictionaryObject(COSName.GROUP);
                if (group instanceof COSDictionary gDict
                        && COSName.TRANSPARENCY.equals(gDict.getCOSName(COSName.S))) {
                    dict.removeItem(COSName.GROUP);
                }
                dict.removeItem(COSName.SMASK);
                // Transparency blending constants (/CA, /ca) disallowed in PDF/A-1
                dict.removeItem(COSName.CA);
                dict.removeItem(COSName.getPDFName("ca"));
            }
            // Interpolation (non-deterministic image scaling) required to be false
            if (dict.containsKey(COSName.INTERPOLATE)
                    && dict.getBoolean(COSName.INTERPOLATE, true)) {
                dict.setBoolean(COSName.INTERPOLATE, false);
            }
            // Remove common forbidden features (for PDF/A 1 and 2)
            dict.removeItem(COSName.JAVA_SCRIPT);
            dict.removeItem(COSName.getPDFName("JS"));
            dict.removeItem(COSName.getPDFName("RichMedia"));
            dict.removeItem(COSName.getPDFName("Movie"));
            dict.removeItem(COSName.getPDFName("Sound"));
            dict.removeItem(COSName.getPDFName("Launch"));
            dict.removeItem(COSName.URI);
            dict.removeItem(COSName.getPDFName("GoToR"));
            dict.removeItem(COSName.EMBEDDED_FILES);
            dict.removeItem(COSName.FILESPEC);
            // Recurse through all entries in the dictionary
            for (Map.Entry<COSName, COSBase> entry : dict.entrySet()) {
                sanitizePdfA(entry.getValue(), resources, pdfaPart);
            }
        } else if (base instanceof COSArray arr) {
            // Recursively sanitize each item in the array
            for (COSBase item : arr) {
                sanitizePdfA(item, resources, pdfaPart);
            }
        }
    }
    /**
     * Removes document- and page-level features that violate PDF/A, then sanitizes each
     * page's dictionary and its form/image XObjects via {@link #sanitizePdfA}.
     *
     * <p>For PDF/A-1 this also drops optional content (layers) from the catalog and clears
     * all annotations from every page.
     *
     * @param doc the document to clean (mutated in place)
     * @param pdfaPart 1 applies the stricter PDF/A-1 removals; 2 keeps annotations/layers
     */
    private void removeElementsForPdfA(PDDocument doc, int pdfaPart) {
        if (pdfaPart == 1) {
            // Remove Optional Content (Layers) - not allowed in PDF/A-1
            doc.getDocumentCatalog().getCOSObject().removeItem(COSName.getPDFName("OCProperties"));
        }

        for (PDPage page : doc.getPages()) {
            if (pdfaPart == 1) {
                page.setAnnotations(Collections.emptyList());
            }
            PDResources res = page.getResources();

            // Clean page-level dictionary
            sanitizePdfA(page.getCOSObject(), res, pdfaPart);

            // sanitize each Form XObject
            if (res != null) {
                for (COSName name : res.getXObjectNames()) {
                    try {
                        PDXObject xo = res.getXObject(name);
                        if (xo instanceof PDFormXObject form) {
                            sanitizePdfA(form.getCOSObject(), res, pdfaPart);
                        } else if (xo instanceof PDImageXObject img) {
                            sanitizePdfA(img.getCOSObject(), res, pdfaPart);
                        }
                    } catch (IOException ioe) {
                        // Skip unreadable XObjects rather than aborting the whole conversion
                        log.error("Cannot load XObject {}: {}", name.getName(), ioe.getMessage());
                    }
                }
            }
        }
    }
    /**
     * Embeds the XMP metadata required for PDF/A compliance, merging any existing XMP
     * packet with the document information dictionary so the two stay consistent (a PDF/A
     * requirement), and stamps the PDF/A identification schema (part + conformance "B").
     *
     * @param document the document whose catalog metadata is replaced (mutated in place)
     * @param pdfaPart the PDF/A part (1 or 2) written into the identification schema
     */
    private void mergeAndAddXmpMetadata(PDDocument document, int pdfaPart) throws Exception {
        PDMetadata existingMetadata = document.getDocumentCatalog().getMetadata();
        XMPMetadata xmp;

        // Load existing XMP if available; fall back to a fresh packet on any parse failure
        if (existingMetadata != null) {
            try (InputStream xmpStream = existingMetadata.createInputStream()) {
                DomXmpParser parser = new DomXmpParser();
                parser.setStrictParsing(false);
                xmp = parser.parse(xmpStream);
            } catch (Exception e) {
                xmp = XMPMetadata.createXMPMetadata();
            }
        } else {
            xmp = XMPMetadata.createXMPMetadata();
        }

        PDDocumentInformation docInfo = document.getDocumentInformation();
        if (docInfo == null) {
            docInfo = new PDDocumentInformation();
        }
        String originalCreator = Optional.ofNullable(docInfo.getCreator()).orElse("Unknown");
        String originalProducer = Optional.ofNullable(docInfo.getProducer()).orElse("Unknown");

        // Only keep the original creator so it can match xmp creator tool for compliance
        DublinCoreSchema dcSchema = xmp.getDublinCoreSchema();
        if (dcSchema != null) {
            List<String> existingCreators = dcSchema.getCreators();
            if (existingCreators != null) {
                // Copy before removing to avoid mutating the list while iterating it
                for (String creator : new ArrayList<>(existingCreators)) {
                    dcSchema.removeCreator(creator);
                }
            }
        } else {
            dcSchema = xmp.createAndAddDublinCoreSchema();
        }
        dcSchema.addCreator(originalCreator);

        // PDF/A identification schema: part (1 or 2) + conformance level B
        PDFAIdentificationSchema pdfaSchema =
                (PDFAIdentificationSchema) xmp.getSchema(PDFAIdentificationSchema.class);
        if (pdfaSchema == null) {
            pdfaSchema = xmp.createAndAddPDFAIdentificationSchema();
        }
        pdfaSchema.setPart(pdfaPart);
        pdfaSchema.setConformance("B");

        XMPBasicSchema xmpBasicSchema = xmp.getXMPBasicSchema();
        if (xmpBasicSchema == null) {
            xmpBasicSchema = xmp.createAndAddXMPBasicSchema();
        }
        AdobePDFSchema adobePdfSchema = xmp.getAdobePDFSchema();
        if (adobePdfSchema == null) {
            adobePdfSchema = xmp.createAndAddAdobePDFSchema();
        }

        // Mirror creator/producer in both the info dictionary and XMP so they match
        docInfo.setCreator(originalCreator);
        xmpBasicSchema.setCreatorTool(originalCreator);
        docInfo.setProducer(originalProducer);
        adobePdfSchema.setProducer(originalProducer);

        String originalAuthor = docInfo.getAuthor();
        if (originalAuthor != null && !originalAuthor.isBlank()) {
            docInfo.setAuthor(null);
            // If the author is set, we keep it in the XMP metadata
            if (!originalCreator.equals(originalAuthor)) {
                dcSchema.addCreator(originalAuthor);
            }
        }
        String title = docInfo.getTitle();
        if (title != null && !title.isBlank()) {
            dcSchema.setTitle(title);
        }
        String subject = docInfo.getSubject();
        if (subject != null && !subject.isBlank()) {
            dcSchema.addSubject(subject);
        }
        String keywords = docInfo.getKeywords();
        if (keywords != null && !keywords.isBlank()) {
            adobePdfSchema.setKeywords(keywords);
        }

        // Set creation and modification dates; keep the original creation date if present
        Calendar now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
        Calendar originalCreationDate = docInfo.getCreationDate();
        if (originalCreationDate == null) {
            originalCreationDate = now;
        }
        docInfo.setCreationDate(originalCreationDate);
        xmpBasicSchema.setCreateDate(originalCreationDate);
        docInfo.setModificationDate(now);
        xmpBasicSchema.setModifyDate(now);
        xmpBasicSchema.setMetadataDate(now);

        // Serialize the created metadata so it can be attached to the existent metadata
        ByteArrayOutputStream xmpOut = new ByteArrayOutputStream();
        new XmpSerializer().serialize(xmp, xmpOut, true);
        PDMetadata newMetadata = new PDMetadata(document);
        newMetadata.importXMPMetadata(xmpOut.toByteArray());
        document.getDocumentCatalog().setMetadata(newMetadata);
    }
private void addICCProfileIfNotPresent(PDDocument document) throws Exception {
if (document.getDocumentCatalog().getOutputIntents().isEmpty()) {
try (InputStream colorProfile = getClass().getResourceAsStream("/icc/sRGB2014.icc")) {
PDOutputIntent outputIntent = new PDOutputIntent(document, colorProfile);
outputIntent.setInfo("sRGB IEC61966-2.1");
outputIntent.setOutputCondition("sRGB IEC61966-2.1");
outputIntent.setOutputConditionIdentifier("sRGB IEC61966-2.1");
outputIntent.setRegistryName("http://www.color.org");
document.getDocumentCatalog().addOutputIntent(outputIntent);
} catch (Exception e) {
log.error("Failed to load ICC profile: {}", e.getMessage());
}
}
}
private File preProcessHighlights(File inputPdf) throws Exception {
try (PDDocument document = Loader.loadPDF(inputPdf)) {
for (PDPage page : document.getPages()) {
// Retrieve the annotations on the page.
List<PDAnnotation> annotations = page.getAnnotations();
for (PDAnnotation annot : annotations) {
// Process only highlight annotations.
if ("Highlight".equals(annot.getSubtype())
&& annot instanceof PDAnnotationTextMarkup highlight) {
// Create a new appearance stream with the same bounding box.
float[] colorComponents =
highlight.getColor() != null
? highlight.getColor().getComponents()
: new float[] {1f, 1f, 0f};
Color highlightColor =
new Color(
colorComponents[0], colorComponents[1], colorComponents[2]);
float[] quadPoints = highlight.getQuadPoints();
if (quadPoints != null) {
try (PDPageContentStream cs =
new PDPageContentStream(
document,
page,
PDPageContentStream.AppendMode.PREPEND,
true,
true)) {
cs.setStrokingColor(highlightColor);
cs.setLineWidth(0.05f);
float spacing = 2f;
// Draw diagonal lines across the highlight area to simulate
// transparency.
for (int i = 0; i < quadPoints.length; i += 8) {
float minX =
Math.min(
Math.min(quadPoints[i], quadPoints[i + 2]),
Math.min(quadPoints[i + 4], quadPoints[i + 6]));
float maxX =
Math.max(
Math.max(quadPoints[i], quadPoints[i + 2]),
Math.max(quadPoints[i + 4], quadPoints[i + 6]));
float minY =
Math.min(
Math.min(quadPoints[i + 1], quadPoints[i + 3]),
Math.min(quadPoints[i + 5], quadPoints[i + 7]));
float maxY =
Math.max(
Math.max(quadPoints[i + 1], quadPoints[i + 3]),
Math.max(quadPoints[i + 5], quadPoints[i + 7]));
float width = maxX - minX;
float height = maxY - minY;
for (float y = minY; y <= maxY; y += spacing) {
float len = Math.min(width, maxY - y);
cs.moveTo(minX, y);
cs.lineTo(minX + len, y + len);
}
for (float x = minX + spacing; x <= maxX; x += spacing) {
float len = Math.min(maxX - x, height);
cs.moveTo(x, minY);
cs.lineTo(x + len, minY + len);
}
}
cs.stroke();
}
}
page.getAnnotations().remove(highlight);
COSDictionary pageDict = page.getCOSObject();
if (pageDict.containsKey(COSName.GROUP)) {
COSDictionary groupDict =
(COSDictionary) pageDict.getDictionaryObject(COSName.GROUP);
if (groupDict != null) {
if (COSName.TRANSPARENCY
.getName()
.equalsIgnoreCase(groupDict.getNameAsString(COSName.S))) {
pageDict.removeItem(COSName.GROUP);
}
}
}
}
}
}
// Save the modified document to a temporary file.
File preProcessedFile = Files.createTempFile("preprocessed_", ".pdf").toFile();
document.save(preProcessedFile);
return preProcessedFile;
}
}
} }

Binary file not shown.