Sebastian Espei 8cbb4367ab
PDF-to-Image different page formats fix (#1460)
* Improve the PDF rendering process for pages of varying sizes

This commit includes changes to handle the rendering of PDF documents with pages of different sizes. The updated code calculates the dimensions of each page upfront and assembles a final combined image that accommodates for the differing page dimensions. This approach avoids repetitive renderings of the same page sizes.

* Refactor image preparation for Pdf to Image
2024-06-14 23:39:30 +01:00

515 lines
22 KiB
Java

package stirling.software.SPDF.utils;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.awt.image.RenderedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import javax.imageio.*;
import javax.imageio.stream.ImageOutputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.web.multipart.MultipartFile;
import io.github.pixee.security.Filenames;
import stirling.software.SPDF.model.PdfMetadata;
public class PdfUtils {
private static final Logger logger = LoggerFactory.getLogger(PdfUtils.class);
public static PDRectangle textToPageSize(String size) {
switch (size.toUpperCase()) {
case "A0":
return PDRectangle.A0;
case "A1":
return PDRectangle.A1;
case "A2":
return PDRectangle.A2;
case "A3":
return PDRectangle.A3;
case "A4":
return PDRectangle.A4;
case "A5":
return PDRectangle.A5;
case "A6":
return PDRectangle.A6;
case "LETTER":
return PDRectangle.LETTER;
case "LEGAL":
return PDRectangle.LEGAL;
default:
throw new IllegalArgumentException("Invalid standard page size: " + size);
}
}
public static List<RenderedImage> getAllImages(PDResources resources) throws IOException {
List<RenderedImage> images = new ArrayList<>();
for (COSName name : resources.getXObjectNames()) {
PDXObject object = resources.getXObject(name);
if (object instanceof PDImageXObject) {
images.add(((PDImageXObject) object).getImage());
} else if (object instanceof PDFormXObject) {
images.addAll(getAllImages(((PDFormXObject) object).getResources()));
}
}
return images;
}
public static boolean hasImages(PDDocument document, String pagesToCheck) throws IOException {
String[] pageOrderArr = pagesToCheck.split(",");
List<Integer> pageList =
GeneralUtils.parsePageList(pageOrderArr, document.getNumberOfPages());
for (int pageNumber : pageList) {
PDPage page = document.getPage(pageNumber);
if (hasImagesOnPage(page)) {
return true;
}
}
return false;
}
public static boolean hasText(PDDocument document, String pageNumbersToCheck, String phrase)
throws IOException {
String[] pageOrderArr = pageNumbersToCheck.split(",");
List<Integer> pageList =
GeneralUtils.parsePageList(pageOrderArr, document.getNumberOfPages());
for (int pageNumber : pageList) {
PDPage page = document.getPage(pageNumber);
if (hasTextOnPage(page, phrase)) {
return true;
}
}
return false;
}
public static boolean hasImagesOnPage(PDPage page) throws IOException {
return getAllImages(page.getResources()).size() > 0;
}
public static boolean hasTextOnPage(PDPage page, String phrase) throws IOException {
PDFTextStripper textStripper = new PDFTextStripper();
PDDocument tempDoc = new PDDocument();
tempDoc.addPage(page);
String pageText = textStripper.getText(tempDoc);
tempDoc.close();
return pageText.contains(phrase);
}
public boolean containsTextInFile(PDDocument pdfDocument, String text, String pagesToCheck)
throws IOException {
PDFTextStripper textStripper = new PDFTextStripper();
String pdfText = "";
if (pagesToCheck == null || "all".equals(pagesToCheck)) {
pdfText = textStripper.getText(pdfDocument);
} else {
// remove whitespaces
pagesToCheck = pagesToCheck.replaceAll("\\s+", "");
String[] splitPoints = pagesToCheck.split(",");
for (String splitPoint : splitPoints) {
if (splitPoint.contains("-")) {
// Handle page ranges
String[] range = splitPoint.split("-");
int startPage = Integer.parseInt(range[0]);
int endPage = Integer.parseInt(range[1]);
for (int i = startPage; i <= endPage; i++) {
textStripper.setStartPage(i);
textStripper.setEndPage(i);
pdfText += textStripper.getText(pdfDocument);
}
} else {
// Handle individual page
int page = Integer.parseInt(splitPoint);
textStripper.setStartPage(page);
textStripper.setEndPage(page);
pdfText += textStripper.getText(pdfDocument);
}
}
}
pdfDocument.close();
return pdfText.contains(text);
}
public boolean pageCount(PDDocument pdfDocument, int pageCount, String comparator)
throws IOException {
int actualPageCount = pdfDocument.getNumberOfPages();
pdfDocument.close();
switch (comparator.toLowerCase()) {
case "greater":
return actualPageCount > pageCount;
case "equal":
return actualPageCount == pageCount;
case "less":
return actualPageCount < pageCount;
default:
throw new IllegalArgumentException(
"Invalid comparator. Only 'greater', 'equal', and 'less' are supported.");
}
}
public boolean pageSize(PDDocument pdfDocument, String expectedPageSize) throws IOException {
PDPage firstPage = pdfDocument.getPage(0);
PDRectangle mediaBox = firstPage.getMediaBox();
float actualPageWidth = mediaBox.getWidth();
float actualPageHeight = mediaBox.getHeight();
pdfDocument.close();
// Assumes the expectedPageSize is in the format "widthxheight", e.g. "595x842" for A4
String[] dimensions = expectedPageSize.split("x");
float expectedPageWidth = Float.parseFloat(dimensions[0]);
float expectedPageHeight = Float.parseFloat(dimensions[1]);
// Checks if the actual page size matches the expected page size
return actualPageWidth == expectedPageWidth && actualPageHeight == expectedPageHeight;
}
public static byte[] convertFromPdf(
byte[] inputStream,
String imageType,
ImageType colorType,
boolean singleImage,
int DPI,
String filename)
throws IOException, Exception {
try (PDDocument document = Loader.loadPDF(inputStream)) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
pdfRenderer.setSubsamplingAllowed(true);
int pageCount = document.getNumberOfPages();
// Create a ByteArrayOutputStream to save the image(s) to
ByteArrayOutputStream baos = new ByteArrayOutputStream();
if (singleImage) {
if ("tiff".equals(imageType.toLowerCase())
|| "tif".equals(imageType.toLowerCase())) {
// Write the images to the output stream as a TIFF with multiple frames
ImageWriter writer = ImageIO.getImageWritersByFormatName("tiff").next();
ImageWriteParam param = writer.getDefaultWriteParam();
param.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
param.setCompressionType("ZLib");
param.setCompressionQuality(1.0f);
try (ImageOutputStream ios = ImageIO.createImageOutputStream(baos)) {
writer.setOutput(ios);
writer.prepareWriteSequence(null);
for (int i = 0; i < pageCount; ++i) {
BufferedImage image = pdfRenderer.renderImageWithDPI(i, DPI, colorType);
writer.writeToSequence(new IIOImage(image, null, null), param);
}
writer.endWriteSequence();
}
writer.dispose();
} else {
// Combine all images into a single big image
// Calculate the combined image dimensions
int maxWidth = 0;
int totalHeight = 0;
BufferedImage pdfSizeImage = null;
int pdfSizeImageIndex = -1;
// Using a map to store the rendered dimensions of each page size
// to avoid rendering the same page sizes multiple times
HashMap<PdfRenderSettingsKey, PdfImageDimensionValue> pageSizes =
new HashMap<>();
for (int i = 0; i < pageCount; ++i) {
PDPage page = document.getPage(i);
PDRectangle mediaBox = page.getMediaBox();
int rotation = page.getRotation();
PdfRenderSettingsKey settings =
new PdfRenderSettingsKey(
mediaBox.getWidth(), mediaBox.getHeight(), rotation);
PdfImageDimensionValue dimension = pageSizes.get(settings);
if (dimension == null) {
// Render the image to get the dimensions
pdfSizeImage = pdfRenderer.renderImageWithDPI(i, DPI, colorType);
pdfSizeImageIndex = i;
dimension =
new PdfImageDimensionValue(
pdfSizeImage.getWidth(), pdfSizeImage.getHeight());
pageSizes.put(settings, dimension);
if (pdfSizeImage.getWidth() > maxWidth) {
maxWidth = pdfSizeImage.getWidth();
}
}
totalHeight += dimension.height();
}
// Create a new BufferedImage to store the combined images
BufferedImage combined =
prepareImageForPdfToImage(maxWidth, totalHeight, imageType);
Graphics g = combined.getGraphics();
int currentHeight = 0;
BufferedImage pageImage;
// Check if the first image is the last rendered image
boolean firstImageAlreadyRendered = pdfSizeImageIndex == 0;
for (int i = 0; i < pageCount; ++i) {
if (firstImageAlreadyRendered && i == 0) {
pageImage = pdfSizeImage;
} else {
pageImage = pdfRenderer.renderImageWithDPI(i, DPI, colorType);
}
// Calculate the x-coordinate to center the image
int x = (maxWidth - pageImage.getWidth()) / 2;
g.drawImage(pageImage, x, currentHeight, null);
currentHeight += pageImage.getHeight();
}
// Write the image to the output stream
ImageIO.write(combined, imageType, baos);
}
// Log that the image was successfully written to the byte array
logger.info("Image successfully written to byte array");
} else {
// Zip the images and return as byte array
try (ZipOutputStream zos = new ZipOutputStream(baos)) {
for (int i = 0; i < pageCount; ++i) {
BufferedImage image = pdfRenderer.renderImageWithDPI(i, DPI, colorType);
try (ByteArrayOutputStream baosImage = new ByteArrayOutputStream()) {
ImageIO.write(image, imageType, baosImage);
// Add the image to the zip file
zos.putNextEntry(
new ZipEntry(
String.format(
filename + "_%d.%s",
i + 1,
imageType.toLowerCase())));
zos.write(baosImage.toByteArray());
}
}
// Log that the images were successfully written to the byte array
logger.info("Images successfully written to byte array as a zip");
}
}
return baos.toByteArray();
} catch (IOException e) {
// Log an error message if there is an issue converting the PDF to an image
logger.error("Error converting PDF to image", e);
throw e;
}
}
private static BufferedImage prepareImageForPdfToImage(
int maxWidth, int height, String imageType) {
BufferedImage combined;
if ("png".equalsIgnoreCase(imageType)) {
combined = new BufferedImage(maxWidth, height, BufferedImage.TYPE_INT_ARGB);
} else {
combined = new BufferedImage(maxWidth, height, BufferedImage.TYPE_INT_RGB);
}
if (!"png".equalsIgnoreCase(imageType)) {
Graphics g = combined.getGraphics();
g.setColor(Color.WHITE);
g.fillRect(0, 0, combined.getWidth(), combined.getHeight());
g.dispose();
}
return combined;
}
public static byte[] imageToPdf(
MultipartFile[] files, String fitOption, boolean autoRotate, String colorType)
throws IOException {
try (PDDocument doc = new PDDocument()) {
for (MultipartFile file : files) {
String contentType = file.getContentType();
String originalFilename = Filenames.toSimpleFileName(file.getOriginalFilename());
if (originalFilename != null
&& (originalFilename.toLowerCase().endsWith(".tiff")
|| originalFilename.toLowerCase().endsWith(".tif"))) {
ImageReader reader = ImageIO.getImageReadersByFormatName("tiff").next();
reader.setInput(ImageIO.createImageInputStream(file.getInputStream()));
int numPages = reader.getNumImages(true);
for (int i = 0; i < numPages; i++) {
BufferedImage pageImage = reader.read(i);
BufferedImage convertedImage =
ImageProcessingUtils.convertColorType(pageImage, colorType);
PDImageXObject pdImage =
LosslessFactory.createFromImage(doc, convertedImage);
addImageToDocument(doc, pdImage, fitOption, autoRotate);
}
} else {
BufferedImage image = ImageIO.read(file.getInputStream());
BufferedImage convertedImage =
ImageProcessingUtils.convertColorType(image, colorType);
// Use JPEGFactory if it's JPEG since JPEG is lossy
PDImageXObject pdImage =
(contentType != null && "image/jpeg".equals(contentType))
? JPEGFactory.createFromImage(doc, convertedImage)
: LosslessFactory.createFromImage(doc, convertedImage);
addImageToDocument(doc, pdImage, fitOption, autoRotate);
}
}
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
doc.save(byteArrayOutputStream);
logger.info("PDF successfully saved to byte array");
return byteArrayOutputStream.toByteArray();
}
}
public static void addImageToDocument(
PDDocument doc, PDImageXObject image, String fitOption, boolean autoRotate)
throws IOException {
boolean imageIsLandscape = image.getWidth() > image.getHeight();
PDRectangle pageSize = PDRectangle.A4;
if (autoRotate && imageIsLandscape) {
pageSize = new PDRectangle(pageSize.getHeight(), pageSize.getWidth());
}
if ("fitDocumentToImage".equals(fitOption)) {
pageSize = new PDRectangle(image.getWidth(), image.getHeight());
}
PDPage page = new PDPage(pageSize);
doc.addPage(page);
float pageWidth = page.getMediaBox().getWidth();
float pageHeight = page.getMediaBox().getHeight();
try (PDPageContentStream contentStream =
new PDPageContentStream(doc, page, AppendMode.APPEND, true, true)) {
if ("fillPage".equals(fitOption) || "fitDocumentToImage".equals(fitOption)) {
contentStream.drawImage(image, 0, 0, pageWidth, pageHeight);
} else if ("maintainAspectRatio".equals(fitOption)) {
float imageAspectRatio = (float) image.getWidth() / (float) image.getHeight();
float pageAspectRatio = pageWidth / pageHeight;
float scaleFactor = 1.0f;
if (imageAspectRatio > pageAspectRatio) {
scaleFactor = pageWidth / image.getWidth();
} else {
scaleFactor = pageHeight / image.getHeight();
}
float xPos = (pageWidth - (image.getWidth() * scaleFactor)) / 2;
float yPos = (pageHeight - (image.getHeight() * scaleFactor)) / 2;
contentStream.drawImage(
image,
xPos,
yPos,
image.getWidth() * scaleFactor,
image.getHeight() * scaleFactor);
}
} catch (IOException e) {
logger.error("Error adding image to PDF", e);
throw e;
}
}
public static byte[] overlayImage(
byte[] pdfBytes, byte[] imageBytes, float x, float y, boolean everyPage)
throws IOException {
PDDocument document = Loader.loadPDF(pdfBytes);
// Get the first page of the PDF
int pages = document.getNumberOfPages();
for (int i = 0; i < pages; i++) {
PDPage page = document.getPage(i);
try (PDPageContentStream contentStream =
new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
// Create an image object from the image bytes
PDImageXObject image = PDImageXObject.createFromByteArray(document, imageBytes, "");
// Draw the image onto the page at the specified x and y coordinates
contentStream.drawImage(image, x, y);
logger.info("Image successfully overlayed onto PDF");
if (!everyPage && i == 0) {
break;
}
} catch (IOException e) {
// Log an error message if there is an issue overlaying the image onto the PDF
logger.error("Error overlaying image onto PDF", e);
throw e;
}
}
// Create a ByteArrayOutputStream to save the PDF to
ByteArrayOutputStream baos = new ByteArrayOutputStream();
document.save(baos);
logger.info("PDF successfully saved to byte array");
return baos.toByteArray();
}
public static PdfMetadata extractMetadataFromPdf(PDDocument pdf) {
return PdfMetadata.builder()
.author(pdf.getDocumentInformation().getAuthor())
.producer(pdf.getDocumentInformation().getProducer())
.title(pdf.getDocumentInformation().getTitle())
.creator(pdf.getDocumentInformation().getCreator())
.subject(pdf.getDocumentInformation().getSubject())
.keywords(pdf.getDocumentInformation().getKeywords())
.creationDate(pdf.getDocumentInformation().getCreationDate())
.modificationDate(pdf.getDocumentInformation().getModificationDate())
.build();
}
public static void setMetadataToPdf(PDDocument pdf, PdfMetadata pdfMetadata) {
pdf.getDocumentInformation().setAuthor(pdfMetadata.getAuthor());
pdf.getDocumentInformation().setProducer(pdfMetadata.getProducer());
pdf.getDocumentInformation().setTitle(pdfMetadata.getTitle());
pdf.getDocumentInformation().setCreator(pdfMetadata.getCreator());
pdf.getDocumentInformation().setSubject(pdfMetadata.getSubject());
pdf.getDocumentInformation().setKeywords(pdfMetadata.getKeywords());
pdf.getDocumentInformation().setCreationDate(pdfMetadata.getCreationDate());
pdf.getDocumentInformation().setModificationDate(Calendar.getInstance());
}
/** Key for storing the dimensions of a rendered image in a map. */
private record PdfRenderSettingsKey(float mediaBoxWidth, float mediaBoxHeight, int rotation) {}
/** Value for storing the dimensions of a rendered image in a map. */
private record PdfImageDimensionValue(int width, int height) {}
}