From 4e8d8e3d53096a455e54b7a28d7684e234dee0d2 Mon Sep 17 00:00:00 2001 From: Felix Kaspar Date: Thu, 26 Oct 2023 19:56:23 +0200 Subject: [PATCH] Started working on splitOn empty/qr-/barcode --- public/functions/extractPages.js | 21 +----- public/functions/removeBlankPages.js | 69 +------------------- public/functions/shared/createSubDocument.js | 16 +++++ public/functions/shared/detectEmptyPages.js | 64 ++++++++++++++++++ public/functions/splitOn.js | 48 ++++++++++++++ public/functions/splitPDF.js | 2 +- 6 files changed, 135 insertions(+), 85 deletions(-) create mode 100644 public/functions/shared/createSubDocument.js create mode 100644 public/functions/shared/detectEmptyPages.js create mode 100644 public/functions/splitOn.js diff --git a/public/functions/extractPages.js b/public/functions/extractPages.js index 20dee1503..dd2cd7c02 100644 --- a/public/functions/extractPages.js +++ b/public/functions/extractPages.js @@ -1,23 +1,8 @@ +import { createSubDocument } from "./shared/createSubDocument.js"; + export async function extractPages(snapshot, pagesToExtractArray, PDFLib) { const pdfDoc = await PDFLib.PDFDocument.load(snapshot) // TODO: invent a better format for pagesToExtractArray and convert it. return createSubDocument(pdfDoc, pagesToExtractArray, PDFLib); -}; - -export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) { - const subDocument = await PDFLib.PDFDocument.create(); - - // Check that array max number is not larger pdf pages number - if(Math.max(...pagesToExtractArray) >= pdfDoc.getPageCount()) { - throw new Error(`The PDF document only has ${pdfDoc.getPageCount()} pages and you tried to extract page ${Math.max(...pagesToExtractArray)}`); - } - - const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray); - - for (let i = 0; i < copiedPages.length; i++) { - subDocument.addPage(copiedPages[i]); - } - - return subDocument.save(); -} \ No newline at end of file +}; \ No newline at end of file diff --git a/public/functions/removeBlankPages.js b/public/functions/removeBlankPages.js index 7e9fb939b..4c3e9a3ff 100644 --- a/public/functions/removeBlankPages.js +++ b/public/functions/removeBlankPages.js @@ -1,6 +1,8 @@ +import { detectEmptyPages } from "./shared/detectEmptyPages.js"; + export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) { - const emptyPages = await findEmptyPages(snapshot); + const emptyPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV); console.log("Empty Pages: ", emptyPages); @@ -12,69 +14,4 @@ export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, }) return pdfDoc.save(); - - async function findEmptyPages(snapshot) { - const pdfDoc = await PDFJS.getDocument(snapshot).promise; - - const emptyPages = []; - for (let i = 1; i <= pdfDoc.numPages; i++) { - const page = await pdfDoc.getPage(i); - console.log("Checking page " + i); - - if(!await hasText(page)) { - console.log(`Found text on Page ${i}, page is not empty`); - continue; - } - - if(!await areImagesBlank(page, whiteThreashold)) { - console.log(`Found non white image on Page ${i}, page is not empty`); - continue; - } - - console.log(`Page ${i} is empty.`); - emptyPages.push(i - 1); - } - return emptyPages; - } - - async function areImagesBlank(page, whiteThreashold) { - const ops = await page.getOperatorList(); - - for (var j=0; j < ops.fnArray.length; j++) { - if (ops.fnArray[j] == PDFJS.OPS.paintJpegXObject || ops.fnArray[j] == PDFJS.OPS.paintImageXObject) { - const image = page.objs.get(ops.argsArray[j][0]); - if(image.data) { - return isImageBlank(image, whiteThreashold); - } - } - } - return true; - } - - async function hasText(page) { - const textContent = await page.getTextContent(); - return textContent.items.length === 0; - } - - async function isImageBlank(image, threshold) { - const src = new OpenCV.cv.Mat(image.width, image.height, OpenCV.cv.CV_8UC4); - src.data.set(image.data); - // Convert the image to grayscale - const gray = new OpenCV.cv.Mat(); - OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY); - - // Calculate the mean value of the grayscale image - const meanValue = OpenCV.cv.mean(gray); - - // Free memory - src.delete(); - gray.delete(); - - // Check if the mean value is below the threshold - if (meanValue[0] <= threshold) { - return true; - } else { - return false; - } - } }; \ No newline at end of file diff --git a/public/functions/shared/createSubDocument.js b/public/functions/shared/createSubDocument.js new file mode 100644 index 000000000..c0ff96202 --- /dev/null +++ b/public/functions/shared/createSubDocument.js @@ -0,0 +1,16 @@ +export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) { + const subDocument = await PDFLib.PDFDocument.create(); + + // Check that array max number is not larger pdf pages number + if(Math.max(...pagesToExtractArray) >= pdfDoc.getPageCount()) { + throw new Error(`The PDF document only has ${pdfDoc.getPageCount()} pages and you tried to extract page ${Math.max(...pagesToExtractArray)}`); + } + + const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray); + + for (let i = 0; i < copiedPages.length; i++) { + subDocument.addPage(copiedPages[i]); + } + + return subDocument.save(); +} \ No newline at end of file diff --git a/public/functions/shared/detectEmptyPages.js b/public/functions/shared/detectEmptyPages.js new file mode 100644 index 000000000..98d023b9f --- /dev/null +++ b/public/functions/shared/detectEmptyPages.js @@ -0,0 +1,64 @@ +export async function detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV) { + const pdfDoc = await PDFJS.getDocument(snapshot).promise; + + const emptyPages = []; + for (let i = 1; i <= pdfDoc.numPages; i++) { + const page = await pdfDoc.getPage(i); + console.log("Checking page " + i); + + if(!await hasText(page)) { + console.log(`Found text on Page ${i}, page is not empty`); + continue; + } + + if(!await areImagesBlank(page, whiteThreashold)) { + console.log(`Found non white image on Page ${i}, page is not empty`); + continue; + } + + console.log(`Page ${i} is empty.`); + emptyPages.push(i - 1); + } + return emptyPages; + + async function hasText(page) { + const textContent = await page.getTextContent(); + return textContent.items.length === 0; + } + + async function areImagesBlank(page, threshold) { + const ops = await page.getOperatorList(); + + for (var j=0; j < ops.fnArray.length; j++) { + if (ops.fnArray[j] == PDFJS.OPS.paintJpegXObject || ops.fnArray[j] == PDFJS.OPS.paintImageXObject) { + const image = page.objs.get(ops.argsArray[j][0]); + if(image.data) { + return isImageBlank(image, threshold); + } + } + } + return true; + } + + async function isImageBlank(image, threshold) { + const src = new OpenCV.cv.Mat(image.width, image.height, OpenCV.cv.CV_8UC4); + src.data.set(image.data); + // Convert the image to grayscale + const gray = new OpenCV.cv.Mat(); + OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY); + + // Calculate the mean value of the grayscale image + const meanValue = OpenCV.cv.mean(gray); + + // Free memory + src.delete(); + gray.delete(); + + // Check if the mean value is below the threshold + if (meanValue[0] <= threshold) { + return true; + } else { + return false; + } + } +} \ No newline at end of file diff --git a/public/functions/splitOn.js b/public/functions/splitOn.js new file mode 100644 index 000000000..9c98f2afe --- /dev/null +++ b/public/functions/splitOn.js @@ -0,0 +1,48 @@ +import { detectEmptyPages } from "./shared/detectEmptyPages"; + +/** + * @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType + */ + +/** + * + * @param {Uint16Array} snapshot + * @param {SplitType} type + * @param {} PDFJS + * @param {} OpenCV + * @param {} PDFLib + * @param {} QRCode + * @returns + */ +export async function splitOn(snapshot, type, whiteThreashold, PDFJS, OpenCV, PDFLib, QRCode) { + + let splitAtPages = []; + + switch (type) { + case "BAR_CODE": + // TODO: Implement + throw new Error("This split-type has not been implemented yet") + break; + + case "QR_CODE": + // TODO: Implement + throw new Error("This split-type has not been implemented yet") + break; + + case "BLANK_PAGE": + splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV); + break; + + default: + throw new Error("An invalid split-type was provided.") + break; + } + + console.log("Split At Pages: ", splitAtPages); + + const pdfDoc = await PDFLib.PDFDocument.load(snapshot); + + // TODO: Remove detected Pages & Split + + return pdfDoc.save(); +}; \ No newline at end of file diff --git a/public/functions/splitPDF.js b/public/functions/splitPDF.js index dac378ce1..55b6b5cfd 100644 --- a/public/functions/splitPDF.js +++ b/public/functions/splitPDF.js @@ -1,4 +1,4 @@ -import { createSubDocument } from "./extractPages.js"; +import { createSubDocument } from "./shared/createSubDocument.js"; export async function splitPDF(snapshot, splitAfterPageArray, PDFLib) { const pdfDoc = await PDFLib.PDFDocument.load(snapshot)