From 4e8d8e3d53096a455e54b7a28d7684e234dee0d2 Mon Sep 17 00:00:00 2001
From: Felix Kaspar <ich@felixkaspar.com>
Date: Thu, 26 Oct 2023 19:56:23 +0200
Subject: [PATCH] Started working on splitOn empty/qr-/barcode

---
 public/functions/extractPages.js             | 21 +-----
 public/functions/removeBlankPages.js         | 69 +-------------------
 public/functions/shared/createSubDocument.js | 16 +++++
 public/functions/shared/detectEmptyPages.js  | 64 ++++++++++++++++++
 public/functions/splitOn.js                  | 48 ++++++++++++++
 public/functions/splitPDF.js                 |  2 +-
 6 files changed, 135 insertions(+), 85 deletions(-)
 create mode 100644 public/functions/shared/createSubDocument.js
 create mode 100644 public/functions/shared/detectEmptyPages.js
 create mode 100644 public/functions/splitOn.js

diff --git a/public/functions/extractPages.js b/public/functions/extractPages.js
index 20dee1503..dd2cd7c02 100644
--- a/public/functions/extractPages.js
+++ b/public/functions/extractPages.js
@@ -1,23 +1,8 @@
+import { createSubDocument } from "./shared/createSubDocument.js";
+
 export async function extractPages(snapshot, pagesToExtractArray, PDFLib) {
     const pdfDoc = await PDFLib.PDFDocument.load(snapshot)
 
     // TODO: invent a better format for pagesToExtractArray and convert it.
     return createSubDocument(pdfDoc, pagesToExtractArray, PDFLib);
-};
-
-export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
-    const subDocument = await PDFLib.PDFDocument.create();
-
-    // Check that array max number is not larger pdf pages number
-    if(Math.max(...pagesToExtractArray) >= pdfDoc.getPageCount()) {
-        throw new Error(`The PDF document only has ${pdfDoc.getPageCount()} pages and you tried to extract page ${Math.max(...pagesToExtractArray)}`);
-    }
-
-    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
-
-    for (let i = 0; i < copiedPages.length; i++) {
-        subDocument.addPage(copiedPages[i]);
-    }
-
-    return subDocument.save();
-}
\ No newline at end of file
+};
\ No newline at end of file
diff --git a/public/functions/removeBlankPages.js b/public/functions/removeBlankPages.js
index 7e9fb939b..4c3e9a3ff 100644
--- a/public/functions/removeBlankPages.js
+++ b/public/functions/removeBlankPages.js
@@ -1,6 +1,8 @@
+import { detectEmptyPages } from "./shared/detectEmptyPages.js";
+
 export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) {
     
-    const emptyPages = await findEmptyPages(snapshot);
+    const emptyPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
 
     console.log("Empty Pages: ", emptyPages);
 
@@ -12,69 +14,4 @@ export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV,
     })
 
     return pdfDoc.save();
-
-    async function findEmptyPages(snapshot) {
-        const pdfDoc = await PDFJS.getDocument(snapshot).promise;
-
-        const emptyPages = [];
-        for (let i = 1; i <= pdfDoc.numPages; i++) {
-            const page = await pdfDoc.getPage(i);
-            console.log("Checking page " + i);
-    
-            if(!await hasText(page)) {
-                console.log(`Found text on Page ${i}, page is not empty`);
-                continue;
-            }
-    
-            if(!await areImagesBlank(page, whiteThreashold)) {
-                console.log(`Found non white image on Page ${i}, page is not empty`);
-                continue;
-            }
-    
-            console.log(`Page ${i} is empty.`);
-            emptyPages.push(i - 1);
-        }
-        return emptyPages;
-    }
-
-    async function areImagesBlank(page, whiteThreashold) {
-        const ops = await page.getOperatorList();
-    
-        for (var j=0; j < ops.fnArray.length; j++) {
-            if (ops.fnArray[j] == PDFJS.OPS.paintJpegXObject || ops.fnArray[j] == PDFJS.OPS.paintImageXObject) {
-                const image = page.objs.get(ops.argsArray[j][0]);
-                if(image.data) {
-                    return isImageBlank(image, whiteThreashold);
-                }
-            }
-        }
-        return true;
-    }
-    
-    async function hasText(page) {
-        const textContent = await page.getTextContent();
-        return textContent.items.length === 0;
-    }
-    
-    async function isImageBlank(image, threshold) {
-        const src = new OpenCV.cv.Mat(image.width, image.height, OpenCV.cv.CV_8UC4);
-        src.data.set(image.data);
-        // Convert the image to grayscale
-        const gray = new OpenCV.cv.Mat();
-        OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);
-    
-        // Calculate the mean value of the grayscale image
-        const meanValue = OpenCV.cv.mean(gray);
-    
-        // Free memory
-        src.delete();
-        gray.delete();
-    
-        // Check if the mean value is below the threshold
-        if (meanValue[0] <= threshold) {
-            return true;
-        } else {
-            return false;
-        }
-    }
 };
\ No newline at end of file
diff --git a/public/functions/shared/createSubDocument.js b/public/functions/shared/createSubDocument.js
new file mode 100644
index 000000000..c0ff96202
--- /dev/null
+++ b/public/functions/shared/createSubDocument.js
@@ -0,0 +1,16 @@
+/** Builds a new PDF from the requested pages of pdfDoc and resolves to the result of save(). */
+export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
+    const subDocument = await PDFLib.PDFDocument.create();
+    // Indices are validated against the page count, so the highest accepted value is pageCount - 1.
+    const highestRequested = Math.max(...pagesToExtractArray);
+    const pageCount = pdfDoc.getPageCount();
+    if (highestRequested >= pageCount) {
+        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${highestRequested}`);
+    }
+
+    // Append every copied page in the order copyPages() returned them.
+    for (const copiedPage of await subDocument.copyPages(pdfDoc, pagesToExtractArray)) {
+        subDocument.addPage(copiedPage);
+    }
+    return subDocument.save();
+}
\ No newline at end of file
diff --git a/public/functions/shared/detectEmptyPages.js b/public/functions/shared/detectEmptyPages.js
new file mode 100644
index 000000000..98d023b9f
--- /dev/null
+++ b/public/functions/shared/detectEmptyPages.js
@@ -0,0 +1,64 @@
+/**
+ * Returns the zero-based indices of the pages that have no text and no non-blank image.
+ * @param {Uint8Array} snapshot - PDF data (presumably the raw bytes) handed to PDFJS.getDocument.
+ * @param {number} whiteThreashold - An image counts as blank when its grayscale mean is <= this value.
+ * @param {object} PDFJS - Must provide getDocument() and the OPS table.
+ * @param {object} OpenCV - Must expose the OpenCV API as `OpenCV.cv`.
+ * @returns {Promise<number[]>} Zero-based page indices in ascending order.
+ */
+export async function detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV) {
+    const pdfDoc = await PDFJS.getDocument(snapshot).promise;
+    const emptyPages = [];
+
+    for (let i = 1; i <= pdfDoc.numPages; i++) {
+        const page = await pdfDoc.getPage(i);
+        console.log("Checking page " + i);
+        if (await hasText(page)) {
+            console.log(`Found text on Page ${i}, page is not empty`);
+            continue;
+        }
+        if (!(await areImagesBlank(page, whiteThreashold))) {
+            console.log(`Found non white image on Page ${i}, page is not empty`);
+            continue;
+        }
+        console.log(`Page ${i} is empty.`);
+        emptyPages.push(i - 1);
+    }
+    return emptyPages;
+
+    // True when the page has at least one text item.
+    async function hasText(page) {
+        const textContent = await page.getTextContent();
+        return textContent.items.length > 0;
+    }
+
+    // True only if every painted image that carries pixel data is blank.
+    async function areImagesBlank(page, threshold) {
+        const ops = await page.getOperatorList();
+        for (let j = 0; j < ops.fnArray.length; j++) {
+            const fn = ops.fnArray[j];
+            if (fn === PDFJS.OPS.paintJpegXObject || fn === PDFJS.OPS.paintImageXObject) {
+                const image = page.objs.get(ops.argsArray[j][0]);
+                // Keep scanning after a blank image: a later image may still have content.
+                if (image.data && !isImageBlank(image, threshold)) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    // Compares the mean grayscale value with the threshold; assumes 4-channel RGBA data (TODO confirm).
+    function isImageBlank(image, threshold) {
+        const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4); // (rows, cols, type)
+        const gray = new OpenCV.cv.Mat();
+        try {
+            src.data.set(image.data);
+            OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);
+            return OpenCV.cv.mean(gray)[0] <= threshold;
+        } finally { // release both Mats even if OpenCV throws
+            src.delete();
+            gray.delete();
+        }
+    }
+}
\ No newline at end of file
diff --git a/public/functions/splitOn.js b/public/functions/splitOn.js
new file mode 100644
index 000000000..9c98f2afe
--- /dev/null
+++ b/public/functions/splitOn.js
@@ -0,0 +1,48 @@
+import { detectEmptyPages } from "./shared/detectEmptyPages.js";
+
+/**
+ * @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType
+ */
+
+/**
+ * Detects the pages a PDF should be split on. Work in progress: the detected pages are
+ * only logged for now and the document is re-saved without being split.
+ *
+ * @param {Uint8Array} snapshot - PDF data; passed to detectEmptyPages and PDFLib.PDFDocument.load.
+ * @param {SplitType} type - Which kind of page marks a split position.
+ * @param {number} whiteThreashold - Forwarded unchanged to detectEmptyPages (BLANK_PAGE only).
+ * @param {object} PDFJS - Forwarded to detectEmptyPages.
+ * @param {object} OpenCV - Forwarded to detectEmptyPages.
+ * @param {object} PDFLib - Must provide PDFDocument.load().
+ * @param {object} QRCode - Not used by the current implementation.
+ * @returns {Promise<Uint8Array>} Result of pdfDoc.save(); currently the unsplit document.
+ * @throws {Error} For BAR_CODE / QR_CODE (not implemented yet) and for any unknown split type.
+ */
+export async function splitOn(snapshot, type, whiteThreashold, PDFJS, OpenCV, PDFLib, QRCode) {
+    let splitAtPages = [];
+
+    switch (type) {
+        case "BAR_CODE":
+            // TODO: Implement
+            throw new Error("This split-type has not been implemented yet");
+
+        case "QR_CODE":
+            // TODO: Implement
+            throw new Error("This split-type has not been implemented yet");
+
+        case "BLANK_PAGE":
+            splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
+            break;
+
+        default:
+            throw new Error("An invalid split-type was provided.");
+    }
+
+    console.log("Split At Pages: ", splitAtPages);
+
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
+
+    // TODO: Remove detected Pages & Split
+
+    return pdfDoc.save();
+};
\ No newline at end of file
diff --git a/public/functions/splitPDF.js b/public/functions/splitPDF.js
index dac378ce1..55b6b5cfd 100644
--- a/public/functions/splitPDF.js
+++ b/public/functions/splitPDF.js
@@ -1,4 +1,4 @@
-import { createSubDocument } from "./extractPages.js";
+import { createSubDocument } from "./shared/createSubDocument.js";
 
 export async function splitPDF(snapshot, splitAfterPageArray, PDFLib) {
     const pdfDoc = await PDFLib.PDFDocument.load(snapshot)