Detect-Empty-Pages node (PDFJS, OpenCV)

This commit is contained in:
Felix Kaspar 2023-10-24 16:09:10 +02:00
parent 0320d76f05
commit 1588ae4948
6 changed files with 203 additions and 1 deletions

View File

@ -1,4 +1,6 @@
import PDFLib from 'pdf-lib';
import OpenCV from 'opencv-wasm';
import PDFJS from "pdfjs-dist";
import * as pdfcpuWraopper from "./public/wasm/pdfcpu-wrapper-node.js";
import { extractPages as dependantExtractPages } from "./public/functions/extractPages.js";
@ -10,6 +12,7 @@ import { scalePage as dependantScalePage } from './public/functions/scalePage.js
import { splitPDF as dependantSplitPDF } from './public/functions/splitPDF.js';
import { editMetadata as dependantEditMetadata } from './public/functions/editMetadata.js';
import { organizePages as dependantOrganizePages } from './public/functions/organizePages.js';
import { removeBlankPages as dependantRemoveBlankPages} from './public/functions/removeBlankPages.js';
export async function extractPages(snapshot, pagesToExtractArray) {
return dependantExtractPages(snapshot, pagesToExtractArray, PDFLib);
@ -45,4 +48,8 @@ export async function editMetadata(snapshot, metadata) {
export async function organizePages(snapshot, operation, customOrderString) {
return dependantOrganizePages(snapshot, operation, customOrderString, PDFLib);
}
export async function removeBlankPages(snapshot, whiteThreashold) {
return dependantRemoveBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV);
}

114
package-lock.json generated
View File

@ -39,6 +39,22 @@
"negotiator": "0.6.3"
}
},
"ajv": {
"version": "6.12.6",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
"integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
"requires": {
"fast-deep-equal": "^3.1.1",
"fast-json-stable-stringify": "^2.0.0",
"json-schema-traverse": "^0.4.1",
"uri-js": "^4.2.2"
}
},
"ajv-keywords": {
"version": "3.5.2",
"resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.5.2.tgz",
"integrity": "sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ=="
},
"archiver": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/archiver/-/archiver-6.0.1.tgz",
@ -103,6 +119,11 @@
"resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
"integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA=="
},
"big.js": {
"version": "5.2.2",
"resolved": "https://registry.npmjs.org/big.js/-/big.js-5.2.2.tgz",
"integrity": "sha512-vyL2OymJxmarO8gxMr0mhChsO9QGwhynfuu4+MHTAW6czfq9humCB7rKpUjDd9YUiDPU4mzpyupFSvOClAwbmQ=="
},
"bl": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz",
@ -252,6 +273,11 @@
"resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
"integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow=="
},
"emojis-list": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/emojis-list/-/emojis-list-3.0.0.tgz",
"integrity": "sha512-/kyM18EfinwXZbno9FyUGeFh87KC8HRQBQGildHZbEuRyWFOmv1U10o9BBp8XVZDVNNuQKyIGIu5ZYAAXJ0V2Q=="
},
"encodeurl": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
@ -321,6 +347,11 @@
"busboy": "^1.6.0"
}
},
"fast-deep-equal": {
"version": "3.1.3",
"resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
"integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="
},
"fast-extend": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/fast-extend/-/fast-extend-1.0.2.tgz",
@ -331,6 +362,11 @@
"resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
"integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ=="
},
"fast-json-stable-stringify": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz",
"integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw=="
},
"finalhandler": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.2.0.tgz",
@ -467,6 +503,19 @@
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
"integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ=="
},
"json-schema-traverse": {
"version": "0.4.1",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
"integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg=="
},
"json5": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/json5/-/json5-1.0.2.tgz",
"integrity": "sha512-g1MWMLBiz8FKi1e4w0UyVL3w+iJceWAFBAaBnnGKOpNa5f8TLktkbre1+s6oICydWAm+HRUGTmI+//xv2hvXYA==",
"requires": {
"minimist": "^1.2.0"
}
},
"lazystream": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/lazystream/-/lazystream-1.0.1.tgz",
@ -504,6 +553,16 @@
}
}
},
"loader-utils": {
"version": "1.4.2",
"resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-1.4.2.tgz",
"integrity": "sha512-I5d00Pd/jwMD2QCduo657+YM/6L3KZu++pmX9VFncxaxvHcru9jx1lBaFft+r4Mt2jK0Yhp41XlRAihzPxHNCg==",
"requires": {
"big.js": "^5.2.2",
"emojis-list": "^3.0.0",
"json5": "^1.0.1"
}
},
"lodash": {
"version": "4.17.21",
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
@ -559,6 +618,11 @@
"brace-expansion": "^2.0.1"
}
},
"minimist": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
"integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA=="
},
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
@ -569,6 +633,11 @@
"resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz",
"integrity": "sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg=="
},
"node-ensure": {
"version": "0.0.0",
"resolved": "https://registry.npmjs.org/node-ensure/-/node-ensure-0.0.0.tgz",
"integrity": "sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw=="
},
"normalize-path": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz",
@ -595,6 +664,11 @@
"wrappy": "1"
}
},
"opencv-wasm": {
"version": "4.3.0-10",
"resolved": "https://registry.npmjs.org/opencv-wasm/-/opencv-wasm-4.3.0-10.tgz",
"integrity": "sha512-EWmWLUzp2suoc6N44Y4ouWT85QwvShx23Q430R+lp6NyS828bjQn6mCgA3NJ6Z/S59aaTeeu+RhqPQIJIYld1w=="
},
"pako": {
"version": "1.0.11",
"resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz",
@ -621,6 +695,15 @@
"tslib": "^1.11.1"
}
},
"pdfjs-dist": {
"version": "2.0.943",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-2.0.943.tgz",
"integrity": "sha512-iLhNcm4XceTHRaSU5o22ZGCm4YpuW5+rf4+BJFH/feBhMQLbCGBry+Jet8Q419QDI4qgARaIQzXuiNrsNWS8Yw==",
"requires": {
"node-ensure": "^0.0.0",
"worker-loader": "^2.0.0"
}
},
"process-nextick-args": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz",
@ -635,6 +718,11 @@
"ipaddr.js": "1.9.1"
}
},
"punycode": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.0.tgz",
"integrity": "sha512-rRV+zQD8tVFys26lAGR9WUuS4iUAngJScM+ZRSKtvl5tKeZ2t5bvdNFdNHBW9FWR4guGHlgmsZ1G7BSm2wTbuA=="
},
"qs": {
"version": "6.11.0",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz",
@ -692,6 +780,15 @@
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
},
"schema-utils": {
"version": "0.4.7",
"resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-0.4.7.tgz",
"integrity": "sha512-v/iwU6wvwGK8HbU9yi3/nhGzP0yGSuhQMzL6ySiec1FSrZZDkhm4noOSWzrNFo/jEc+SJY6jRTwuwbSXJPDUnQ==",
"requires": {
"ajv": "^6.1.0",
"ajv-keywords": "^3.1.0"
}
},
"send": {
"version": "0.18.0",
"resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz",
@ -808,6 +905,14 @@
"resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
"integrity": "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ=="
},
"uri-js": {
"version": "4.4.1",
"resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz",
"integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==",
"requires": {
"punycode": "^2.1.0"
}
},
"util-deprecate": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
@ -823,6 +928,15 @@
"resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz",
"integrity": "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg=="
},
"worker-loader": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/worker-loader/-/worker-loader-2.0.0.tgz",
"integrity": "sha512-tnvNp4K3KQOpfRnD20m8xltE3eWh89Ye+5oj7wXEEHKac1P4oZ6p9oTj8/8ExqoSBnk9nu5Pr4nKfQ1hn2APJw==",
"requires": {
"loader-utils": "^1.0.0",
"schema-utils": "^0.4.0"
}
},
"wrappy": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",

View File

@ -13,7 +13,9 @@
"archiver": "^6.0.1",
"express": "^4.18.2",
"express-fileupload": "^1.4.1",
"pdf-lib": "^1.17.1"
"opencv-wasm": "^4.3.0-10",
"pdf-lib": "^1.17.1",
"pdfjs-dist": "^2.0.943"
},
"type": "module"
}

View File

@ -1,4 +1,5 @@
// PDFLib gets importet via index.html script-tag
// TODO: OpenCV
import * as pdfcpuWraopper from "./wasm/pdfcpu-wrapper-browser.js";
import { extractPages as dependantExtractPages } from "./functions/extractPages.js";
@ -10,6 +11,7 @@ import { scalePage as dependantScalePage } from './functions/scalePage.js';
import { splitPDF as dependantSplitPDF } from './functions/splitPDF.js';
import { editMetadata as dependantEditMetadata} from "./functions/editMetadata.js";
import { organizePages as dependantOrganizePages} from "./functions/organizePages.js";
import { removeBlankPages as dependantRemoveBlankPages} from "./functions/removeBlankPages.js";
export async function extractPages(snapshot, pagesToExtractArray) {
return dependantExtractPages(snapshot, pagesToExtractArray, PDFLib);
@ -45,4 +47,8 @@ export async function editMetadata(snapshot, metadata) {
export async function organizePages(snapshot, operation, customOrderString) {
return dependantOrganizePages(snapshot, operation, customOrderString, PDFLib);
}
export async function removeBlankPages(snapshot, whiteThreashold) {
return dependantRemoveBlankPages(snapshot, whiteThreashold, PDFLib, OpenCV);
}

View File

@ -0,0 +1,67 @@
export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV) {
const pdfDoc = await PDFJS.getDocument(snapshot).promise;
const emptyPages = [];
for (let i = 1; i <= pdfDoc.numPages; i++) {
const page = await pdfDoc.getPage(i);
if(!await hasText(page)) {
console.log("Found text on Page, page is not empty");
continue;
}
if(!await areImagesBlank(page, whiteThreashold)) {
console.log("Found image on Page, page is not empty");
continue;
}
emptyPages.push[i];
}
console.log(emptyPages);
// TODO: Remove emptyPages using pdflib
// return pdf;
async function areImagesBlank(page, whiteThreashold) {
const ops = await page.getOperatorList();
for (var j=0; j < ops.fnArray.length; j++) {
if (ops.fnArray[j] == PDFJS.OPS.paintJpegXObject || ops.fnArray[j] == PDFJS.OPS.paintImageXObject) {
const image = page.objs.get(ops.argsArray[j][0]);
if(image.data) {
return isImageBlank(image, whiteThreashold);
}
}
}
return true;
}
async function hasText(page) {
const textContent = await page.getTextContent();
return textContent.items.length === 0;
}
async function isImageBlank(image, threshold) {
const src = new OpenCV.cv.Mat(image.width, image.height, OpenCV.cv.CV_8UC4);
src.data.set(image.data);
// Convert the image to grayscale
const gray = new OpenCV.cv.Mat();
OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);
// Calculate the mean value of the grayscale image
const meanValue = OpenCV.cv.mean(gray);
// Free memory
src.delete();
gray.delete();
// Check if the mean value is below the threshold
if (meanValue[0] <= threshold) {
return true;
} else {
return false;
}
}
};

View File

@ -109,6 +109,12 @@ export async function * traverseOperations(operations, input, Functions) {
input.buffer = await Functions.organizePages(input.buffer, operation.values["operation"], operation.values["customOrderString"]);
});
break;
case "removeBlankPages":
yield* nToN(input, operation, async (input) => {
input.fileName += "_removedBlanks";
input.buffer = await Functions.removeBlankPages(input.buffer, operation.values["whiteThreashold"]);
});
break;
default:
throw new Error(`${operation.type} not implemented yet.`);
break;