2023-11-12 04:46:09 +03:00
2024-01-04 20:17:54 -05:00
import { PdfFile } from "../../wrappers/PdfFile" ;
2024-02-23 23:55:29 +01:00
import { PDFPageProxy } from "pdfjs-dist/types/src/display/api" ;
2024-01-04 20:17:54 -05:00
import { Image , ImageKind } from "image-js" ;
2023-11-08 03:33:22 +03:00
2023-11-16 01:56:17 +01:00
import { getImagesOnPage , PDFJSImage } from "./getImagesOnPage.js" ;
2023-11-08 03:33:22 +03:00
2023-11-12 04:46:09 +03:00
/**
 * Finds the pages of a PDF that carry neither text nor a non-blank image.
 *
 * Pages are visited in order. A page is reported as empty only when
 * `hasText` finds no text items on it and `areImagesBlank` reports every
 * image on it as blank. Progress is logged to the console for each page.
 *
 * @param file - PDF wrapper whose `pdfJsDocument` resolves to the pdf.js document.
 * @param whiteThreashold - Threshold forwarded unchanged to `areImagesBlank`.
 * @returns Zero-based indices of the pages considered empty, in ascending order.
 */
export async function detectEmptyPages(file: PdfFile, whiteThreashold: number): Promise<number[]> {
    const pdfDoc = await file.pdfJsDocument;

    const emptyPages: number[] = [];
    // `getPage` is called with page numbers 1..numPages; the result list
    // stores `i - 1`, i.e. zero-based indices.
    for (let i = 1; i <= pdfDoc.numPages; i++) {
        const page = await pdfDoc.getPage(i);
        console.log("Checking page " + i);

        // Text check first: it avoids extracting images for pages that
        // are already known not to be empty.
        if (await hasText(page)) {
            console.log(` Found text on Page ${i} , page is not empty `);
            continue;
        }

        if (!await areImagesBlank(page, whiteThreashold)) {
            console.log(` Found non white image on Page ${i} , page is not empty `);
            continue;
        }

        console.log(` Page ${i} is empty. `);
        emptyPages.push(i - 1);
    }
    return emptyPages;
}

/**
 * Reports whether pdf.js extracted at least one text item from the page.
 *
 * @param page - pdf.js page proxy to inspect.
 * @returns `true` when `getTextContent()` yields one or more items, `false` otherwise.
 */
async function hasText(page: PDFPageProxy): Promise<boolean> {
    const textContent = await page.getTextContent();
    return textContent.items.length > 0;
}
/**
 * Checks whether every image found on a page is blank.
 *
 * Images returned by `getImagesOnPage` are tested one at a time with
 * `isImageBlank`, and the scan stops at the first image that is not blank.
 * A page without images yields `true`.
 *
 * @param page - pdf.js page proxy whose images are inspected.
 * @param threshold - Threshold forwarded unchanged to `isImageBlank`.
 * @returns `false` as soon as one image is not blank, otherwise `true`.
 */
async function areImagesBlank(page: PDFPageProxy, threshold: number): Promise<boolean> {
    const images = await getImagesOnPage(page);
    for (const image of images) {
        // NOTE(review): the `as any` cast is kept because the element type
        // returned by getImagesOnPage is not visible here; drop it if that
        // type is already PDFJSImage.
        if (!await isImageBlank(image as any, threshold)) {
            return false;
        }
    }
    return true;
}
2023-11-16 01:56:17 +01:00
/**
 * Decides whether a single page image is blank, i.e. at least as bright as
 * the white threshold.
 *
 * The pixel buffer is wrapped as an RGB image, converted to greyscale, and
 * the mean grey level is compared against `threshold`. The image counts as
 * blank when the mean is greater than or equal to the threshold, so bright
 * (white) images are blank and dark ones are not.
 *
 * @param image - Image extracted from the page; `width`, `height` and `data` are read.
 * @param threshold - Minimum mean grey level for the image to count as blank.
 *   Presumably on the same scale as the pixel data (0-255 for 8-bit) - TODO confirm.
 * @returns `true` when the mean grey level reaches `threshold`.
 */
// TODO: Maybe respect image.kind and convert accordingly, needs to be tested with a pdf with alpha-image
async function isImageBlank(image: PDFJSImage, threshold: number): Promise<boolean> {
    // The buffer is always interpreted as RGB here (see TODO above).
    const img = new Image(image.width, image.height, image.data, { kind: "RGB" as ImageKind });
    const grey = img.grey();
    const mean = grey.getMean();
    // Only the first entry of the mean is used: the greyscale channel.
    // A white image has a HIGH mean, so "blank" means mean >= threshold.
    return mean[0] >= threshold;
}