2025-08-08 16:01:56 +01:00
import { useCallback } from 'react' ;
2025-08-01 14:22:19 +01:00
import { useTranslation } from 'react-i18next' ;
import { OCRParameters } from '../../../components/tools/ocr/OCRSettings' ;
2025-08-08 16:01:56 +01:00
import { useToolOperation , ToolOperationConfig } from '../shared/useToolOperation' ;
import { createStandardErrorHandler } from '../../../utils/toolErrorHandler' ;
import { useToolResources } from '../shared/useToolResources' ;
2025-08-01 14:22:19 +01:00
2025-08-08 16:01:56 +01:00
/**
 * Maps a file name to a MIME type based on its extension.
 *
 * Only `pdf`, `txt` and `zip` are recognised (case-insensitively). Anything
 * else, including a name that has no extension at all, is reported as
 * `application/octet-stream`.
 *
 * @param filename - File name, optionally prefixed with a `/`-separated path.
 * @returns The MIME type string for the file.
 */
function getMimeType(filename: string): string {
  const FALLBACK_TYPE = 'application/octet-stream';

  // Only the last path segment can carry the extension; a dot inside a
  // directory name must not be mistaken for one.
  const lastSegment = filename.slice(filename.lastIndexOf('/') + 1);
  const dotIndex = lastSegment.lastIndexOf('.');

  // No dot means no extension: a file literally named "pdf" is not a PDF.
  if (dotIndex < 0) {
    return FALLBACK_TYPE;
  }

  switch (lastSegment.slice(dotIndex + 1).toLowerCase()) {
    case 'pdf':
      return 'application/pdf';
    case 'txt':
      return 'text/plain';
    case 'zip':
      return 'application/zip';
    default:
      return FALLBACK_TYPE;
  }
}
/**
 * Lightweight ZIP extractor (keep, or replace with a shared util if one exists).
 *
 * Loads `jszip` on demand through a dynamic import, reads the archive from
 * the blob and turns every non-directory entry into a `File`. Each file is
 * named after its archive entry and typed via `getMimeType`.
 *
 * @param zipBlob - Blob holding the ZIP archive bytes.
 * @returns The extracted files, in the iteration order of the archive's entries.
 */
async function extractZipFile(zipBlob: Blob): Promise<File[]> {
  const { default: JSZip } = await import('jszip');
  const archive = new JSZip();
  const loaded = await archive.loadAsync(await zipBlob.arrayBuffer());

  const extracted: File[] = [];
  for (const [entryName, entry] of Object.entries(loaded.files)) {
    // Directory entries carry no content of their own.
    if (entry.dir) {
      continue;
    }
    const data = await entry.async('blob');
    const mimeType = getMimeType(entryName);
    extracted.push(new File([data], entryName, { type: mimeType }));
  }
  return extracted;
}
2025-08-08 16:01:56 +01:00
/**
 * Removes the final extension from a file name.
 *
 * A name with no dot, or whose only dot is the very first character
 * (e.g. `.bashrc`), is returned unchanged.
 *
 * @param name - File name to shorten.
 * @returns The name without its last `.ext` suffix.
 */
function stripExt(name: string): string {
  const dotAt = name.lastIndexOf('.');
  if (dotAt <= 0) {
    return name;
  }
  return name.substring(0, dotAt);
}
2025-08-08 16:01:56 +01:00
/**
 * Builds the multipart form body for one file's OCR request.
 *
 * The parameter order must stay `(file, parameters)`.
 * NOTE(review): the hook below registers this function through an `as any`
 * cast marked FIX ME, so the signature the shared operation config actually
 * expects should be confirmed.
 *
 * Fields are appended in this order: `fileInput`, one `languages` entry per
 * selected language, `ocrType`, `ocrRenderType`, then one `"true"`/`"false"`
 * field per additional option.
 *
 * @param file - The file to run OCR on.
 * @param parameters - OCR settings chosen by the user.
 * @returns The populated form data.
 */
const buildFormData = (file: File, parameters: OCRParameters): FormData => {
  const body = new FormData();
  body.append('fileInput', file);

  for (const language of parameters.languages) {
    body.append('languages', language);
  }

  body.append('ocrType', parameters.ocrType);
  body.append('ocrRenderType', parameters.ocrRenderType);

  // Each flag is sent as "true" when selected in additionalOptions, else "false".
  const optionFlags = ['sidecar', 'deskew', 'clean', 'cleanFinal', 'removeImagesAfter'] as const;
  for (const flag of optionFlags) {
    body.append(flag, String(parameters.additionalOptions.includes(flag)));
  }

  return body;
};
/**
 * Hook that wires the OCR tool into the shared tool-operation flow.
 *
 * It posts each file individually to `/api/v1/misc/ocr-pdf` and interprets
 * the response by sniffing its leading bytes:
 * - `PK` (ZIP): sidecar / multi-asset output. The archive is unpacked with the
 *   shared extractor, then with the local fallback, and is finally returned
 *   as a single `.zip` file if neither produced any files.
 * - `%PDF`: returned as a single `ocr_<name>.pdf` file.
 * - anything else: treated as a failure and reported through a thrown `Error`.
 *
 * @returns Whatever `useToolOperation` returns for the OCR configuration.
 */
export const useOCROperation = () => {
  const { t } = useTranslation();
  const { extractZipFiles } = useToolResources();

  // OCR-specific parsing: ZIP (sidecar) vs PDF vs HTML error
  const responseHandler = useCallback(async (blob: Blob, originalFiles: File[]): Promise<File[]> => {
    const headBuf = await blob.slice(0, 8).arrayBuffer();
    const head = new TextDecoder().decode(new Uint8Array(headBuf));

    // Output files are named after the first source file. Fall back to a
    // generic name instead of throwing a TypeError when the list is empty.
    const base = stripExt(originalFiles[0]?.name ?? 'output');

    // ZIP: sidecar or multi-asset output
    if (head.startsWith('PK')) {
      try {
        const extracted = await extractZipFiles(blob);
        if (extracted.length > 0) return extracted;
      } catch { /* best effort: ignore and try the local extractor */ }
      try {
        const local = await extractZipFile(blob); // local fallback
        if (local.length > 0) return local;
      } catch { /* best effort: fall through and hand back the raw archive */ }
      return [new File([blob], `ocr_${base}.zip`, { type: 'application/zip' })];
    }

    // Not a PDF: surface error details if present
    if (!head.startsWith('%PDF')) {
      // Only the first KiB is inspected; enough to spot an HTML error page.
      const textBuf = await blob.slice(0, 1024).arrayBuffer();
      const text = new TextDecoder().decode(new Uint8Array(textBuf));
      if (/error|exception|html/i.test(text)) {
        if (text.includes('OCR tools') && text.includes('not installed')) {
          throw new Error('OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.');
        }
        // Prefer the page <title>, then the first <h1>, as the error summary.
        const title =
          text.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1] ??
          text.match(/<h1[^>]*>([^<]+)<\/h1>/i)?.[1] ??
          t('ocr.error.unknown', 'Unknown error');
        throw new Error(`OCR service error: ${title}`);
      }
      throw new Error(`Response is not a valid PDF. Header: "${head}"`);
    }

    return [new File([blob], `ocr_${base}.pdf`, { type: 'application/pdf' })];
  }, [t, extractZipFiles]);

  const ocrConfig: ToolOperationConfig<OCRParameters> = {
    operationType: 'ocr',
    endpoint: '/api/v1/misc/ocr-pdf',
    // NOTE(review): the cast hides a signature mismatch with ToolOperationConfig;
    // confirm the expected parameter order before removing it.
    buildFormData: buildFormData as any /* FIX ME */,
    filePrefix: 'ocr_',
    multiFileEndpoint: false, // Process files individually
    responseHandler, // use shared flow
    getErrorMessage: (error) => {
      return createStandardErrorHandler(t('ocr.error.failed', 'OCR operation failed'))(error);
    },
  };

  return useToolOperation(ocrConfig);
};