Stirling-PDF/frontend/src/hooks/tools/ocr/useOCROperation.ts
2025-08-21 18:08:47 +01:00

125 lines
5.3 KiB
TypeScript

import { useCallback } from 'react';
import { useTranslation } from 'react-i18next';
import { OCRParameters, defaultParameters } from './useOCRParameters';
import { useToolOperation, ToolOperationConfig } from '../shared/useToolOperation';
import { createStandardErrorHandler } from '../../../utils/toolErrorHandler';
import { useToolResources } from '../shared/useToolResources';
// Helper: get MIME type based on file extension
function getMimeType(filename: string): string {
const ext = filename.toLowerCase().split('.').pop();
switch (ext) {
case 'pdf': return 'application/pdf';
case 'txt': return 'text/plain';
case 'zip': return 'application/zip';
default: return 'application/octet-stream';
}
}
// Lightweight ZIP extractor (keep or replace with a shared util if you have one)
async function extractZipFile(zipBlob: Blob): Promise<File[]> {
const JSZip = await import('jszip');
const zip = new JSZip.default();
const zipContent = await zip.loadAsync(await zipBlob.arrayBuffer());
const out: File[] = [];
for (const [filename, file] of Object.entries(zipContent.files)) {
if (!file.dir) {
const content = await file.async('blob');
out.push(new File([content], filename, { type: getMimeType(filename) }));
}
}
return out;
}
// Helper: strip extension
function stripExt(name: string): string {
const i = name.lastIndexOf('.');
return i > 0 ? name.slice(0, i) : name;
}
// Static function that can be used by both the hook and automation executor
export const buildOCRFormData = (parameters: OCRParameters, file: File): FormData => {
const formData = new FormData();
formData.append('fileInput', file);
parameters.languages.forEach((lang) => formData.append('languages', lang));
formData.append('ocrType', parameters.ocrType);
formData.append('ocrRenderType', parameters.ocrRenderType);
formData.append('sidecar', parameters.additionalOptions.includes('sidecar').toString());
formData.append('deskew', parameters.additionalOptions.includes('deskew').toString());
formData.append('clean', parameters.additionalOptions.includes('clean').toString());
formData.append('cleanFinal', parameters.additionalOptions.includes('cleanFinal').toString());
formData.append('removeImagesAfter', parameters.additionalOptions.includes('removeImagesAfter').toString());
return formData;
};
// Static response handler for OCR - can be used by automation executor
export const ocrResponseHandler = async (blob: Blob, originalFiles: File[], extractZipFiles: (blob: Blob) => Promise<{ success: boolean; extractedFiles: File[]; errors: string[] }>): Promise<File[]> => {
const headBuf = await blob.slice(0, 8).arrayBuffer();
const head = new TextDecoder().decode(new Uint8Array(headBuf));
// ZIP: sidecar or multi-asset output
if (head.startsWith('PK')) {
const base = stripExt(originalFiles[0].name);
try {
const result = await extractZipFiles(blob);
if (result.success && result.extractedFiles.length > 0) return result.extractedFiles;
} catch { /* ignore and try local extractor */ }
try {
const local = await extractZipFile(blob); // local fallback
if (local.length > 0) return local;
} catch { /* fall through */ }
return [new File([blob], `ocr_${base}.zip`, { type: 'application/zip' })];
}
// Not a PDF: surface error details if present
if (!head.startsWith('%PDF')) {
const textBuf = await blob.slice(0, 1024).arrayBuffer();
const text = new TextDecoder().decode(new Uint8Array(textBuf));
if (/error|exception|html/i.test(text)) {
if (text.includes('OCR tools') && text.includes('not installed')) {
throw new Error('OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.');
}
const title =
text.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1] ||
text.match(/<h1[^>]*>([^<]+)<\/h1>/i)?.[1] ||
'Unknown error';
throw new Error(`OCR service error: ${title}`);
}
throw new Error(`Response is not a valid PDF. Header: "${head}"`);
}
const base = stripExt(originalFiles[0].name);
return [new File([blob], `ocr_${base}.pdf`, { type: 'application/pdf' })];
};
// Static configuration object (without t function dependencies)
export const ocrOperationConfig = {
operationType: 'ocr',
endpoint: '/api/v1/misc/ocr-pdf',
buildFormData: buildOCRFormData,
filePrefix: 'ocr_',
multiFileEndpoint: false,
defaultParameters,
} as const;
export const useOCROperation = () => {
const { t } = useTranslation();
const { extractZipFiles } = useToolResources();
// OCR-specific parsing: ZIP (sidecar) vs PDF vs HTML error
const responseHandler = useCallback(async (blob: Blob, originalFiles: File[]): Promise<File[]> => {
return ocrResponseHandler(blob, originalFiles, extractZipFiles);
}, [extractZipFiles]);
const ocrConfig: ToolOperationConfig<OCRParameters> = {
...ocrOperationConfig,
responseHandler,
getErrorMessage: (error) =>
error.message?.includes('OCR tools') && error.message?.includes('not installed')
? 'OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.'
: createStandardErrorHandler(t('ocr.error.failed', 'OCR operation failed'))(error),
};
return useToolOperation(ocrConfig);
};