Clean up OCR and additional tweaks

This commit is contained in:
Reece Browne 2025-08-08 14:26:14 +01:00
parent e1296f5ac9
commit c942db0515
5 changed files with 148 additions and 218 deletions

View File

@ -1,33 +1,54 @@
import { useCallback } from 'react'; import { useCallback } from 'react';
import axios from 'axios';
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
import { OCRParameters } from '../../../components/tools/ocr/OCRSettings'; import { OCRParameters } from '../../../components/tools/ocr/OCRSettings';
import { useToolOperation, ToolOperationConfig } from '../shared/useToolOperation'; import { useToolOperation, ToolOperationConfig } from '../shared/useToolOperation';
import { createStandardErrorHandler } from '../../../utils/toolErrorHandler'; import { createStandardErrorHandler } from '../../../utils/toolErrorHandler';
import { useToolResources } from '../shared/useToolResources'; import { useToolResources } from '../shared/useToolResources';
const buildFormData = (parameters: OCRParameters, file: File): FormData => { // Helper: get MIME type based on file extension
/**
 * Maps a file name to a MIME type based on its extension.
 *
 * The extension is the text after the last `.` in the name, compared
 * case-insensitively. A name that contains no `.` at all has no extension,
 * so it is reported as generic binary rather than being matched as if the
 * whole name were the extension (e.g. an archive entry literally named `pdf`).
 *
 * @param filename - File name (or archive entry name) to inspect.
 * @returns `application/pdf`, `text/plain` or `application/zip` for the
 *   recognised extensions, otherwise `application/octet-stream`.
 */
function getMimeType(filename: string): string {
  const dotIndex = filename.lastIndexOf('.');
  if (dotIndex === -1) {
    // No separator: there is no extension to look at.
    return 'application/octet-stream';
  }

  const ext = filename.slice(dotIndex + 1).toLowerCase();
  switch (ext) {
    case 'pdf':
      return 'application/pdf';
    case 'txt':
      return 'text/plain';
    case 'zip':
      return 'application/zip';
    default:
      return 'application/octet-stream';
  }
}
/**
 * Lightweight local ZIP extractor (keep, or replace with a shared util if one exists).
 *
 * Loads `jszip` on demand, reads the archive from the given blob and returns
 * one `File` per non-directory entry. Each file keeps the entry's name as
 * stored in the archive and gets its MIME type from `getMimeType`.
 *
 * @param zipBlob - Blob holding the ZIP archive bytes.
 * @returns The extracted files, in the order the archive lists its entries.
 */
async function extractZipFile(zipBlob: Blob): Promise<File[]> {
  const jszipModule = await import('jszip');
  const archive = await new jszipModule.default().loadAsync(await zipBlob.arrayBuffer());

  const extracted: File[] = [];
  for (const [entryName, entry] of Object.entries(archive.files)) {
    // Directory entries carry no payload; only real files are materialised.
    if (entry.dir) {
      continue;
    }
    const payload = await entry.async('blob');
    const mimeType = getMimeType(entryName);
    extracted.push(new File([payload], entryName, { type: mimeType }));
  }
  return extracted;
}
/**
 * Removes the final extension from a file name.
 *
 * Names with no dot, or whose only dot is the first character (such as
 * `.gitignore`), are returned unchanged.
 *
 * @param name - File name to trim.
 * @returns The name without its last `.ext` suffix.
 */
function stripExt(name: string): string {
  const dotAt = name.lastIndexOf('.');
  if (dotAt <= 0) {
    return name;
  }
  return name.substring(0, dotAt);
}
// Signature must be (file, params)
const buildFormData = (file: File, parameters: OCRParameters): FormData => {
const formData = new FormData(); const formData = new FormData();
// Add the file
formData.append('fileInput', file); formData.append('fileInput', file);
parameters.languages.forEach((lang) => formData.append('languages', lang));
// Add languages as multiple parameters with same name (like checkboxes)
parameters.languages.forEach(lang => {
formData.append('languages', lang);
});
// Add other parameters
formData.append('ocrType', parameters.ocrType); formData.append('ocrType', parameters.ocrType);
formData.append('ocrRenderType', parameters.ocrRenderType); formData.append('ocrRenderType', parameters.ocrRenderType);
// Handle additional options - convert array to individual boolean parameters
formData.append('sidecar', parameters.additionalOptions.includes('sidecar').toString()); formData.append('sidecar', parameters.additionalOptions.includes('sidecar').toString());
formData.append('deskew', parameters.additionalOptions.includes('deskew').toString()); formData.append('deskew', parameters.additionalOptions.includes('deskew').toString());
formData.append('clean', parameters.additionalOptions.includes('clean').toString()); formData.append('clean', parameters.additionalOptions.includes('clean').toString());
formData.append('cleanFinal', parameters.additionalOptions.includes('cleanFinal').toString()); formData.append('cleanFinal', parameters.additionalOptions.includes('cleanFinal').toString());
formData.append('removeImagesAfter', parameters.additionalOptions.includes('removeImagesAfter').toString()); formData.append('removeImagesAfter', parameters.additionalOptions.includes('removeImagesAfter').toString());
return formData; return formData;
}; };
@ -35,126 +56,61 @@ export const useOCROperation = () => {
const { t } = useTranslation(); const { t } = useTranslation();
const { extractZipFiles } = useToolResources(); const { extractZipFiles } = useToolResources();
const customOCRProcessor = useCallback(async ( // OCR-specific parsing: ZIP (sidecar) vs PDF vs HTML error
parameters: OCRParameters, const responseHandler = useCallback(async (blob: Blob, originalFiles: File[]): Promise<File[]> => {
selectedFiles: File[] const headBuf = await blob.slice(0, 8).arrayBuffer();
): Promise<File[]> => { const head = new TextDecoder().decode(new Uint8Array(headBuf));
const processedFiles: File[] = [];
const failedFiles: string[] = [];
// OCR typically processes one file at a time
for (let i = 0; i < selectedFiles.length; i++) {
const file = selectedFiles[i];
// ZIP: sidecar or multi-asset output
if (head.startsWith('PK')) {
const base = stripExt(originalFiles[0].name);
try { try {
const formData = buildFormData(parameters, file); const extracted = await extractZipFiles(blob);
const response = await axios.post('/api/v1/misc/ocr-pdf', formData, { if (extracted.length > 0) return extracted;
responseType: "blob" } catch { /* ignore and try local extractor */ }
});
// Check for HTTP errors
if (response.status >= 400) {
const errorText = await response.data.text();
throw new Error(`OCR service HTTP error ${response.status}: ${errorText.substring(0, 300)}`);
}
// Validate response
if (!response.data || response.data.size === 0) {
throw new Error('Empty response from OCR service');
}
const contentType = response.headers['content-type'] || 'application/pdf';
// Check if response is actually a PDF by examining the first few bytes
const arrayBuffer = await response.data.arrayBuffer();
const uint8Array = new Uint8Array(arrayBuffer);
const header = new TextDecoder().decode(uint8Array.slice(0, 4));
// Check if it's a ZIP file (OCR service returns ZIP when sidecar is enabled or for multi-file results)
if (header.startsWith('PK')) {
try { try {
// Extract ZIP file contents using tool resources const local = await extractZipFile(blob); // local fallback
const zipBlob = new Blob([arrayBuffer]); if (local.length > 0) return local;
const extractedFiles = await extractZipFiles(zipBlob); } catch { /* fall through */ }
return [new File([blob], `ocr_${base}.zip`, { type: 'application/zip' })];
if (extractedFiles.length > 0) {
// Add extracted files to processed files
processedFiles.push(...extractedFiles);
} else {
// Fallback to treating as single ZIP file if extraction failed
const zipFile = new File([arrayBuffer], `ocr_${file.name}.zip`, { type: 'application/zip' });
processedFiles.push(zipFile);
}
} catch (extractError) {
// Fallback to treating as single ZIP file
const zipFile = new File([arrayBuffer], `ocr_${file.name}.zip`, { type: 'application/zip' });
processedFiles.push(zipFile);
}
continue; // Skip the PDF validation for ZIP files
} }
if (!header.startsWith('%PDF')) { // Not a PDF: surface error details if present
// Check if it's an error response if (!head.startsWith('%PDF')) {
const text = new TextDecoder().decode(uint8Array.slice(0, 500)); const textBuf = await blob.slice(0, 1024).arrayBuffer();
const text = new TextDecoder().decode(new Uint8Array(textBuf));
if (text.includes('error') || text.includes('Error') || text.includes('exception') || text.includes('html')) { if (/error|exception|html/i.test(text)) {
// Check for specific OCR tool unavailable error
if (text.includes('OCR tools') && text.includes('not installed')) { if (text.includes('OCR tools') && text.includes('not installed')) {
throw new Error('OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.'); throw new Error('OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.');
} }
throw new Error(`OCR service error: ${text.substring(0, 300)}`); const title =
text.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1] ||
text.match(/<h1[^>]*>([^<]+)<\/h1>/i)?.[1] ||
t('ocr.error.unknown', 'Unknown error');
throw new Error(`OCR service error: ${title}`);
}
throw new Error(`Response is not a valid PDF. Header: "${head}"`);
} }
// Check if it's an HTML error page const base = stripExt(originalFiles[0].name);
if (text.includes('<html') || text.includes('<!DOCTYPE')) { return [new File([blob], `ocr_${base}.pdf`, { type: 'application/pdf' })];
// Try to extract error message from HTML }, [t, extractZipFiles]);
const errorMatch = text.match(/<title[^>]*>([^<]+)<\/title>/i) ||
text.match(/<h1[^>]*>([^<]+)<\/h1>/i) ||
text.match(/<body[^>]*>([^<]+)<\/body>/i);
const errorMessage = errorMatch ? errorMatch[1].trim() : t('ocr.error.unknown', 'Unknown error');
throw new Error(`OCR service error: ${errorMessage}`);
}
throw new Error(`Response is not a valid PDF file. Header: "${header}"`);
}
const blob = new Blob([arrayBuffer], { type: contentType });
const processedFile = new File([blob], `ocr_${file.name}`, { type: contentType });
processedFiles.push(processedFile);
} catch (fileError) {
const errorMessage = fileError instanceof Error ? fileError.message : t('ocr.error.unknown', 'Unknown error');
failedFiles.push(`${file.name} (${errorMessage})`);
}
}
if (failedFiles.length > 0 && processedFiles.length === 0) {
throw new Error(`Failed to process OCR for all files: ${failedFiles.join(', ')}`);
}
return processedFiles;
}, [t]);
const ocrConfig: ToolOperationConfig<OCRParameters> = { const ocrConfig: ToolOperationConfig<OCRParameters> = {
operationType: 'ocr', operationType: 'ocr',
endpoint: '/api/v1/misc/ocr-pdf', // Not used with customProcessor but required endpoint: '/api/v1/misc/ocr-pdf',
buildFormData, // Not used with customProcessor but required buildFormData,
filePrefix: 'ocr_', filePrefix: 'ocr_',
customProcessor: customOCRProcessor, multiFileEndpoint: false, // Process files individually
validateParams: (params) => { responseHandler, // use shared flow
if (params.languages.length === 0) { validateParams: (params) =>
return { valid: false, errors: [t('ocr.validation.languageRequired', 'Please select at least one language for OCR processing.')] }; params.languages.length === 0
} ? { valid: false, errors: [t('ocr.validation.languageRequired', 'Please select at least one language for OCR processing.')] }
return { valid: true }; : { valid: true },
}, getErrorMessage: (error) =>
getErrorMessage: (error) => { error.message?.includes('OCR tools') && error.message?.includes('not installed')
// Handle OCR-specific error first ? 'OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.'
if (error.message?.includes('OCR tools') && error.message?.includes('not installed')) { : createStandardErrorHandler(t('ocr.error.failed', 'OCR operation failed'))(error),
return 'OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.';
}
// Fall back to standard error handling
return createStandardErrorHandler(t('ocr.error.failed', 'OCR operation failed'))(error);
}
}; };
return useToolOperation(ocrConfig); return useToolOperation(ocrConfig);

View File

@ -38,9 +38,10 @@ export const useToolApiCalls = <TParams = void>() => {
const endpoint = typeof config.endpoint === 'function' ? config.endpoint(params) : config.endpoint; const endpoint = typeof config.endpoint === 'function' ? config.endpoint(params) : config.endpoint;
const response = await axios.post(endpoint, formData, { const response = await axios.post(endpoint, formData, {
responseType: 'blob', responseType: 'blob',
cancelToken: cancelTokenRef.current.token cancelToken: cancelTokenRef.current.token,
}); });
// Forward to shared response processor (uses tool-specific responseHandler if provided)
const responseFiles = await processResponse( const responseFiles = await processResponse(
response.data, response.data,
[file], [file],

View File

@ -7,7 +7,7 @@ import { useToolApiCalls, type ApiCallsConfig } from './useToolApiCalls';
import { useToolResources } from './useToolResources'; import { useToolResources } from './useToolResources';
import { extractErrorMessage } from '../../../utils/toolErrorHandler'; import { extractErrorMessage } from '../../../utils/toolErrorHandler';
import { createOperation } from '../../../utils/toolOperationTracker'; import { createOperation } from '../../../utils/toolOperationTracker';
import type { ResponseHandler } from '../../../utils/toolResponseProcessor'; import { type ResponseHandler, processResponse } from '../../../utils/toolResponseProcessor';
export interface ValidationResult { export interface ValidationResult {
valid: boolean; valid: boolean;
@ -169,27 +169,24 @@ export const useToolOperation = <TParams = void>(
const response = await axios.post(endpoint, formData, { responseType: 'blob' }); const response = await axios.post(endpoint, formData, { responseType: 'blob' });
// Handle response based on responseHandler // Multi-file responses are typically ZIP files that need extraction
if (config.responseHandler?.type === 'zip' && config.responseHandler?.useZipExtractor) { if (config.responseHandler) {
// Use tool resources for ZIP extraction // Use custom responseHandler for multi-file (handles ZIP extraction)
processedFiles = await config.responseHandler(response.data, validFiles);
} else {
// Default: assume ZIP response for multi-file endpoints
processedFiles = await extractZipFiles(response.data); processedFiles = await extractZipFiles(response.data);
if (processedFiles.length === 0) { if (processedFiles.length === 0) {
// Try the generic extraction as fallback // Try the generic extraction as fallback
processedFiles = await extractAllZipFiles(response.data); processedFiles = await extractAllZipFiles(response.data);
} }
} else {
// Single file response
const filename = validFiles.length === 1
? `${config.filePrefix}${validFiles[0].name}`
: `${config.filePrefix}result.pdf`;
processedFiles = [new File([response.data], filename, { type: response.data.type })];
} }
} else { } else {
// Individual file processing - separate API call per file // Individual file processing - separate API call per file
const apiCallsConfig: ApiCallsConfig<TParams> = { const apiCallsConfig: ApiCallsConfig<TParams> = {
endpoint: config.endpoint, endpoint: config.endpoint,
buildFormData: (file: File, params: TParams) => (config.buildFormData as (params: TParams, file: File) => FormData)(params, file), buildFormData: (file: File, params: TParams) => (config.buildFormData as (file: File, params: TParams) => FormData)(file, params),
filePrefix: config.filePrefix, filePrefix: config.filePrefix,
responseHandler: config.responseHandler responseHandler: config.responseHandler
}; };

View File

@ -66,10 +66,6 @@ export const useSplitOperation = () => {
buildFormData: buildFormData, // Multi-file signature: (params, selectedFiles) => FormData buildFormData: buildFormData, // Multi-file signature: (params, selectedFiles) => FormData
filePrefix: 'split_', filePrefix: 'split_',
multiFileEndpoint: true, // Single API call with all files multiFileEndpoint: true, // Single API call with all files
responseHandler: {
type: 'zip',
useZipExtractor: true
},
validateParams: (params) => { validateParams: (params) => {
if (!params.mode) { if (!params.mode) {
return { valid: false, errors: [t('split.validation.modeRequired', 'Split mode is required')] }; return { valid: false, errors: [t('split.validation.modeRequired', 'Split mode is required')] };

View File

@ -1,45 +1,25 @@
// Note: This utility should be used with useToolResources for ZIP operations // Note: This utility should be used with useToolResources for ZIP operations
export interface ResponseHandler { export type ResponseHandler = (blob: Blob, originalFiles: File[]) => Promise<File[]> | File[];
type: 'single' | 'zip' | 'custom';
processor?: (blob: Blob) => Promise<File[]>;
useZipExtractor?: boolean;
}
const defaultResponseHandler: ResponseHandler = {
type: 'single'
};
/** /**
* Processes API response blob based on handler configuration * Processes a blob response into File(s).
* Note: For ZIP extraction, use useToolResources.extractZipFiles instead * - If a tool-specific responseHandler is provided, it is used.
* - Otherwise, create a single file using the filePrefix + original name.
*/ */
export const processResponse = async ( export async function processResponse(
blob: Blob, blob: Blob,
originalFiles: File[], originalFiles: File[],
filePrefix: string, filePrefix: string,
responseHandler?: ResponseHandler responseHandler?: ResponseHandler
): Promise<File[]> => { ): Promise<File[]> {
const handler = responseHandler || defaultResponseHandler; if (responseHandler) {
const out = await responseHandler(blob, originalFiles);
return Array.isArray(out) ? out : [out as unknown as File];
}
switch (handler.type) { const original = originalFiles[0]?.name ?? 'result.pdf';
case 'zip': const name = `${filePrefix}${original}`;
if (handler.useZipExtractor) { const type = blob.type || 'application/octet-stream';
// This path should be avoided - use useToolResources.extractZipFiles instead return [new File([blob], name, { type })];
throw new Error('ZIP extraction should use useToolResources.extractZipFiles'); }
}
// Fall through to custom if no zip extractor
case 'custom':
if (handler.processor) {
return await handler.processor(blob);
}
// Fall through to single
case 'single':
default:
const contentType = blob.type || 'application/pdf';
const filename = originalFiles.length === 1
? `${filePrefix}${originalFiles[0].name}`
: `${filePrefix}result.pdf`;
return [new File([blob], filename, { type: contentType })];
}
};