diff --git a/frontend/src/hooks/tools/ocr/useOCROperation.ts b/frontend/src/hooks/tools/ocr/useOCROperation.ts index ad0c51c67..316b867d8 100644 --- a/frontend/src/hooks/tools/ocr/useOCROperation.ts +++ b/frontend/src/hooks/tools/ocr/useOCROperation.ts @@ -1,161 +1,117 @@ import { useCallback } from 'react'; -import axios from 'axios'; import { useTranslation } from 'react-i18next'; import { OCRParameters } from '../../../components/tools/ocr/OCRSettings'; import { useToolOperation, ToolOperationConfig } from '../shared/useToolOperation'; import { createStandardErrorHandler } from '../../../utils/toolErrorHandler'; import { useToolResources } from '../shared/useToolResources'; -const buildFormData = (parameters: OCRParameters, file: File): FormData => { +// Helper: get MIME type based on file extension +function getMimeType(filename: string): string { + const ext = filename.toLowerCase().split('.').pop(); + switch (ext) { + case 'pdf': return 'application/pdf'; + case 'txt': return 'text/plain'; + case 'zip': return 'application/zip'; + default: return 'application/octet-stream'; + } +} + +// Lightweight ZIP extractor (keep or replace with a shared util if you have one) +async function extractZipFile(zipBlob: Blob): Promise { + const JSZip = await import('jszip'); + const zip = new JSZip.default(); + const zipContent = await zip.loadAsync(await zipBlob.arrayBuffer()); + const out: File[] = []; + for (const [filename, file] of Object.entries(zipContent.files)) { + if (!file.dir) { + const content = await file.async('blob'); + out.push(new File([content], filename, { type: getMimeType(filename) })); + } + } + return out; +} + +// Helper: strip extension +function stripExt(name: string): string { + const i = name.lastIndexOf('.'); + return i > 0 ? name.slice(0, i) : name; +} + +// Signature must be (file, params) +const buildFormData = (file: File, parameters: OCRParameters): FormData => { const formData = new FormData(); - - // Add the file formData.append('fileInput', file); - - // Add languages as multiple parameters with same name (like checkboxes) - parameters.languages.forEach(lang => { - formData.append('languages', lang); - }); - - // Add other parameters + parameters.languages.forEach((lang) => formData.append('languages', lang)); formData.append('ocrType', parameters.ocrType); formData.append('ocrRenderType', parameters.ocrRenderType); - - // Handle additional options - convert array to individual boolean parameters formData.append('sidecar', parameters.additionalOptions.includes('sidecar').toString()); formData.append('deskew', parameters.additionalOptions.includes('deskew').toString()); formData.append('clean', parameters.additionalOptions.includes('clean').toString()); formData.append('cleanFinal', parameters.additionalOptions.includes('cleanFinal').toString()); formData.append('removeImagesAfter', parameters.additionalOptions.includes('removeImagesAfter').toString()); - return formData; }; export const useOCROperation = () => { const { t } = useTranslation(); const { extractZipFiles } = useToolResources(); - - const customOCRProcessor = useCallback(async ( - parameters: OCRParameters, - selectedFiles: File[] - ): Promise => { - const processedFiles: File[] = []; - const failedFiles: string[] = []; - // OCR typically processes one file at a time - for (let i = 0; i < selectedFiles.length; i++) { - const file = selectedFiles[i]; + // OCR-specific parsing: ZIP (sidecar) vs PDF vs HTML error + const responseHandler = useCallback(async (blob: Blob, originalFiles: File[]): Promise => { + const headBuf = await blob.slice(0, 8).arrayBuffer(); + const head = new TextDecoder().decode(new Uint8Array(headBuf)); + // ZIP: sidecar or multi-asset output + if (head.startsWith('PK')) { + const base = stripExt(originalFiles[0].name); try { - const formData = buildFormData(parameters, file); - const response = await axios.post('/api/v1/misc/ocr-pdf', formData, { - responseType: "blob" - }); + const extracted = await extractZipFiles(blob); + if (extracted.length > 0) return extracted; + } catch { /* ignore and try local extractor */ } + try { + const local = await extractZipFile(blob); // local fallback + if (local.length > 0) return local; + } catch { /* fall through */ } + return [new File([blob], `ocr_${base}.zip`, { type: 'application/zip' })]; + } - // Check for HTTP errors - if (response.status >= 400) { - const errorText = await response.data.text(); - throw new Error(`OCR service HTTP error ${response.status}: ${errorText.substring(0, 300)}`); + // Not a PDF: surface error details if present + if (!head.startsWith('%PDF')) { + const textBuf = await blob.slice(0, 1024).arrayBuffer(); + const text = new TextDecoder().decode(new Uint8Array(textBuf)); + if (/error|exception|html/i.test(text)) { + if (text.includes('OCR tools') && text.includes('not installed')) { + throw new Error('OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.'); } - - // Validate response - if (!response.data || response.data.size === 0) { - throw new Error('Empty response from OCR service'); - } - - const contentType = response.headers['content-type'] || 'application/pdf'; - - // Check if response is actually a PDF by examining the first few bytes - const arrayBuffer = await response.data.arrayBuffer(); - const uint8Array = new Uint8Array(arrayBuffer); - const header = new TextDecoder().decode(uint8Array.slice(0, 4)); - - // Check if it's a ZIP file (OCR service returns ZIP when sidecar is enabled or for multi-file results) - if (header.startsWith('PK')) { - try { - // Extract ZIP file contents using tool resources - const zipBlob = new Blob([arrayBuffer]); - const extractedFiles = await extractZipFiles(zipBlob); - - if (extractedFiles.length > 0) { - // Add extracted files to processed files - processedFiles.push(...extractedFiles); - } else { - // Fallback to treating as single ZIP file if extraction failed - const zipFile = new File([arrayBuffer], `ocr_${file.name}.zip`, { type: 'application/zip' }); - processedFiles.push(zipFile); - } - } catch (extractError) { - // Fallback to treating as single ZIP file - const zipFile = new File([arrayBuffer], `ocr_${file.name}.zip`, { type: 'application/zip' }); - processedFiles.push(zipFile); - } - continue; // Skip the PDF validation for ZIP files - } - - if (!header.startsWith('%PDF')) { - // Check if it's an error response - const text = new TextDecoder().decode(uint8Array.slice(0, 500)); - - if (text.includes('error') || text.includes('Error') || text.includes('exception') || text.includes('html')) { - // Check for specific OCR tool unavailable error - if (text.includes('OCR tools') && text.includes('not installed')) { - throw new Error('OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.'); - } - throw new Error(`OCR service error: ${text.substring(0, 300)}`); - } - - // Check if it's an HTML error page - if (text.includes(']*>([^<]+)<\/title>/i) || - text.match(/]*>([^<]+)<\/h1>/i) || - text.match(/]*>([^<]+)<\/body>/i); - const errorMessage = errorMatch ? errorMatch[1].trim() : t('ocr.error.unknown', 'Unknown error'); - throw new Error(`OCR service error: ${errorMessage}`); - } - - throw new Error(`Response is not a valid PDF file. Header: "${header}"`); - } - - const blob = new Blob([arrayBuffer], { type: contentType }); - const processedFile = new File([blob], `ocr_${file.name}`, { type: contentType }); - - processedFiles.push(processedFile); - } catch (fileError) { - const errorMessage = fileError instanceof Error ? fileError.message : t('ocr.error.unknown', 'Unknown error'); - failedFiles.push(`${file.name} (${errorMessage})`); + const title = + text.match(/]*>([^<]+)<\/title>/i)?.[1] || + text.match(/]*>([^<]+)<\/h1>/i)?.[1] || + t('ocr.error.unknown', 'Unknown error'); + throw new Error(`OCR service error: ${title}`); } + throw new Error(`Response is not a valid PDF. Header: "${head}"`); } - if (failedFiles.length > 0 && processedFiles.length === 0) { - throw new Error(`Failed to process OCR for all files: ${failedFiles.join(', ')}`); - } - - return processedFiles; - }, [t]); + const base = stripExt(originalFiles[0].name); + return [new File([blob], `ocr_${base}.pdf`, { type: 'application/pdf' })]; + }, [t, extractZipFiles]); const ocrConfig: ToolOperationConfig = { operationType: 'ocr', - endpoint: '/api/v1/misc/ocr-pdf', // Not used with customProcessor but required - buildFormData, // Not used with customProcessor but required + endpoint: '/api/v1/misc/ocr-pdf', + buildFormData, filePrefix: 'ocr_', - customProcessor: customOCRProcessor, - validateParams: (params) => { - if (params.languages.length === 0) { - return { valid: false, errors: [t('ocr.validation.languageRequired', 'Please select at least one language for OCR processing.')] }; - } - return { valid: true }; - }, - getErrorMessage: (error) => { - // Handle OCR-specific error first - if (error.message?.includes('OCR tools') && error.message?.includes('not installed')) { - return 'OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.'; - } - // Fall back to standard error handling - return createStandardErrorHandler(t('ocr.error.failed', 'OCR operation failed'))(error); - } + multiFileEndpoint: false, // Process files individually + responseHandler, // use shared flow + validateParams: (params) => + params.languages.length === 0 + ? { valid: false, errors: [t('ocr.validation.languageRequired', 'Please select at least one language for OCR processing.')] } + : { valid: true }, + getErrorMessage: (error) => + error.message?.includes('OCR tools') && error.message?.includes('not installed') + ? 'OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.' + : createStandardErrorHandler(t('ocr.error.failed', 'OCR operation failed'))(error), }; return useToolOperation(ocrConfig); -}; \ No newline at end of file +}; diff --git a/frontend/src/hooks/tools/shared/useToolApiCalls.ts b/frontend/src/hooks/tools/shared/useToolApiCalls.ts index 639751063..88f1f4d31 100644 --- a/frontend/src/hooks/tools/shared/useToolApiCalls.ts +++ b/frontend/src/hooks/tools/shared/useToolApiCalls.ts @@ -29,7 +29,7 @@ export const useToolApiCalls = () => { for (let i = 0; i < validFiles.length; i++) { const file = validFiles[i]; - + onProgress({ current: i + 1, total, currentFileName: file.name }); onStatus(`Processing ${file.name} (${i + 1}/${total})`); @@ -38,12 +38,13 @@ export const useToolApiCalls = () => { const endpoint = typeof config.endpoint === 'function' ? config.endpoint(params) : config.endpoint; const response = await axios.post(endpoint, formData, { responseType: 'blob', - cancelToken: cancelTokenRef.current.token + cancelToken: cancelTokenRef.current.token, }); + // Forward to shared response processor (uses tool-specific responseHandler if provided) const responseFiles = await processResponse( - response.data, - [file], + response.data, + [file], config.filePrefix, config.responseHandler ); @@ -82,4 +83,4 @@ export const useToolApiCalls = () => { processFiles, cancelOperation, }; -}; \ No newline at end of file +}; diff --git a/frontend/src/hooks/tools/shared/useToolOperation.ts b/frontend/src/hooks/tools/shared/useToolOperation.ts index b5360d979..7251d1fd2 100644 --- a/frontend/src/hooks/tools/shared/useToolOperation.ts +++ b/frontend/src/hooks/tools/shared/useToolOperation.ts @@ -7,7 +7,7 @@ import { useToolApiCalls, type ApiCallsConfig } from './useToolApiCalls'; import { useToolResources } from './useToolResources'; import { extractErrorMessage } from '../../../utils/toolErrorHandler'; import { createOperation } from '../../../utils/toolOperationTracker'; -import type { ResponseHandler } from '../../../utils/toolResponseProcessor'; +import { type ResponseHandler, processResponse } from '../../../utils/toolResponseProcessor'; export interface ValidationResult { valid: boolean; @@ -19,54 +19,54 @@ export type { ProcessingProgress, ResponseHandler }; /** * Configuration for tool operations defining processing behavior and API integration. - * + * * Supports three patterns: * 1. Single-file tools: multiFileEndpoint: false, processes files individually - * 2. Multi-file tools: multiFileEndpoint: true, single API call with all files + * 2. Multi-file tools: multiFileEndpoint: true, single API call with all files * 3. Complex tools: customProcessor handles all processing logic */ export interface ToolOperationConfig { /** Operation identifier for tracking and logging */ operationType: string; - - /** + + /** * API endpoint for the operation. Can be static string or function for dynamic routing. * Not used when customProcessor is provided. */ endpoint: string | ((params: TParams) => string); - - /** + + /** * Builds FormData for API request. Signature determines processing approach: * - (params, file: File) => FormData: Single-file processing - * - (params, files: File[]) => FormData: Multi-file processing + * - (params, files: File[]) => FormData: Multi-file processing * Not used when customProcessor is provided. */ buildFormData: ((params: TParams, file: File) => FormData) | ((params: TParams, files: File[]) => FormData); - + /** Prefix added to processed filenames (e.g., 'compressed_', 'split_') */ filePrefix: string; - - /** + + /** * Whether this tool uses backends that accept MultipartFile[] arrays. * - true: Single API call with all files (backend uses MultipartFile[]) * - false/undefined: Individual API calls per file (backend uses single MultipartFile) * Ignored when customProcessor is provided. */ multiFileEndpoint?: boolean; - + /** How to handle API responses (e.g., ZIP extraction, single file response) */ responseHandler?: ResponseHandler; - - /** + + /** * Custom processing logic that completely bypasses standard file processing. * When provided, tool handles all API calls, response processing, and file creation. * Use for tools with complex routing logic or non-standard processing requirements. */ customProcessor?: (params: TParams, files: File[]) => Promise; - + /** Validate parameters before execution. Return validation errors if invalid. */ validateParams?: (params: TParams) => ValidationResult; - + /** Extract user-friendly error messages from API errors */ getErrorMessage?: (error: any) => string; } @@ -85,7 +85,7 @@ export interface ToolOperationHook { status: string; errorMessage: string | null; progress: ProcessingProgress | null; - + // Actions executeOperation: (params: TParams, selectedFiles: File[]) => Promise; resetResults: () => void; @@ -99,12 +99,12 @@ export { createStandardErrorHandler } from '../../../utils/toolErrorHandler'; /** * Shared hook for tool operations providing consistent error handling, progress tracking, * and FileContext integration. Eliminates boilerplate while maintaining flexibility. - * + * * Supports three tool patterns: * 1. Single-file tools: Set multiFileEndpoint: false, processes files individually * 2. Multi-file tools: Set multiFileEndpoint: true, single API call with all files * 3. Complex tools: Provide customProcessor for full control over processing logic - * + * * @param config - Tool operation configuration * @returns Hook interface with state and execution methods */ @@ -113,7 +113,7 @@ export const useToolOperation = ( ): ToolOperationHook => { const { t } = useTranslation(); const { recordOperation, markOperationApplied, markOperationFailed, addFiles } = useFileContext(); - + // Composed hooks const { state, actions } = useToolState(); const { processFiles, cancelOperation: cancelApiCalls } = useToolApiCalls(); @@ -155,7 +155,7 @@ export const useToolOperation = ( try { let processedFiles: File[]; - + if (config.customProcessor) { actions.setStatus('Processing files...'); processedFiles = await config.customProcessor(params, validFiles); @@ -166,46 +166,43 @@ export const useToolOperation = ( actions.setStatus('Processing files...'); const formData = (config.buildFormData as (params: TParams, files: File[]) => FormData)(params, validFiles); const endpoint = typeof config.endpoint === 'function' ? config.endpoint(params) : config.endpoint; - + const response = await axios.post(endpoint, formData, { responseType: 'blob' }); - - // Handle response based on responseHandler - if (config.responseHandler?.type === 'zip' && config.responseHandler?.useZipExtractor) { - // Use tool resources for ZIP extraction + + // Multi-file responses are typically ZIP files that need extraction + if (config.responseHandler) { + // Use custom responseHandler for multi-file (handles ZIP extraction) + processedFiles = await config.responseHandler(response.data, validFiles); + } else { + // Default: assume ZIP response for multi-file endpoints processedFiles = await extractZipFiles(response.data); if (processedFiles.length === 0) { // Try the generic extraction as fallback processedFiles = await extractAllZipFiles(response.data); } - } else { - // Single file response - const filename = validFiles.length === 1 - ? `${config.filePrefix}${validFiles[0].name}` - : `${config.filePrefix}result.pdf`; - processedFiles = [new File([response.data], filename, { type: response.data.type })]; } } else { // Individual file processing - separate API call per file const apiCallsConfig: ApiCallsConfig = { endpoint: config.endpoint, - buildFormData: (file: File, params: TParams) => (config.buildFormData as (params: TParams, file: File) => FormData)(params, file), + buildFormData: (file: File, params: TParams) => (config.buildFormData as (file: File, params: TParams) => FormData)(file, params), filePrefix: config.filePrefix, responseHandler: config.responseHandler }; processedFiles = await processFiles( - params, - validFiles, + params, + validFiles, apiCallsConfig, actions.setProgress, actions.setStatus ); } } - + if (processedFiles.length > 0) { actions.setFiles(processedFiles); - + // Generate thumbnails and download URL concurrently actions.setGeneratingThumbnails(true); const [thumbnails, downloadInfo] = await Promise.all([ @@ -213,13 +210,13 @@ export const useToolOperation = ( createDownloadInfo(processedFiles, config.operationType) ]); actions.setGeneratingThumbnails(false); - + actions.setThumbnails(thumbnails); actions.setDownloadInfo(downloadInfo.url, downloadInfo.filename); - + // Add to file context await addFiles(processedFiles); - + markOperationApplied(fileId, operationId); } @@ -257,11 +254,11 @@ export const useToolOperation = ( status: state.status, errorMessage: state.errorMessage, progress: state.progress, - + // Actions executeOperation, resetResults, clearError: actions.clearError, cancelOperation }; -}; \ No newline at end of file +}; diff --git a/frontend/src/hooks/tools/split/useSplitOperation.ts b/frontend/src/hooks/tools/split/useSplitOperation.ts index 9994c23c7..4979d0ea0 100644 --- a/frontend/src/hooks/tools/split/useSplitOperation.ts +++ b/frontend/src/hooks/tools/split/useSplitOperation.ts @@ -9,7 +9,7 @@ import { SPLIT_MODES } from '../../../constants/splitConstants'; const buildFormData = (parameters: SplitParameters, selectedFiles: File[]): FormData => { const formData = new FormData(); - + selectedFiles.forEach(file => { formData.append("fileInput", file); }); @@ -59,28 +59,24 @@ const getEndpoint = (parameters: SplitParameters): string => { export const useSplitOperation = () => { const { t } = useTranslation(); - + return useToolOperation({ operationType: 'split', endpoint: (params) => getEndpoint(params), - buildFormData: buildFormData, // Multi-file signature: (params, selectedFiles) => FormData + buildFormData: buildFormData, // Multi-file signature: (params, selectedFiles) => FormData filePrefix: 'split_', multiFileEndpoint: true, // Single API call with all files - responseHandler: { - type: 'zip', - useZipExtractor: true - }, validateParams: (params) => { if (!params.mode) { return { valid: false, errors: [t('split.validation.modeRequired', 'Split mode is required')] }; } - + if (params.mode === SPLIT_MODES.BY_PAGES && !params.pages) { return { valid: false, errors: [t('split.validation.pagesRequired', 'Page numbers are required for split by pages')] }; } - + return { valid: true }; }, getErrorMessage: createStandardErrorHandler(t('split.error.failed', 'An error occurred while splitting the PDF.')) }); -}; \ No newline at end of file +}; diff --git a/frontend/src/utils/toolResponseProcessor.ts b/frontend/src/utils/toolResponseProcessor.ts index 8433d1e50..fe2f11242 100644 --- a/frontend/src/utils/toolResponseProcessor.ts +++ b/frontend/src/utils/toolResponseProcessor.ts @@ -1,45 +1,25 @@ // Note: This utility should be used with useToolResources for ZIP operations -export interface ResponseHandler { - type: 'single' | 'zip' | 'custom'; - processor?: (blob: Blob) => Promise; - useZipExtractor?: boolean; -} - -const defaultResponseHandler: ResponseHandler = { - type: 'single' -}; +export type ResponseHandler = (blob: Blob, originalFiles: File[]) => Promise | File[]; /** - * Processes API response blob based on handler configuration - * Note: For ZIP extraction, use useToolResources.extractZipFiles instead + * Processes a blob response into File(s). + * - If a tool-specific responseHandler is provided, it is used. + * - Otherwise, create a single file using the filePrefix + original name. */ -export const processResponse = async ( - blob: Blob, - originalFiles: File[], +export async function processResponse( + blob: Blob, + originalFiles: File[], filePrefix: string, responseHandler?: ResponseHandler -): Promise => { - const handler = responseHandler || defaultResponseHandler; - - switch (handler.type) { - case 'zip': - if (handler.useZipExtractor) { - // This path should be avoided - use useToolResources.extractZipFiles instead - throw new Error('ZIP extraction should use useToolResources.extractZipFiles'); - } - // Fall through to custom if no zip extractor - case 'custom': - if (handler.processor) { - return await handler.processor(blob); - } - // Fall through to single - case 'single': - default: - const contentType = blob.type || 'application/pdf'; - const filename = originalFiles.length === 1 - ? `${filePrefix}${originalFiles[0].name}` - : `${filePrefix}result.pdf`; - return [new File([blob], filename, { type: contentType })]; +): Promise { + if (responseHandler) { + const out = await responseHandler(blob, originalFiles); + return Array.isArray(out) ? out : [out as unknown as File]; } -}; \ No newline at end of file + + const original = originalFiles[0]?.name ?? 'result.pdf'; + const name = `${filePrefix}${original}`; + const type = blob.type || 'application/octet-stream'; + return [new File([blob], name, { type })]; +}