Clean up OCR and additional tweaks

2025-08-26 06:09:23 +00:00 · 2025-08-08 14:26:14 +01:00 · 2025-08-08 14:26:14 +01:00 · c942db0515
commit c942db0515
parent e1296f5ac9
5 changed files with 148 additions and 218 deletions
--- a/frontend/src/hooks/tools/ocr/useOCROperation.ts
+++ b/frontend/src/hooks/tools/ocr/useOCROperation.ts
@ -1,161 +1,117 @@
 import { useCallback } from 'react';
-import axios from 'axios';
 import { useTranslation } from 'react-i18next';
 import { OCRParameters } from '../../../components/tools/ocr/OCRSettings';
 import { useToolOperation, ToolOperationConfig } from '../shared/useToolOperation';
 import { createStandardErrorHandler } from '../../../utils/toolErrorHandler';
 import { useToolResources } from '../shared/useToolResources';

-const buildFormData = (parameters: OCRParameters, file: File): FormData => {
+// Helper: map a filename's extension (case-insensitive) to a MIME type; used to type the File objects built from ZIP entries.
+function getMimeType(filename: string): string {
+  const ext = filename.toLowerCase().split('.').pop(); // last '.'-separated segment, lower-cased (the whole name when there is no '.')
+  switch (ext) {
+    case 'pdf': return 'application/pdf';
+    case 'txt': return 'text/plain';
+    case 'zip': return 'application/zip';
+    default: return 'application/octet-stream'; // any other or missing extension
+  }
+}
+
+// Local ZIP extractor: returns every non-directory entry of zipBlob as a File typed via getMimeType. Used by responseHandler as a fallback when the shared extractZipFiles throws or returns nothing.
+async function extractZipFile(zipBlob: Blob): Promise<File[]> {
+  const JSZip = await import('jszip'); // dynamic import: jszip is only loaded when this function runs
+  const zip = new JSZip.default();
+  const zipContent = await zip.loadAsync(await zipBlob.arrayBuffer());
+  const out: File[] = [];
+  for (const [filename, file] of Object.entries(zipContent.files)) {
+    if (!file.dir) { // skip directory entries
+      const content = await file.async('blob');
+      out.push(new File([content], filename, { type: getMimeType(filename) })); // filename is the key in zip.files — presumably the full in-archive path; confirm
+    }
+  }
+  return out;
+}
+
+// Helper: strip the final extension (everything from the last '.' onward) from a file name.
+function stripExt(name: string): string {
+  const i = name.lastIndexOf('.');
+  return i > 0 ? name.slice(0, i) : name; // i > 0: names with no '.' or only a leading '.' are returned unchanged
+}
+
+// Signature must be (file, params): the per-file path in useToolOperation invokes buildFormData(file, params)
+const buildFormData = (file: File, parameters: OCRParameters): FormData => {
  const formData = new FormData();
-
-  // Add the file
  formData.append('fileInput', file);
-
-  // Add languages as multiple parameters with same name (like checkboxes)
-  parameters.languages.forEach(lang => {
-    formData.append('languages', lang);
-  });
-
-  // Add other parameters
+  parameters.languages.forEach((lang) => formData.append('languages', lang));
  formData.append('ocrType', parameters.ocrType);
  formData.append('ocrRenderType', parameters.ocrRenderType);
-  
-  // Handle additional options - convert array to individual boolean parameters
  formData.append('sidecar', parameters.additionalOptions.includes('sidecar').toString());
  formData.append('deskew', parameters.additionalOptions.includes('deskew').toString());
  formData.append('clean', parameters.additionalOptions.includes('clean').toString());
  formData.append('cleanFinal', parameters.additionalOptions.includes('cleanFinal').toString());
  formData.append('removeImagesAfter', parameters.additionalOptions.includes('removeImagesAfter').toString());
-
  return formData;
 };

 export const useOCROperation = () => {
  const { t } = useTranslation();
  const { extractZipFiles } = useToolResources();
-  
-  const customOCRProcessor = useCallback(async (
-    parameters: OCRParameters,
-    selectedFiles: File[]
-  ): Promise<File[]> => {
-    const processedFiles: File[] = [];
-    const failedFiles: string[] = [];

-    // OCR typically processes one file at a time
-    for (let i = 0; i < selectedFiles.length; i++) {
-      const file = selectedFiles[i];
+  // OCR-specific parsing: ZIP (sidecar) vs PDF vs HTML error
+  const responseHandler = useCallback(async (blob: Blob, originalFiles: File[]): Promise<File[]> => {
+    const headBuf = await blob.slice(0, 8).arrayBuffer();
+    const head = new TextDecoder().decode(new Uint8Array(headBuf));

+    // ZIP: sidecar or multi-asset output
+    if (head.startsWith('PK')) {
+      const base = stripExt(originalFiles[0].name);
      try {
-        const formData = buildFormData(parameters, file);
-        const response = await axios.post('/api/v1/misc/ocr-pdf', formData, { 
-          responseType: "blob"
-        });
+        const extracted = await extractZipFiles(blob);
+        if (extracted.length > 0) return extracted;
+      } catch { /* ignore and try local extractor */ }
+      try {
+        const local = await extractZipFile(blob); // local fallback
+        if (local.length > 0) return local;
+      } catch { /* fall through */ }
+      return [new File([blob], `ocr_${base}.zip`, { type: 'application/zip' })];
+    }

-        // Check for HTTP errors
-        if (response.status >= 400) {
-          const errorText = await response.data.text();
-          throw new Error(`OCR service HTTP error ${response.status}: ${errorText.substring(0, 300)}`);
+    // Not a PDF: surface error details if present
+    if (!head.startsWith('%PDF')) {
+      const textBuf = await blob.slice(0, 1024).arrayBuffer();
+      const text = new TextDecoder().decode(new Uint8Array(textBuf));
+      if (/error|exception|html/i.test(text)) {
+        if (text.includes('OCR tools') && text.includes('not installed')) {
+          throw new Error('OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.');
        }
-
-        // Validate response
-        if (!response.data || response.data.size === 0) {
-          throw new Error('Empty response from OCR service');
-        }
-
-        const contentType = response.headers['content-type'] || 'application/pdf';
-        
-        // Check if response is actually a PDF by examining the first few bytes
-        const arrayBuffer = await response.data.arrayBuffer();
-        const uint8Array = new Uint8Array(arrayBuffer);
-        const header = new TextDecoder().decode(uint8Array.slice(0, 4));
-        
-        // Check if it's a ZIP file (OCR service returns ZIP when sidecar is enabled or for multi-file results)
-        if (header.startsWith('PK')) {
-          try {
-            // Extract ZIP file contents using tool resources
-            const zipBlob = new Blob([arrayBuffer]);
-            const extractedFiles = await extractZipFiles(zipBlob);
-            
-            if (extractedFiles.length > 0) {
-              // Add extracted files to processed files
-              processedFiles.push(...extractedFiles);
-            } else {
-              // Fallback to treating as single ZIP file if extraction failed
-              const zipFile = new File([arrayBuffer], `ocr_${file.name}.zip`, { type: 'application/zip' });
-              processedFiles.push(zipFile);
-            }
-          } catch (extractError) {
-            // Fallback to treating as single ZIP file
-            const zipFile = new File([arrayBuffer], `ocr_${file.name}.zip`, { type: 'application/zip' });
-            processedFiles.push(zipFile);
-          }
-          continue; // Skip the PDF validation for ZIP files
-        }
-        
-        if (!header.startsWith('%PDF')) {
-          // Check if it's an error response
-          const text = new TextDecoder().decode(uint8Array.slice(0, 500));
-          
-          if (text.includes('error') || text.includes('Error') || text.includes('exception') || text.includes('html')) {
-            // Check for specific OCR tool unavailable error
-            if (text.includes('OCR tools') && text.includes('not installed')) {
-              throw new Error('OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.');
-            }
-            throw new Error(`OCR service error: ${text.substring(0, 300)}`);
-          }
-          
-          // Check if it's an HTML error page
-          if (text.includes('<html') || text.includes('<!DOCTYPE')) {
-            // Try to extract error message from HTML
-            const errorMatch = text.match(/<title[^>]*>([^<]+)<\/title>/i) || 
-                             text.match(/<h1[^>]*>([^<]+)<\/h1>/i) ||
-                             text.match(/<body[^>]*>([^<]+)<\/body>/i);
-            const errorMessage = errorMatch ? errorMatch[1].trim() : t('ocr.error.unknown', 'Unknown error');
-            throw new Error(`OCR service error: ${errorMessage}`);
-          }
-          
-          throw new Error(`Response is not a valid PDF file. Header: "${header}"`);
-        }
-
-        const blob = new Blob([arrayBuffer], { type: contentType });
-        const processedFile = new File([blob], `ocr_${file.name}`, { type: contentType });
-
-        processedFiles.push(processedFile);
-      } catch (fileError) {
-        const errorMessage = fileError instanceof Error ? fileError.message : t('ocr.error.unknown', 'Unknown error');
-        failedFiles.push(`${file.name} (${errorMessage})`);
+        const title =
+          text.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1] ||
+          text.match(/<h1[^>]*>([^<]+)<\/h1>/i)?.[1] ||
+          t('ocr.error.unknown', 'Unknown error');
+        throw new Error(`OCR service error: ${title}`);
      }
+      throw new Error(`Response is not a valid PDF. Header: "${head}"`);
    }

-    if (failedFiles.length > 0 && processedFiles.length === 0) {
-      throw new Error(`Failed to process OCR for all files: ${failedFiles.join(', ')}`);
-    }
-
-    return processedFiles;
-  }, [t]);
+    const base = stripExt(originalFiles[0].name);
+    return [new File([blob], `ocr_${base}.pdf`, { type: 'application/pdf' })];
+  }, [t, extractZipFiles]);

  const ocrConfig: ToolOperationConfig<OCRParameters> = {
    operationType: 'ocr',
-    endpoint: '/api/v1/misc/ocr-pdf', // Not used with customProcessor but required
-    buildFormData, // Not used with customProcessor but required
+    endpoint: '/api/v1/misc/ocr-pdf',
+    buildFormData,
    filePrefix: 'ocr_',
-    customProcessor: customOCRProcessor,
-    validateParams: (params) => {
-      if (params.languages.length === 0) {
-        return { valid: false, errors: [t('ocr.validation.languageRequired', 'Please select at least one language for OCR processing.')] };
-      }
-      return { valid: true };
-    },
-    getErrorMessage: (error) => {
-      // Handle OCR-specific error first
-      if (error.message?.includes('OCR tools') && error.message?.includes('not installed')) {
-        return 'OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.';
-      }
-      // Fall back to standard error handling
-      return createStandardErrorHandler(t('ocr.error.failed', 'OCR operation failed'))(error);
-    }
+    multiFileEndpoint: false, // Process files individually
+    responseHandler, // use shared flow
+    validateParams: (params) =>
+      params.languages.length === 0
+        ? { valid: false, errors: [t('ocr.validation.languageRequired', 'Please select at least one language for OCR processing.')] }
+        : { valid: true },
+    getErrorMessage: (error) =>
+      error.message?.includes('OCR tools') && error.message?.includes('not installed')
+        ? 'OCR tools (OCRmyPDF or Tesseract) are not installed on the server. Use the standard or fat Docker image instead of ultra-lite, or install OCR tools manually.'
+        : createStandardErrorHandler(t('ocr.error.failed', 'OCR operation failed'))(error),
  };

  return useToolOperation(ocrConfig);
-}; 
+};
--- a/frontend/src/hooks/tools/shared/useToolApiCalls.ts
+++ b/frontend/src/hooks/tools/shared/useToolApiCalls.ts
@ -29,7 +29,7 @@ export const useToolApiCalls = <TParams = void>() => {

    for (let i = 0; i < validFiles.length; i++) {
      const file = validFiles[i];
-      
+
      onProgress({ current: i + 1, total, currentFileName: file.name });
      onStatus(`Processing ${file.name} (${i + 1}/${total})`);

@ -38,12 +38,13 @@ export const useToolApiCalls = <TParams = void>() => {
        const endpoint = typeof config.endpoint === 'function' ? config.endpoint(params) : config.endpoint;
        const response = await axios.post(endpoint, formData, {
          responseType: 'blob',
-          cancelToken: cancelTokenRef.current.token
+          cancelToken: cancelTokenRef.current.token,
        });

+        // Forward to shared response processor (uses tool-specific responseHandler if provided)
        const responseFiles = await processResponse(
-          response.data, 
-          [file], 
+          response.data,
+          [file],
          config.filePrefix,
          config.responseHandler
        );
@ -82,4 +83,4 @@ export const useToolApiCalls = <TParams = void>() => {
    processFiles,
    cancelOperation,
  };
-};
+};
--- a/frontend/src/hooks/tools/shared/useToolOperation.ts
+++ b/frontend/src/hooks/tools/shared/useToolOperation.ts
@ -7,7 +7,7 @@ import { useToolApiCalls, type ApiCallsConfig } from './useToolApiCalls';
 import { useToolResources } from './useToolResources';
 import { extractErrorMessage } from '../../../utils/toolErrorHandler';
 import { createOperation } from '../../../utils/toolOperationTracker';
-import type { ResponseHandler } from '../../../utils/toolResponseProcessor';
+import { type ResponseHandler, processResponse } from '../../../utils/toolResponseProcessor';

 export interface ValidationResult {
  valid: boolean;
@ -19,54 +19,54 @@ export type { ProcessingProgress, ResponseHandler };

 /**
 * Configuration for tool operations defining processing behavior and API integration.
- * 
+ *
 * Supports three patterns:
 * 1. Single-file tools: multiFileEndpoint: false, processes files individually
- * 2. Multi-file tools: multiFileEndpoint: true, single API call with all files  
+ * 2. Multi-file tools: multiFileEndpoint: true, single API call with all files
 * 3. Complex tools: customProcessor handles all processing logic
 */
 export interface ToolOperationConfig<TParams = void> {
  /** Operation identifier for tracking and logging */
  operationType: string;
-  
-  /** 
+
+  /**
   * API endpoint for the operation. Can be static string or function for dynamic routing.
   * Not used when customProcessor is provided.
   */
  endpoint: string | ((params: TParams) => string);
-  
-  /** 
+
+  /**
   * Builds FormData for API request. Signature determines processing approach:
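   * - (params, file: File) => FormData: Single-file processing
   *   NOTE(review): the per-file path in useToolOperation now invokes this as (file, params) — confirm which order is intended and align this doc and the type below.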
-   * - (params, files: File[]) => FormData: Multi-file processing  
+   * - (params, files: File[]) => FormData: Multi-file processing
   * Not used when customProcessor is provided.
   */
  buildFormData: ((params: TParams, file: File) => FormData) | ((params: TParams, files: File[]) => FormData);
-  
+
  /** Prefix added to processed filenames (e.g., 'compressed_', 'split_') */
  filePrefix: string;
-  
-  /** 
+
+  /**
   * Whether this tool uses backends that accept MultipartFile[] arrays.
   * - true: Single API call with all files (backend uses MultipartFile[])
   * - false/undefined: Individual API calls per file (backend uses single MultipartFile)
   * Ignored when customProcessor is provided.
   */
  multiFileEndpoint?: boolean;
-  
+
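  /** Converts the response blob into output files: (blob, originalFiles) => File[]. If omitted, per-file calls wrap the blob as a single prefixed file and multi-file calls assume a ZIP response. */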
  responseHandler?: ResponseHandler;
-  
-  /** 
+
+  /**
   * Custom processing logic that completely bypasses standard file processing.
   * When provided, tool handles all API calls, response processing, and file creation.
   * Use for tools with complex routing logic or non-standard processing requirements.
   */
  customProcessor?: (params: TParams, files: File[]) => Promise<File[]>;
-  
+
  /** Validate parameters before execution. Return validation errors if invalid. */
  validateParams?: (params: TParams) => ValidationResult;
-  
+
  /** Extract user-friendly error messages from API errors */
  getErrorMessage?: (error: any) => string;
 }
@ -85,7 +85,7 @@ export interface ToolOperationHook<TParams = void> {
  status: string;
  errorMessage: string | null;
  progress: ProcessingProgress | null;
-  
+
  // Actions
  executeOperation: (params: TParams, selectedFiles: File[]) => Promise<void>;
  resetResults: () => void;
@ -99,12 +99,12 @@ export { createStandardErrorHandler } from '../../../utils/toolErrorHandler';
 /**
 * Shared hook for tool operations providing consistent error handling, progress tracking,
 * and FileContext integration. Eliminates boilerplate while maintaining flexibility.
- * 
+ *
 * Supports three tool patterns:
 * 1. Single-file tools: Set multiFileEndpoint: false, processes files individually
 * 2. Multi-file tools: Set multiFileEndpoint: true, single API call with all files
 * 3. Complex tools: Provide customProcessor for full control over processing logic
- * 
+ *
 * @param config - Tool operation configuration
 * @returns Hook interface with state and execution methods
 */
@ -113,7 +113,7 @@ export const useToolOperation = <TParams = void>(
 ): ToolOperationHook<TParams> => {
  const { t } = useTranslation();
  const { recordOperation, markOperationApplied, markOperationFailed, addFiles } = useFileContext();
-  
+
  // Composed hooks
  const { state, actions } = useToolState();
  const { processFiles, cancelOperation: cancelApiCalls } = useToolApiCalls<TParams>();
@ -155,7 +155,7 @@ export const useToolOperation = <TParams = void>(

    try {
      let processedFiles: File[];
-      
+
      if (config.customProcessor) {
        actions.setStatus('Processing files...');
        processedFiles = await config.customProcessor(params, validFiles);
@ -166,46 +166,43 @@ export const useToolOperation = <TParams = void>(
          actions.setStatus('Processing files...');
          const formData = (config.buildFormData as (params: TParams, files: File[]) => FormData)(params, validFiles);
          const endpoint = typeof config.endpoint === 'function' ? config.endpoint(params) : config.endpoint;
-          
+
          const response = await axios.post(endpoint, formData, { responseType: 'blob' });
-          
-          // Handle response based on responseHandler
-          if (config.responseHandler?.type === 'zip' && config.responseHandler?.useZipExtractor) {
-            // Use tool resources for ZIP extraction
+
+          // Multi-file responses are typically ZIP files that need extraction
+          if (config.responseHandler) {
+            // Use custom responseHandler for multi-file (handles ZIP extraction)
+            processedFiles = await config.responseHandler(response.data, validFiles);
+          } else {
+            // Default: assume ZIP response for multi-file endpoints
            processedFiles = await extractZipFiles(response.data);
            
            if (processedFiles.length === 0) {
              // Try the generic extraction as fallback
              processedFiles = await extractAllZipFiles(response.data);
            }
-          } else {
-            // Single file response
-            const filename = validFiles.length === 1 
-              ? `${config.filePrefix}${validFiles[0].name}`
-              : `${config.filePrefix}result.pdf`;
-            processedFiles = [new File([response.data], filename, { type: response.data.type })];
          }
        } else {
          // Individual file processing - separate API call per file
          const apiCallsConfig: ApiCallsConfig<TParams> = {
            endpoint: config.endpoint,
-            buildFormData: (file: File, params: TParams) => (config.buildFormData as (params: TParams, file: File) => FormData)(params, file),
+            buildFormData: (file: File, params: TParams) => (config.buildFormData as (file: File, params: TParams) => FormData)(file, params),
            filePrefix: config.filePrefix,
            responseHandler: config.responseHandler
          };
          processedFiles = await processFiles(
-            params, 
-            validFiles, 
+            params,
+            validFiles,
            apiCallsConfig,
            actions.setProgress,
            actions.setStatus
          );
        }
      }
-      
+
      if (processedFiles.length > 0) {
        actions.setFiles(processedFiles);
-        
+
        // Generate thumbnails and download URL concurrently
        actions.setGeneratingThumbnails(true);
        const [thumbnails, downloadInfo] = await Promise.all([
@ -213,13 +210,13 @@ export const useToolOperation = <TParams = void>(
          createDownloadInfo(processedFiles, config.operationType)
        ]);
        actions.setGeneratingThumbnails(false);
-        
+
        actions.setThumbnails(thumbnails);
        actions.setDownloadInfo(downloadInfo.url, downloadInfo.filename);
-        
+
        // Add to file context
        await addFiles(processedFiles);
-        
+
        markOperationApplied(fileId, operationId);
      }

@ -257,11 +254,11 @@ export const useToolOperation = <TParams = void>(
    status: state.status,
    errorMessage: state.errorMessage,
    progress: state.progress,
-    
+
    // Actions
    executeOperation,
    resetResults,
    clearError: actions.clearError,
    cancelOperation
  };
-};
+};
--- a/frontend/src/hooks/tools/split/useSplitOperation.ts
+++ b/frontend/src/hooks/tools/split/useSplitOperation.ts
@ -9,7 +9,7 @@ import { SPLIT_MODES } from '../../../constants/splitConstants';

 const buildFormData = (parameters: SplitParameters, selectedFiles: File[]): FormData => {
  const formData = new FormData();
-  
+
  selectedFiles.forEach(file => {
    formData.append("fileInput", file);
  });
@ -59,28 +59,24 @@ const getEndpoint = (parameters: SplitParameters): string => {

 export const useSplitOperation = () => {
  const { t } = useTranslation();
-  
+
  return useToolOperation<SplitParameters>({
    operationType: 'split',
    endpoint: (params) => getEndpoint(params),
-    buildFormData: buildFormData, // Multi-file signature: (params, selectedFiles) => FormData  
+    buildFormData: buildFormData, // Multi-file signature: (params, selectedFiles) => FormData
    filePrefix: 'split_',
    multiFileEndpoint: true, // Single API call with all files
-    responseHandler: {
-      type: 'zip',
-      useZipExtractor: true
-    },
    validateParams: (params) => {
      if (!params.mode) {
        return { valid: false, errors: [t('split.validation.modeRequired', 'Split mode is required')] };
      }
-      
+
      if (params.mode === SPLIT_MODES.BY_PAGES && !params.pages) {
        return { valid: false, errors: [t('split.validation.pagesRequired', 'Page numbers are required for split by pages')] };
      }
-      
+
      return { valid: true };
    },
    getErrorMessage: createStandardErrorHandler(t('split.error.failed', 'An error occurred while splitting the PDF.'))
  });
-};
+};
--- a/frontend/src/utils/toolResponseProcessor.ts
+++ b/frontend/src/utils/toolResponseProcessor.ts
@ -1,45 +1,25 @@
 // Note: This utility should be used with useToolResources for ZIP operations

-export interface ResponseHandler {
-  type: 'single' | 'zip' | 'custom';
-  processor?: (blob: Blob) => Promise<File[]>;
-  useZipExtractor?: boolean;
-}
-
-const defaultResponseHandler: ResponseHandler = {
-  type: 'single'
-};
+export type ResponseHandler = (blob: Blob, originalFiles: File[]) => Promise<File[]> | File[];

 /**
- * Processes API response blob based on handler configuration
- * Note: For ZIP extraction, use useToolResources.extractZipFiles instead
+ * Processes a blob response into File(s).
+ * - If a tool-specific responseHandler is provided, it is used.
+ * - Otherwise, create a single file using the filePrefix + original name.
 */
-export const processResponse = async (
-  blob: Blob, 
-  originalFiles: File[], 
+export async function processResponse(
+  blob: Blob,
+  originalFiles: File[],
  filePrefix: string,
  responseHandler?: ResponseHandler
-): Promise<File[]> => {
-  const handler = responseHandler || defaultResponseHandler;
-  
-  switch (handler.type) {
-    case 'zip':
-      if (handler.useZipExtractor) {
-        // This path should be avoided - use useToolResources.extractZipFiles instead
-        throw new Error('ZIP extraction should use useToolResources.extractZipFiles');
-      }
-      // Fall through to custom if no zip extractor
-    case 'custom':
-      if (handler.processor) {
-        return await handler.processor(blob);
-      }
-      // Fall through to single
-    case 'single':
-    default:
-      const contentType = blob.type || 'application/pdf';
-      const filename = originalFiles.length === 1 
-        ? `${filePrefix}${originalFiles[0].name}`
-        : `${filePrefix}result.pdf`;
-      return [new File([blob], filename, { type: contentType })];
+): Promise<File[]> {
+  if (responseHandler) {
+    const out = await responseHandler(blob, originalFiles);
+    return Array.isArray(out) ? out : [out as unknown as File];
  }
-};
+
+  const original = originalFiles[0]?.name ?? 'result.pdf';
+  const name = `${filePrefix}${original}`;
+  const type = blob.type || 'application/octet-stream';
+  return [new File([blob], name, { type })];
+}