import { getDocument } from 'pdfjs-dist';
import { FileAnalysis, ProcessingStrategy } from '../types/processing';

/**
 * Static helpers that inspect PDF files up front and recommend how they
 * should be processed (everything at once, priority pages first, in chunks,
 * or metadata only), together with a rough processing-time estimate.
 */
export class FileAnalyzer {
  /** File-size limits, in bytes, used when choosing a strategy. */
  private static readonly SIZE_THRESHOLDS = {
    SMALL: 10 * 1024 * 1024, // 10MB
    MEDIUM: 50 * 1024 * 1024, // 50MB
    LARGE: 200 * 1024 * 1024, // 200MB
  };

  /** Page-count limits (inclusive) used when choosing a strategy. */
  private static readonly PAGE_THRESHOLDS = {
    FEW: 10, // up to 10 pages - immediate full processing
    MANY: 50, // up to 50 pages - priority pages
    MASSIVE: 100, // up to 100 pages - progressive chunked
    // more than 100 pages - metadata only
  };

  /**
   * Analyze a file to determine the optimal processing strategy.
   *
   * Never rejects: if anything goes wrong the returned analysis is flagged as
   * corrupted and falls back to the `metadata_only` strategy.
   *
   * @param file - The file to inspect.
   * @returns The analysis, including the recommended strategy and the
   *   estimated processing time in milliseconds.
   */
  static async analyzeFile(file: File): Promise<FileAnalysis> {
    const analysis: FileAnalysis = {
      fileSize: file.size,
      isEncrypted: false,
      isCorrupted: false,
      recommendedStrategy: 'metadata_only',
      estimatedProcessingTime: 0,
    };

    try {
      // Quick validation and page count estimation
      const quickAnalysis = await this.quickPDFAnalysis(file);
      analysis.estimatedPageCount = quickAnalysis.pageCount;
      analysis.isEncrypted = quickAnalysis.isEncrypted;
      analysis.isCorrupted = quickAnalysis.isCorrupted;

      // Determine strategy based on file characteristics
      analysis.recommendedStrategy = this.determineStrategy(file.size, quickAnalysis.pageCount);

      // Estimate processing time
      analysis.estimatedProcessingTime = this.estimateProcessingTime(
        file.size,
        quickAnalysis.pageCount,
        analysis.recommendedStrategy
      );
    } catch (error) {
      console.error('File analysis failed:', error);
      analysis.isCorrupted = true;
      analysis.recommendedStrategy = 'metadata_only';
    }

    return analysis;
  }

  /**
   * Quick PDF analysis without full processing.
   *
   * Opens the document with PDF.js only to read its page count and
   * encryption flag, then releases the PDF.js resources again. Load failures
   * are reported through the result flags rather than thrown.
   *
   * @param file - The file to open.
   * @returns The page count (0 when the document could not be opened) plus
   *   flags telling whether the file looks encrypted or corrupted.
   */
  private static async quickPDFAnalysis(file: File): Promise<{
    pageCount: number;
    isEncrypted: boolean;
    isCorrupted: boolean;
  }> {
    try {
      // The whole file is read regardless of size: PDF.js needs the complete
      // structure to open the document.
      const arrayBuffer = await file.arrayBuffer();

      const loadingTask = getDocument({
        data: arrayBuffer,
        stopAtErrors: false, // Don't stop at minor errors
        verbosity: 0, // Suppress PDF.js warnings
      });

      try {
        const pdf = await loadingTask.promise;

        return {
          pageCount: pdf.numPages,
          // NOTE(review): confirm that the installed pdfjs-dist version exposes
          // `isEncrypted` on the document proxy.
          isEncrypted: pdf.isEncrypted,
          isCorrupted: false,
        };
      } finally {
        // Release the loading task (and the document it produced) on success
        // and on failure alike. Cleanup is best-effort: a teardown error must
        // not turn a readable file into a "corrupted" result.
        await loadingTask.destroy().catch(() => undefined);
      }
    } catch (error) {
      // Try to determine if it's corruption vs encryption
      const errorMessage = error instanceof Error ? error.message.toLowerCase() : '';
      const isEncrypted = errorMessage.includes('password') || errorMessage.includes('encrypted');

      return {
        pageCount: 0,
        isEncrypted,
        isCorrupted: !isEncrypted, // If not encrypted, probably corrupted
      };
    }
  }

  /**
   * Determine the best processing strategy based on file characteristics.
   *
   * A tier is selected only when BOTH the size and the page count are within
   * that tier's limits; otherwise the next, more conservative tier is tried.
   *
   * @param fileSize - File size in bytes.
   * @param pageCount - Number of pages; missing or 0 means the document could
   *   not be opened.
   * @returns The recommended strategy.
   */
  private static determineStrategy(fileSize: number, pageCount?: number): ProcessingStrategy {
    // Handle corrupted or encrypted files (no usable page count)
    if (!pageCount) {
      return 'metadata_only';
    }

    // Small files with few pages - process everything immediately
    if (fileSize <= this.SIZE_THRESHOLDS.SMALL && pageCount <= this.PAGE_THRESHOLDS.FEW) {
      return 'immediate_full';
    }

    // Medium files with a moderate page count - priority pages first
    if (fileSize <= this.SIZE_THRESHOLDS.MEDIUM && pageCount <= this.PAGE_THRESHOLDS.MANY) {
      return 'priority_pages';
    }

    // Large files with many pages - chunked processing
    if (fileSize <= this.SIZE_THRESHOLDS.LARGE && pageCount <= this.PAGE_THRESHOLDS.MASSIVE) {
      return 'progressive_chunked';
    }

    // Very large files or page counts - metadata only
    return 'metadata_only';
  }

  /**
   * Estimate processing time based on file characteristics and strategy.
   *
   * @param fileSize - File size in bytes (currently not used by the estimate).
   * @param pageCount - Number of pages in the document.
   * @param strategy - The strategy the estimate is for.
   * @returns Estimated time in milliseconds for the initial unit of work of
   *   the given strategy.
   */
  private static estimateProcessingTime(
    fileSize: number,
    pageCount: number = 0,
    strategy: ProcessingStrategy
  ): number {
    const baseTimes = {
      immediate_full: 200, // 200ms per page
      priority_pages: 150, // 150ms per page (optimized)
      progressive_chunked: 100, // 100ms per page (chunked)
      metadata_only: 50, // 50ms total
    };

    const baseTime = baseTimes[strategy];

    switch (strategy) {
      case 'metadata_only':
        return baseTime;

      case 'immediate_full':
        return pageCount * baseTime;

      case 'priority_pages': {
        // Estimate time for priority pages (first 10)
        const priorityPages = Math.min(pageCount, 10);
        return priorityPages * baseTime;
      }

      case 'progressive_chunked': {
        // Estimate time for first chunk (20 pages)
        const firstChunk = Math.min(pageCount, 20);
        return firstChunk * baseTime;
      }

      default:
        return pageCount * baseTime;
    }
  }

  /**
   * Get processing recommendations for a set of files.
   *
   * Files are analyzed one after another, so at most one file is being read
   * and opened at any time.
   *
   * @param files - The files to analyze.
   * @returns The per-file analyses keyed by `File`, plus aggregate
   *   recommendations for processing the whole set.
   */
  static async analyzeMultipleFiles(files: File[]): Promise<{
    analyses: Map<File, FileAnalysis>;
    recommendations: {
      totalEstimatedTime: number;
      suggestedBatchSize: number;
      shouldUseWebWorker: boolean;
      memoryWarning: boolean;
    };
  }> {
    const analyses = new Map<File, FileAnalysis>();
    let totalEstimatedTime = 0;
    let totalSize = 0;
    let totalPages = 0;

    // Analyze each file
    for (const file of files) {
      const analysis = await this.analyzeFile(file);
      analyses.set(file, analysis);

      totalEstimatedTime += analysis.estimatedProcessingTime;
      totalSize += file.size;
      totalPages += analysis.estimatedPageCount || 0;
    }

    // Generate recommendations
    const recommendations = {
      totalEstimatedTime,
      suggestedBatchSize: this.calculateBatchSize(files.length, totalSize),
      shouldUseWebWorker: totalPages > 100 || totalSize > this.SIZE_THRESHOLDS.MEDIUM,
      memoryWarning:
        totalSize > this.SIZE_THRESHOLDS.LARGE || totalPages > this.PAGE_THRESHOLDS.MASSIVE,
    };

    return { analyses, recommendations };
  }

  /**
   * Calculate the batch size for processing multiple files.
   *
   * @param fileCount - Number of files in the set.
   * @param totalSize - Combined size of all files in bytes.
   * @returns The suggested number of files per batch; never more than
   *   `fileCount`.
   */
  private static calculateBatchSize(fileCount: number, totalSize: number): number {
    // Process small batches for large total sizes
    if (totalSize > this.SIZE_THRESHOLDS.LARGE) {
      return Math.min(fileCount, Math.max(1, Math.floor(fileCount / 4)));
    }

    if (totalSize > this.SIZE_THRESHOLDS.MEDIUM) {
      // The minimum of 2 must not exceed the number of files actually present
      // (e.g. a single file larger than the MEDIUM threshold).
      return Math.min(fileCount, Math.max(2, Math.floor(fileCount / 2)));
    }

    // Process all at once for smaller total sizes
    return fileCount;
  }

  /**
   * Check if a file appears to be a valid PDF.
   *
   * The file must either declare the `application/pdf` MIME type or have a
   * `.pdf` extension, and its content must start with the `%PDF-` signature.
   *
   * @param file - The file to check.
   * @returns `true` when the file looks like a PDF; `false` otherwise,
   *   including when the header cannot be read.
   */
  static async isValidPDF(file: File): Promise<boolean> {
    if (file.type !== 'application/pdf' && !file.name.toLowerCase().endsWith('.pdf')) {
      return false;
    }

    try {
      // Read first few bytes to check PDF header
      const header = file.slice(0, 8);
      const headerBytes = new Uint8Array(await header.arrayBuffer());
      const headerString = String.fromCharCode(...headerBytes);

      return headerString.startsWith('%PDF-');
    } catch {
      return false;
    }
  }
}