Mirror of https://github.com/Stirling-Tools/Stirling-PDF.git, synced 2025-08-22 04:09:22 +00:00

240 lines · 7.2 KiB · TypeScript

import { getDocument } from 'pdfjs-dist';
import { FileAnalysis, ProcessingStrategy } from '../types/processing';

export class FileAnalyzer {
  private static readonly SIZE_THRESHOLDS = {
    SMALL: 10 * 1024 * 1024,   // 10MB
    MEDIUM: 50 * 1024 * 1024,  // 50MB
    LARGE: 200 * 1024 * 1024,  // 200MB
  };

  private static readonly PAGE_THRESHOLDS = {
    FEW: 10,      // < 10 pages - immediate full processing
    MANY: 50,     // < 50 pages - priority pages
    MASSIVE: 100, // < 100 pages - progressive chunked
    // > 100 pages = metadata only
  };

  /**
   * Analyze a file to determine the optimal processing strategy
   */
  static async analyzeFile(file: File): Promise<FileAnalysis> {
    const analysis: FileAnalysis = {
      fileSize: file.size,
      isEncrypted: false,
      isCorrupted: false,
      recommendedStrategy: 'metadata_only',
      estimatedProcessingTime: 0,
    };

    try {
      // Quick validation and page count estimation
      const quickAnalysis = await this.quickPDFAnalysis(file);
      analysis.estimatedPageCount = quickAnalysis.pageCount;
      analysis.isEncrypted = quickAnalysis.isEncrypted;
      analysis.isCorrupted = quickAnalysis.isCorrupted;

      // Determine strategy based on file characteristics
      analysis.recommendedStrategy = this.determineStrategy(file.size, quickAnalysis.pageCount);

      // Estimate processing time
      analysis.estimatedProcessingTime = this.estimateProcessingTime(
        file.size,
        quickAnalysis.pageCount,
        analysis.recommendedStrategy
      );
    } catch (error) {
      console.error('File analysis failed:', error);
      analysis.isCorrupted = true;
      analysis.recommendedStrategy = 'metadata_only';
    }

    return analysis;
  }
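
  /*
   * Example (illustrative sketch, not part of the original file): a caller
   * can branch on the recommended strategy. `renderAllPages` and
   * `renderFirstPages` are hypothetical helpers, not Stirling-PDF APIs.
   *
   *   const analysis = await FileAnalyzer.analyzeFile(file);
   *   if (analysis.recommendedStrategy === 'immediate_full') {
   *     await renderAllPages(file);
   *   } else if (analysis.recommendedStrategy === 'priority_pages') {
   *     await renderFirstPages(file, 10);
   *   }
   */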

  /**
   * Quick PDF analysis without full processing
   */
  private static async quickPDFAnalysis(file: File): Promise<{
    pageCount: number;
    isEncrypted: boolean;
    isCorrupted: boolean;
  }> {
    try {
      // For small files, read the whole file.
      // For large files, try the whole file first as well
      // (PDF.js needs the complete structure).
      const arrayBuffer = await file.arrayBuffer();

      const pdf = await getDocument({
        data: arrayBuffer,
        stopAtErrors: false, // Don't stop at minor errors
        verbosity: 0         // Suppress PDF.js warnings
      }).promise;

      const pageCount = pdf.numPages;
      const isEncrypted = pdf.isEncrypted;

      // Clean up
      pdf.destroy();

      return {
        pageCount,
        isEncrypted,
        isCorrupted: false
      };
    } catch (error) {
      // Try to determine if it's corruption vs encryption
      const errorMessage = error instanceof Error ? error.message.toLowerCase() : '';
      const isEncrypted = errorMessage.includes('password') || errorMessage.includes('encrypted');

      return {
        pageCount: 0,
        isEncrypted,
        isCorrupted: !isEncrypted // If not encrypted, probably corrupted
      };
    }
  }
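
  /*
   * Note (assumption, not from the original file): PDF.js rejects
   * password-protected documents with an error whose `name` is
   * "PasswordException", so a check less fragile than message sniffing
   * might be:
   *
   *   const looksEncrypted =
   *     error instanceof Error && error.name === 'PasswordException';
   */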

  /**
   * Determine the best processing strategy based on file characteristics
   */
  private static determineStrategy(fileSize: number, pageCount?: number): ProcessingStrategy {
    // Handle corrupted or encrypted files
    if (!pageCount || pageCount === 0) {
      return 'metadata_only';
    }

    // Small files with few pages - process everything immediately
    if (fileSize <= this.SIZE_THRESHOLDS.SMALL && pageCount <= this.PAGE_THRESHOLDS.FEW) {
      return 'immediate_full';
    }

    // Medium files or many pages - priority pages first, then progressive
    if (fileSize <= this.SIZE_THRESHOLDS.MEDIUM && pageCount <= this.PAGE_THRESHOLDS.MANY) {
      return 'priority_pages';
    }

    // Large files or massive page counts - chunked processing
    if (fileSize <= this.SIZE_THRESHOLDS.LARGE && pageCount <= this.PAGE_THRESHOLDS.MASSIVE) {
      return 'progressive_chunked';
    }

    // Very large files - metadata only
    return 'metadata_only';
  }
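
  /*
   * Worked example (illustrative only): a 30MB, 40-page file fails the
   * SMALL/FEW check (30MB > 10MB) but passes MEDIUM/MANY
   * (30MB <= 50MB and 40 <= 50), so it resolves to 'priority_pages'.
   */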

  /**
   * Estimate processing time based on file characteristics and strategy
   */
  private static estimateProcessingTime(
    fileSize: number,
    pageCount: number = 0,
    strategy: ProcessingStrategy
  ): number {
    const baseTimes = {
      immediate_full: 200,      // 200ms per page
      priority_pages: 150,      // 150ms per page (optimized)
      progressive_chunked: 100, // 100ms per page (chunked)
      metadata_only: 50         // 50ms total
    };

    const baseTime = baseTimes[strategy];

    switch (strategy) {
      case 'metadata_only':
        return baseTime;

      case 'immediate_full':
        return pageCount * baseTime;

      case 'priority_pages': {
        // Estimate time for priority pages (first 10)
        const priorityPages = Math.min(pageCount, 10);
        return priorityPages * baseTime;
      }

      case 'progressive_chunked': {
        // Estimate time for first chunk (20 pages)
        const firstChunk = Math.min(pageCount, 20);
        return firstChunk * baseTime;
      }

      default:
        return pageCount * baseTime;
    }
  }
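
  /*
   * Worked example (illustrative only): the 30MB / 40-page file above maps
   * to 'priority_pages', so the estimate covers only the first
   * min(40, 10) = 10 pages at 150ms each, i.e. 1500ms.
   */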

  /**
   * Get processing recommendations for a set of files
   */
  static async analyzeMultipleFiles(files: File[]): Promise<{
    analyses: Map<File, FileAnalysis>;
    recommendations: {
      totalEstimatedTime: number;
      suggestedBatchSize: number;
      shouldUseWebWorker: boolean;
      memoryWarning: boolean;
    };
  }> {
    const analyses = new Map<File, FileAnalysis>();
    let totalEstimatedTime = 0;
    let totalSize = 0;
    let totalPages = 0;

    // Analyze each file
    for (const file of files) {
      const analysis = await this.analyzeFile(file);
      analyses.set(file, analysis);
      totalEstimatedTime += analysis.estimatedProcessingTime;
      totalSize += file.size;
      totalPages += analysis.estimatedPageCount || 0;
    }

    // Generate recommendations
    const recommendations = {
      totalEstimatedTime,
      suggestedBatchSize: this.calculateBatchSize(files.length, totalSize),
      shouldUseWebWorker: totalPages > 100 || totalSize > this.SIZE_THRESHOLDS.MEDIUM,
      memoryWarning: totalSize > this.SIZE_THRESHOLDS.LARGE || totalPages > this.PAGE_THRESHOLDS.MASSIVE
    };

    return { analyses, recommendations };
  }
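
  /*
   * Example (illustrative sketch, not part of the original file): batching
   * uploads using the recommendations. `processBatch` is a hypothetical
   * helper, not a Stirling-PDF API.
   *
   *   const { recommendations } = await FileAnalyzer.analyzeMultipleFiles(files);
   *   if (recommendations.memoryWarning) {
   *     console.warn('Large workload - expect heavy memory use');
   *   }
   *   for (let i = 0; i < files.length; i += recommendations.suggestedBatchSize) {
   *     await processBatch(files.slice(i, i + recommendations.suggestedBatchSize));
   *   }
   */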

  /**
   * Calculate optimal batch size for processing multiple files
   */
  private static calculateBatchSize(fileCount: number, totalSize: number): number {
    // Process small batches for large total sizes
    if (totalSize > this.SIZE_THRESHOLDS.LARGE) {
      return Math.max(1, Math.floor(fileCount / 4));
    }

    if (totalSize > this.SIZE_THRESHOLDS.MEDIUM) {
      return Math.max(2, Math.floor(fileCount / 2));
    }

    // Process all at once for smaller total sizes
    return fileCount;
  }
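
  /*
   * Worked example (illustrative only): 8 files totalling 300MB exceed the
   * LARGE threshold (200MB), so the batch size is max(1, floor(8 / 4)) = 2.
   */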

  /**
   * Check if a file appears to be a valid PDF
   */
  static async isValidPDF(file: File): Promise<boolean> {
    if (file.type !== 'application/pdf' && !file.name.toLowerCase().endsWith('.pdf')) {
      return false;
    }

    try {
      // Read first few bytes to check PDF header
      const header = file.slice(0, 8);
      const headerBytes = new Uint8Array(await header.arrayBuffer());
      const headerString = String.fromCharCode(...headerBytes);

      return headerString.startsWith('%PDF-');
    } catch (error) {
      return false;
    }
  }
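
  /*
   * Example (illustrative sketch, not part of the original file): validate
   * before analyzing.
   *
   *   if (await FileAnalyzer.isValidPDF(file)) {
   *     const analysis = await FileAnalyzer.analyzeFile(file);
   *   }
   */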
}