# Description of Changes

- File context for managing files between tools and views
- Optimisation for large files
- Updated Split to work with the new file system and match Matt's stepped design more closely

Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
import { getDocument } from 'pdfjs-dist';

import { FileAnalysis, ProcessingStrategy } from '../types/processing';
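
// FileAnalysis and ProcessingStrategy come from '../types/processing'. Based on how
// they are used in this file, their shape is presumably roughly the following sketch
// (an assumption for the reader, not the actual definitions):
//
//   type ProcessingStrategy =
//     | 'immediate_full'
//     | 'priority_pages'
//     | 'progressive_chunked'
//     | 'metadata_only';
//
//   interface FileAnalysis {
//     fileSize: number;
//     estimatedPageCount?: number;
//     isEncrypted: boolean;
//     isCorrupted: boolean;
//     recommendedStrategy: ProcessingStrategy;
//     estimatedProcessingTime: number; // milliseconds
//   }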

export class FileAnalyzer {
  private static readonly SIZE_THRESHOLDS = {
    SMALL: 10 * 1024 * 1024, // 10MB
    MEDIUM: 50 * 1024 * 1024, // 50MB
    LARGE: 200 * 1024 * 1024, // 200MB
  };

  private static readonly PAGE_THRESHOLDS = {
    FEW: 10, // < 10 pages - immediate full processing
    MANY: 50, // < 50 pages - priority pages
    MASSIVE: 100, // < 100 pages - progressive chunked
    // > 100 pages = metadata only
  };

  /**
   * Analyze a file to determine optimal processing strategy
   */
  static async analyzeFile(file: File): Promise<FileAnalysis> {
    const analysis: FileAnalysis = {
      fileSize: file.size,
      isEncrypted: false,
      isCorrupted: false,
      recommendedStrategy: 'metadata_only',
      estimatedProcessingTime: 0,
    };

    try {
      // Quick validation and page count estimation
      const quickAnalysis = await this.quickPDFAnalysis(file);
      analysis.estimatedPageCount = quickAnalysis.pageCount;
      analysis.isEncrypted = quickAnalysis.isEncrypted;
      analysis.isCorrupted = quickAnalysis.isCorrupted;

      // Determine strategy based on file characteristics
      analysis.recommendedStrategy = this.determineStrategy(file.size, quickAnalysis.pageCount);

      // Estimate processing time
      analysis.estimatedProcessingTime = this.estimateProcessingTime(
        file.size,
        quickAnalysis.pageCount,
        analysis.recommendedStrategy
      );
    } catch (error) {
      console.error('File analysis failed:', error);
      analysis.isCorrupted = true;
      analysis.recommendedStrategy = 'metadata_only';
    }

    return analysis;
  }

  /**
   * Quick PDF analysis without full processing
   */
  private static async quickPDFAnalysis(file: File): Promise<{
    pageCount: number;
    isEncrypted: boolean;
    isCorrupted: boolean;
  }> {
    try {
      // For small files, read the whole file
      // For large files, try the whole file first (PDF.js needs the complete structure)
      const arrayBuffer = await file.arrayBuffer();

      const pdf = await getDocument({
        data: arrayBuffer,
        stopAtErrors: false, // Don't stop at minor errors
        verbosity: 0 // Suppress PDF.js warnings
      }).promise;

      const pageCount = pdf.numPages;
      const isEncrypted = pdf.isEncrypted;

      // Clean up
      pdf.destroy();

      return {
        pageCount,
        isEncrypted,
        isCorrupted: false
      };
    } catch (error) {
      // Try to determine if it's corruption vs encryption
      const errorMessage = error instanceof Error ? error.message.toLowerCase() : '';
      const isEncrypted = errorMessage.includes('password') || errorMessage.includes('encrypted');

      return {
        pageCount: 0,
        isEncrypted,
        isCorrupted: !isEncrypted // If not encrypted, probably corrupted
      };
    }
  }

  /**
   * Determine the best processing strategy based on file characteristics
   */
  private static determineStrategy(fileSize: number, pageCount?: number): ProcessingStrategy {
    // Handle corrupted or encrypted files
    if (!pageCount || pageCount === 0) {
      return 'metadata_only';
    }

    // Small files with few pages - process everything immediately
    if (fileSize <= this.SIZE_THRESHOLDS.SMALL && pageCount <= this.PAGE_THRESHOLDS.FEW) {
      return 'immediate_full';
    }

    // Medium files or many pages - priority pages first, then progressive
    if (fileSize <= this.SIZE_THRESHOLDS.MEDIUM && pageCount <= this.PAGE_THRESHOLDS.MANY) {
      return 'priority_pages';
    }

    // Large files or massive page counts - chunked processing
    if (fileSize <= this.SIZE_THRESHOLDS.LARGE && pageCount <= this.PAGE_THRESHOLDS.MASSIVE) {
      return 'progressive_chunked';
    }

    // Very large files - metadata only
    return 'metadata_only';
  }
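
  // Worked example (from the thresholds above): a 30MB, 40-page file fails the
  // SMALL/FEW check (30MB > 10MB) but passes MEDIUM/MANY (30MB <= 50MB, 40 <= 50),
  // so it is assigned 'priority_pages'.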

  /**
   * Estimate processing time based on file characteristics and strategy
   */
  private static estimateProcessingTime(
    fileSize: number,
    pageCount: number = 0,
    strategy: ProcessingStrategy
  ): number {
    const baseTimes = {
      immediate_full: 200, // 200ms per page
      priority_pages: 150, // 150ms per page (optimized)
      progressive_chunked: 100, // 100ms per page (chunked)
      metadata_only: 50 // 50ms total
    };

    const baseTime = baseTimes[strategy];

    switch (strategy) {
      case 'metadata_only':
        return baseTime;

      case 'immediate_full':
        return pageCount * baseTime;

      case 'priority_pages': {
        // Estimate time for priority pages (first 10)
        const priorityPages = Math.min(pageCount, 10);
        return priorityPages * baseTime;
      }

      case 'progressive_chunked': {
        // Estimate time for first chunk (20 pages)
        const firstChunk = Math.min(pageCount, 20);
        return firstChunk * baseTime;
      }

      default:
        return pageCount * baseTime;
    }
  }
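
  // Worked example (using the per-page figures above): a 30-page file under
  // 'priority_pages' estimates min(30, 10) * 150ms = 1500ms, whereas
  // 'immediate_full' for the same file would be 30 * 200ms = 6000ms.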

  /**
   * Get processing recommendations for a set of files
   */
  static async analyzeMultipleFiles(files: File[]): Promise<{
    analyses: Map<File, FileAnalysis>;
    recommendations: {
      totalEstimatedTime: number;
      suggestedBatchSize: number;
      shouldUseWebWorker: boolean;
      memoryWarning: boolean;
    };
  }> {
    const analyses = new Map<File, FileAnalysis>();
    let totalEstimatedTime = 0;
    let totalSize = 0;
    let totalPages = 0;

    // Analyze each file
    for (const file of files) {
      const analysis = await this.analyzeFile(file);
      analyses.set(file, analysis);
      totalEstimatedTime += analysis.estimatedProcessingTime;
      totalSize += file.size;
      totalPages += analysis.estimatedPageCount || 0;
    }

    // Generate recommendations
    const recommendations = {
      totalEstimatedTime,
      suggestedBatchSize: this.calculateBatchSize(files.length, totalSize),
      shouldUseWebWorker: totalPages > 100 || totalSize > this.SIZE_THRESHOLDS.MEDIUM,
      memoryWarning: totalSize > this.SIZE_THRESHOLDS.LARGE || totalPages > this.PAGE_THRESHOLDS.MASSIVE
    };

    return { analyses, recommendations };
  }

  /**
   * Calculate optimal batch size for processing multiple files
   */
  private static calculateBatchSize(fileCount: number, totalSize: number): number {
    // Process small batches for large total sizes
    if (totalSize > this.SIZE_THRESHOLDS.LARGE) {
      return Math.max(1, Math.floor(fileCount / 4));
    }

    if (totalSize > this.SIZE_THRESHOLDS.MEDIUM) {
      return Math.max(2, Math.floor(fileCount / 2));
    }

    // Process all at once for smaller total sizes
    return fileCount;
  }
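
  // Worked example: 8 files totalling 300MB exceed LARGE, so the batch size is
  // max(1, floor(8 / 4)) = 2; the same 8 files at 80MB total fall in the MEDIUM
  // range and get max(2, floor(8 / 2)) = 4.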

  /**
   * Check if a file appears to be a valid PDF
   */
  static async isValidPDF(file: File): Promise<boolean> {
    if (file.type !== 'application/pdf' && !file.name.toLowerCase().endsWith('.pdf')) {
      return false;
    }

    try {
      // Read first few bytes to check PDF header
      const header = file.slice(0, 8);
      const headerBytes = new Uint8Array(await header.arrayBuffer());
      const headerString = String.fromCharCode(...headerBytes);

      return headerString.startsWith('%PDF-');
    } catch (error) {
      return false;
    }
  }
}
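
// Usage sketch: one way a caller might drive FileAnalyzer. The handleUpload name and
// the per-strategy branches are illustrative assumptions; only the FileAnalyzer methods
// and the strategy names used above are real.
//
//   async function handleUpload(file: File): Promise<void> {
//     if (!(await FileAnalyzer.isValidPDF(file))) {
//       throw new Error('Not a valid PDF');
//     }
//
//     const analysis = await FileAnalyzer.analyzeFile(file);
//     switch (analysis.recommendedStrategy) {
//       case 'immediate_full':
//         // Small file, few pages: render everything up front.
//         break;
//       case 'priority_pages':
//       case 'progressive_chunked':
//         // Render the first pages now, defer or chunk the rest.
//         break;
//       case 'metadata_only':
//         // Very large, encrypted, or corrupted: show metadata only.
//         break;
//     }
//   }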