mirror of
https://github.com/Stirling-Tools/Stirling-PDF.git
synced 2025-08-27 06:39:24 +00:00

# Description of Changes A new universal file context rather than the splintered ones for the main views, tools and manager we had before (manager still has its own but it's better integrated with the core context) File context has been split into a handful of different files managing various file-related issues separately to reduce the monolith - FileReducer.ts - State management fileActions.ts - File operations fileSelectors.ts - Data access patterns lifecycle.ts - Resource cleanup and memory management fileHooks.ts - React hooks interface contexts.ts - Context providers Improved thumbnail generation Improved IndexedDB handling Stopped handling files as blobs where not necessary to improve performance A new library handling drag and drop https://github.com/atlassian/pragmatic-drag-and-drop (Out of scope yes but I broke the old one with the new filecontext and it needed doing so it was a might as well) A new library handling virtualisation on page editor @tanstack/react-virtual, as above.
Quickly ripped out the last remnants of the old URL params stuff and replaced with the beginnings of what will later become the new URL navigation system (for now it just restores the tool name in URL behavior) Fixed selected file not registered when opening a tool Fixed PNG thumbnails Closes #(issue_number) --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: Reece Browne <you@example.com>
240 lines
7.2 KiB
TypeScript
240 lines
7.2 KiB
TypeScript
import { FileAnalysis, ProcessingStrategy } from '../types/processing';
|
|
import { pdfWorkerManager } from './pdfWorkerManager';
|
|
|
|
/**
 * Static helpers that inspect PDF files up front and recommend how much work
 * the rest of the pipeline should do for them (full processing, a prioritised
 * subset of pages, chunked processing, or metadata only).
 */
export class FileAnalyzer {
  /** Inclusive upper bounds, in bytes, for each file-size tier. */
  private static readonly SIZE_THRESHOLDS = {
    SMALL: 10 * 1024 * 1024, // 10MB
    MEDIUM: 50 * 1024 * 1024, // 50MB
    LARGE: 200 * 1024 * 1024, // 200MB
  };

  /** Inclusive upper bounds, in pages, for each page-count tier. */
  private static readonly PAGE_THRESHOLDS = {
    FEW: 10, // up to 10 pages - immediate full processing
    MANY: 50, // up to 50 pages - priority pages
    MASSIVE: 100, // up to 100 pages - progressive chunked
    // more than 100 pages = metadata only
  };

  /**
   * Analyze a file to determine optimal processing strategy.
   *
   * Never rejects: any failure is logged and reported as a corrupted file
   * with the `metadata_only` strategy.
   *
   * @param file - The file to inspect.
   * @returns The analysis, including page count, encryption/corruption flags,
   *   the recommended strategy and a processing-time estimate in milliseconds.
   */
  static async analyzeFile(file: File): Promise<FileAnalysis> {
    // Conservative defaults; overwritten below when the quick analysis succeeds.
    const analysis: FileAnalysis = {
      fileSize: file.size,
      isEncrypted: false,
      isCorrupted: false,
      recommendedStrategy: 'metadata_only',
      estimatedProcessingTime: 0,
    };

    try {
      // Quick validation and page count estimation
      const quickAnalysis = await this.quickPDFAnalysis(file);
      analysis.estimatedPageCount = quickAnalysis.pageCount;
      analysis.isEncrypted = quickAnalysis.isEncrypted;
      analysis.isCorrupted = quickAnalysis.isCorrupted;

      // Determine strategy based on file characteristics
      analysis.recommendedStrategy = this.determineStrategy(file.size, quickAnalysis.pageCount);

      // Estimate processing time
      analysis.estimatedProcessingTime = this.estimateProcessingTime(
        file.size,
        quickAnalysis.pageCount,
        analysis.recommendedStrategy
      );
    } catch (error) {
      console.error('File analysis failed:', error);
      analysis.isCorrupted = true;
      analysis.recommendedStrategy = 'metadata_only';
    }

    return analysis;
  }

  /**
   * Quick PDF analysis without full processing.
   *
   * Reads the whole file into memory and opens it through the shared worker
   * manager to obtain the page count. Never rejects: a failure to open the
   * document is classified as "encrypted" when the error message mentions a
   * password or encryption, and as "corrupted" otherwise.
   *
   * @param file - The file to open.
   * @returns Page count (0 on failure) plus encryption and corruption flags.
   */
  private static async quickPDFAnalysis(file: File): Promise<{
    pageCount: number;
    isEncrypted: boolean;
    isCorrupted: boolean;
  }> {
    try {
      // The whole file is read regardless of size (PDF.js needs the complete structure)
      const arrayBuffer = await file.arrayBuffer();

      const pdf = await pdfWorkerManager.createDocument(arrayBuffer, {
        stopAtErrors: false, // Don't stop at minor errors
        verbosity: 0 // Suppress PDF.js warnings
      });

      try {
        return {
          pageCount: pdf.numPages,
          // `isEncrypted` is read through an `any` cast, so it may well be
          // undefined at runtime; coerce it so the declared boolean contract holds.
          // eslint-disable-next-line @typescript-eslint/no-explicit-any
          isEncrypted: Boolean((pdf as any).isEncrypted),
          isCorrupted: false,
        };
      } finally {
        // Always release the document, even if reading its properties threw.
        pdfWorkerManager.destroyDocument(pdf);
      }
    } catch (error) {
      // Try to determine if it's corruption vs encryption
      const errorMessage = error instanceof Error ? error.message.toLowerCase() : '';
      const isEncrypted = errorMessage.includes('password') || errorMessage.includes('encrypted');

      return {
        pageCount: 0,
        isEncrypted,
        isCorrupted: !isEncrypted, // If not encrypted, probably corrupted
      };
    }
  }

  /**
   * Determine the best processing strategy based on file characteristics.
   *
   * Tiers are checked from smallest to largest; a file must satisfy BOTH the
   * size and the page bound of a tier (bounds are inclusive) to qualify.
   *
   * @param fileSize - File size in bytes.
   * @param pageCount - Number of pages; missing or 0 means the document could
   *   not be opened.
   * @returns The recommended strategy.
   */
  private static determineStrategy(fileSize: number, pageCount?: number): ProcessingStrategy {
    // Handle corrupted or encrypted files (no usable page count)
    if (!pageCount) {
      return 'metadata_only';
    }

    const { SMALL, MEDIUM, LARGE } = this.SIZE_THRESHOLDS;
    const { FEW, MANY, MASSIVE } = this.PAGE_THRESHOLDS;

    // Small files with few pages - process everything immediately
    if (fileSize <= SMALL && pageCount <= FEW) {
      return 'immediate_full';
    }

    // Medium files and page counts - priority pages first, then progressive
    if (fileSize <= MEDIUM && pageCount <= MANY) {
      return 'priority_pages';
    }

    // Large files and page counts - chunked processing
    if (fileSize <= LARGE && pageCount <= MASSIVE) {
      return 'progressive_chunked';
    }

    // Anything bigger - metadata only
    return 'metadata_only';
  }

  /**
   * Estimate processing time based on file characteristics and strategy.
   *
   * Only the work done up front is counted: every page for `immediate_full`,
   * the first 10 pages for `priority_pages`, the first 20-page chunk for
   * `progressive_chunked`, and a flat cost for `metadata_only`.
   *
   * @param fileSize - File size in bytes (currently not used by the estimate).
   * @param pageCount - Number of pages in the document.
   * @param strategy - The strategy the estimate is for.
   * @returns Estimated time in milliseconds.
   */
  private static estimateProcessingTime(
    fileSize: number,
    pageCount: number = 0,
    strategy: ProcessingStrategy
  ): number {
    const baseTimes = {
      immediate_full: 200, // 200ms per page
      priority_pages: 150, // 150ms per page (optimized)
      progressive_chunked: 100, // 100ms per page (chunked)
      metadata_only: 50 // 50ms total
    };

    const baseTime = baseTimes[strategy];

    switch (strategy) {
      case 'metadata_only':
        return baseTime;

      case 'immediate_full':
        return pageCount * baseTime;

      case 'priority_pages': {
        // Estimate time for priority pages (first 10)
        const priorityPages = Math.min(pageCount, 10);
        return priorityPages * baseTime;
      }

      case 'progressive_chunked': {
        // Estimate time for first chunk (20 pages)
        const firstChunk = Math.min(pageCount, 20);
        return firstChunk * baseTime;
      }

      default:
        return pageCount * baseTime;
    }
  }

  /**
   * Get processing recommendations for a set of files.
   *
   * Files are analyzed one after another; each analysis reads the entire file
   * into memory, so only one file buffer is requested at a time.
   *
   * @param files - The files to analyze.
   * @returns The per-file analyses keyed by `File`, plus aggregate
   *   recommendations for the whole set.
   */
  static async analyzeMultipleFiles(files: File[]): Promise<{
    analyses: Map<File, FileAnalysis>;
    recommendations: {
      totalEstimatedTime: number;
      suggestedBatchSize: number;
      shouldUseWebWorker: boolean;
      memoryWarning: boolean;
    };
  }> {
    const analyses = new Map<File, FileAnalysis>();
    let totalEstimatedTime = 0;
    let totalSize = 0;
    let totalPages = 0;

    // Analyze each file
    for (const file of files) {
      const analysis = await this.analyzeFile(file);
      analyses.set(file, analysis);
      totalEstimatedTime += analysis.estimatedProcessingTime;
      totalSize += file.size;
      totalPages += analysis.estimatedPageCount || 0;
    }

    const exceedsMassivePages = totalPages > this.PAGE_THRESHOLDS.MASSIVE;

    // Generate recommendations
    const recommendations = {
      totalEstimatedTime,
      suggestedBatchSize: this.calculateBatchSize(files.length, totalSize),
      shouldUseWebWorker: exceedsMassivePages || totalSize > this.SIZE_THRESHOLDS.MEDIUM,
      memoryWarning: totalSize > this.SIZE_THRESHOLDS.LARGE || exceedsMassivePages
    };

    return { analyses, recommendations };
  }

  /**
   * Calculate optimal batch size for processing multiple files.
   *
   * @param fileCount - Number of files in the set.
   * @param totalSize - Combined size of the set in bytes.
   * @returns Suggested number of files per batch: a quarter of the set
   *   (at least 1) above the LARGE threshold, half of the set (at least 2,
   *   but never more than `fileCount`) above the MEDIUM threshold, otherwise
   *   the whole set.
   */
  private static calculateBatchSize(fileCount: number, totalSize: number): number {
    // Process small batches for large total sizes
    if (totalSize > this.SIZE_THRESHOLDS.LARGE) {
      return Math.max(1, Math.floor(fileCount / 4));
    }

    if (totalSize > this.SIZE_THRESHOLDS.MEDIUM) {
      // Clamp so a single big file does not yield a batch larger than the set.
      return Math.min(fileCount, Math.max(2, Math.floor(fileCount / 2)));
    }

    // Process all at once for smaller total sizes
    return fileCount;
  }

  /**
   * Check if a file appears to be a valid PDF.
   *
   * The file must either declare the `application/pdf` MIME type or have a
   * `.pdf` extension, and its content must begin with the `%PDF-` signature.
   *
   * @param file - The file to check.
   * @returns `true` when both checks pass; `false` otherwise, including when
   *   the file cannot be read.
   */
  static async isValidPDF(file: File): Promise<boolean> {
    if (file.type !== 'application/pdf' && !file.name.toLowerCase().endsWith('.pdf')) {
      return false;
    }

    try {
      // Read first few bytes to check PDF header
      const header = file.slice(0, 8);
      const headerBytes = new Uint8Array(await header.arrayBuffer());
      const headerString = String.fromCharCode(...headerBytes);

      return headerString.startsWith('%PDF-');
    } catch {
      return false;
    }
  }
}
|