Stirling-PDF/frontend/src/services/fileAnalyzer.ts
Reece Browne 949ffa01ad
Feature/v2/file handling improvements (#4222)
# Description of Changes

A new universal file context rather than the splintered ones for the
main views, tools and manager we had before (manager still has its own
but its better integreated with the core context)
File context has been split it into a handful of different files
managing various file related issues separately to reduce the monolith -
FileReducer.ts - State management
  fileActions.ts - File operations
  fileSelectors.ts - Data access patterns
  lifecycle.ts - Resource cleanup and memory management
  fileHooks.ts - React hooks interface
  contexts.ts - Context providers
Improved thumbnail generation
Improved indexxedb handling
Stopped handling files as blobs were not necessary to improve
performance
A new library handling drag and drop
https://github.com/atlassian/pragmatic-drag-and-drop (Out of scope yes
but I broke the old one with the new filecontext and it needed doing so
it was a might as well)
A new library handling virtualisation on page editor
@tanstack/react-virtual, as above.
Quickly ripped out the last remnants of the old URL params stuff and
replaced with the beginnings of what will later become the new URL
navigation system (for now it just restores the tool name in url
behavior)
Fixed selected file not regestered when opening a tool
Fixed png thumbnails
Closes #(issue_number)

---

## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.

---------

Co-authored-by: Reece Browne <you@example.com>
2025-08-21 17:30:26 +01:00

240 lines
7.2 KiB
TypeScript

import { FileAnalysis, ProcessingStrategy } from '../types/processing';
import { pdfWorkerManager } from './pdfWorkerManager';
export class FileAnalyzer {
private static readonly SIZE_THRESHOLDS = {
SMALL: 10 * 1024 * 1024, // 10MB
MEDIUM: 50 * 1024 * 1024, // 50MB
LARGE: 200 * 1024 * 1024, // 200MB
};
private static readonly PAGE_THRESHOLDS = {
FEW: 10, // < 10 pages - immediate full processing
MANY: 50, // < 50 pages - priority pages
MASSIVE: 100, // < 100 pages - progressive chunked
// >100 pages = metadata only
};
/**
* Analyze a file to determine optimal processing strategy
*/
static async analyzeFile(file: File): Promise<FileAnalysis> {
const analysis: FileAnalysis = {
fileSize: file.size,
isEncrypted: false,
isCorrupted: false,
recommendedStrategy: 'metadata_only',
estimatedProcessingTime: 0,
};
try {
// Quick validation and page count estimation
const quickAnalysis = await this.quickPDFAnalysis(file);
analysis.estimatedPageCount = quickAnalysis.pageCount;
analysis.isEncrypted = quickAnalysis.isEncrypted;
analysis.isCorrupted = quickAnalysis.isCorrupted;
// Determine strategy based on file characteristics
analysis.recommendedStrategy = this.determineStrategy(file.size, quickAnalysis.pageCount);
// Estimate processing time
analysis.estimatedProcessingTime = this.estimateProcessingTime(
file.size,
quickAnalysis.pageCount,
analysis.recommendedStrategy
);
} catch (error) {
console.error('File analysis failed:', error);
analysis.isCorrupted = true;
analysis.recommendedStrategy = 'metadata_only';
}
return analysis;
}
/**
* Quick PDF analysis without full processing
*/
private static async quickPDFAnalysis(file: File): Promise<{
pageCount: number;
isEncrypted: boolean;
isCorrupted: boolean;
}> {
try {
// For small files, read the whole file
// For large files, try the whole file first (PDF.js needs the complete structure)
const arrayBuffer = await file.arrayBuffer();
const pdf = await pdfWorkerManager.createDocument(arrayBuffer, {
stopAtErrors: false, // Don't stop at minor errors
verbosity: 0 // Suppress PDF.js warnings
});
const pageCount = pdf.numPages;
const isEncrypted = (pdf as any).isEncrypted;
// Clean up using worker manager
pdfWorkerManager.destroyDocument(pdf);
return {
pageCount,
isEncrypted,
isCorrupted: false
};
} catch (error) {
// Try to determine if it's corruption vs encryption
const errorMessage = error instanceof Error ? error.message.toLowerCase() : '';
const isEncrypted = errorMessage.includes('password') || errorMessage.includes('encrypted');
return {
pageCount: 0,
isEncrypted,
isCorrupted: !isEncrypted // If not encrypted, probably corrupted
};
}
}
/**
* Determine the best processing strategy based on file characteristics
*/
private static determineStrategy(fileSize: number, pageCount?: number): ProcessingStrategy {
// Handle corrupted or encrypted files
if (!pageCount || pageCount === 0) {
return 'metadata_only';
}
// Small files with few pages - process everything immediately
if (fileSize <= this.SIZE_THRESHOLDS.SMALL && pageCount <= this.PAGE_THRESHOLDS.FEW) {
return 'immediate_full';
}
// Medium files or many pages - priority pages first, then progressive
if (fileSize <= this.SIZE_THRESHOLDS.MEDIUM && pageCount <= this.PAGE_THRESHOLDS.MANY) {
return 'priority_pages';
}
// Large files or massive page counts - chunked processing
if (fileSize <= this.SIZE_THRESHOLDS.LARGE && pageCount <= this.PAGE_THRESHOLDS.MASSIVE) {
return 'progressive_chunked';
}
// Very large files - metadata only
return 'metadata_only';
}
/**
* Estimate processing time based on file characteristics and strategy
*/
private static estimateProcessingTime(
fileSize: number,
pageCount: number = 0,
strategy: ProcessingStrategy
): number {
const baseTimes = {
immediate_full: 200, // 200ms per page
priority_pages: 150, // 150ms per page (optimized)
progressive_chunked: 100, // 100ms per page (chunked)
metadata_only: 50 // 50ms total
};
const baseTime = baseTimes[strategy];
switch (strategy) {
case 'metadata_only':
return baseTime;
case 'immediate_full':
return pageCount * baseTime;
case 'priority_pages':
// Estimate time for priority pages (first 10)
const priorityPages = Math.min(pageCount, 10);
return priorityPages * baseTime;
case 'progressive_chunked':
// Estimate time for first chunk (20 pages)
const firstChunk = Math.min(pageCount, 20);
return firstChunk * baseTime;
default:
return pageCount * baseTime;
}
}
/**
* Get processing recommendations for a set of files
*/
static async analyzeMultipleFiles(files: File[]): Promise<{
analyses: Map<File, FileAnalysis>;
recommendations: {
totalEstimatedTime: number;
suggestedBatchSize: number;
shouldUseWebWorker: boolean;
memoryWarning: boolean;
};
}> {
const analyses = new Map<File, FileAnalysis>();
let totalEstimatedTime = 0;
let totalSize = 0;
let totalPages = 0;
// Analyze each file
for (const file of files) {
const analysis = await this.analyzeFile(file);
analyses.set(file, analysis);
totalEstimatedTime += analysis.estimatedProcessingTime;
totalSize += file.size;
totalPages += analysis.estimatedPageCount || 0;
}
// Generate recommendations
const recommendations = {
totalEstimatedTime,
suggestedBatchSize: this.calculateBatchSize(files.length, totalSize),
shouldUseWebWorker: totalPages > 100 || totalSize > this.SIZE_THRESHOLDS.MEDIUM,
memoryWarning: totalSize > this.SIZE_THRESHOLDS.LARGE || totalPages > this.PAGE_THRESHOLDS.MASSIVE
};
return { analyses, recommendations };
}
/**
* Calculate optimal batch size for processing multiple files
*/
private static calculateBatchSize(fileCount: number, totalSize: number): number {
// Process small batches for large total sizes
if (totalSize > this.SIZE_THRESHOLDS.LARGE) {
return Math.max(1, Math.floor(fileCount / 4));
}
if (totalSize > this.SIZE_THRESHOLDS.MEDIUM) {
return Math.max(2, Math.floor(fileCount / 2));
}
// Process all at once for smaller total sizes
return fileCount;
}
/**
* Check if a file appears to be a valid PDF
*/
static async isValidPDF(file: File): Promise<boolean> {
if (file.type !== 'application/pdf' && !file.name.toLowerCase().endsWith('.pdf')) {
return false;
}
try {
// Read first few bytes to check PDF header
const header = file.slice(0, 8);
const headerBytes = new Uint8Array(await header.arrayBuffer());
const headerString = String.fromCharCode(...headerBytes);
return headerString.startsWith('%PDF-');
} catch (error) {
return false;
}
}
}