Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

127 lines
4.2 KiB
TypeScript
Raw Normal View History

Stirling 2.0 (#3928) # Description of Changes <!-- File context for managing files between tools and views Optimisation for large files Updated Split to work with new file system and match Matts stepped design closer --> --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
2025-07-16 17:53:50 +01:00
/**
 * File hashing utilities for cache key generation
 */
export class FileHasher {
  private static readonly CHUNK_SIZE = 64 * 1024; // 64KB chunks for hashing

  /**
   * Generate a content-based hash for a file.
   *
   * Samples the first, middle, and last chunks rather than reading the whole
   * file, which keeps hashing cheap for large files while staying reasonably
   * collision-resistant for cache-key purposes.
   */
  static async generateContentHash(file: File): Promise<string> {
    const chunks = await this.getFileChunks(file);
    const combined = this.combineChunks(chunks);
    return this.hashArrayBuffer(combined);
  }

  /**
   * Generate a fast hash from file metadata only (name, size, lastModified,
   * type). Faster but less collision-resistant than a content hash — two
   * distinct files with identical metadata produce the same key.
   */
  static generateMetadataHash(file: File): string {
    const data = `${file.name}-${file.size}-${file.lastModified}-${file.type}`;
    return this.simpleHash(data);
  }

  /**
   * Generate a hybrid hash that balances speed and uniqueness:
   * metadata hash plus full content hash for small files (<= 1MB),
   * metadata hash plus a first-chunk hash for larger files.
   */
  static async generateHybridHash(file: File): Promise<string> {
    const metadataHash = this.generateMetadataHash(file);

    // For small files, hashing the full content is cheap enough.
    if (file.size <= 1024 * 1024) { // 1MB
      const contentHash = await this.generateContentHash(file);
      return `${metadataHash}-${contentHash}`;
    }

    // For large files, sample only the first chunk.
    const firstChunk = file.slice(0, this.CHUNK_SIZE);
    const firstChunkBuffer = await firstChunk.arrayBuffer();
    const firstChunkHash = await this.hashArrayBuffer(firstChunkBuffer);
    return `${metadataHash}-${firstChunkHash}`;
  }

  /**
   * Read up to three representative chunks (first, middle, last) of the file.
   * The last chunk's start is clamped so it never re-reads bytes the first
   * chunk already covers.
   */
  private static async getFileChunks(file: File): Promise<ArrayBuffer[]> {
    const chunks: ArrayBuffer[] = [];

    // First chunk (an empty file yields no chunks at all).
    if (file.size > 0) {
      const firstChunk = file.slice(0, Math.min(this.CHUNK_SIZE, file.size));
      chunks.push(await firstChunk.arrayBuffer());
    }

    // Middle chunk, only when the file is large enough to have one.
    if (file.size > this.CHUNK_SIZE * 2) {
      const middleStart = Math.floor(file.size / 2) - Math.floor(this.CHUNK_SIZE / 2);
      const middleEnd = middleStart + this.CHUNK_SIZE;
      const middleChunk = file.slice(middleStart, middleEnd);
      chunks.push(await middleChunk.arrayBuffer());
    }

    // Last chunk; Math.max keeps it from overlapping the first chunk.
    if (file.size > this.CHUNK_SIZE) {
      const lastStart = Math.max(file.size - this.CHUNK_SIZE, this.CHUNK_SIZE);
      const lastChunk = file.slice(lastStart);
      chunks.push(await lastChunk.arrayBuffer());
    }

    return chunks;
  }

  /**
   * Concatenate chunk buffers into one contiguous ArrayBuffer.
   * Synchronous: nothing here awaits (was needlessly marked async before).
   */
  private static combineChunks(chunks: ArrayBuffer[]): ArrayBuffer {
    const totalLength = chunks.reduce((sum, chunk) => sum + chunk.byteLength, 0);
    const combined = new Uint8Array(totalLength);
    let offset = 0;
    for (const chunk of chunks) {
      combined.set(new Uint8Array(chunk), offset);
      offset += chunk.byteLength;
    }
    return combined.buffer;
  }

  /**
   * Hash a buffer to a lowercase hex string, preferring SHA-256 via the
   * Web Crypto API. Falls back to a weak string hash when crypto.subtle is
   * unavailable (e.g. insecure browser contexts or very old runtimes).
   */
  private static async hashArrayBuffer(buffer: ArrayBuffer): Promise<string> {
    // Guard the global itself as well: referencing a bare `crypto` in an
    // environment that never defines it would throw a ReferenceError
    // instead of reaching the fallback below.
    if (typeof crypto !== 'undefined' && crypto.subtle) {
      const hashBuffer = await crypto.subtle.digest('SHA-256', buffer);
      const hashArray = Array.from(new Uint8Array(hashBuffer));
      return hashArray.map(b => b.toString(16).padStart(2, '0')).join('');
    }

    // Fallback: join byte values with a separator so distinct byte sequences
    // cannot collapse into the same string (e.g. [1, 23] vs [12, 3]).
    return this.simpleHash(Array.from(new Uint8Array(buffer)).join(','));
  }

  /**
   * Cheap 32-bit string hash (hash * 31 + char), rendered as hex.
   * Not cryptographic; intended only for cache keys.
   */
  private static simpleHash(str: string): string {
    let hash = 0;
    if (str.length === 0) return hash.toString();
    for (let i = 0; i < str.length; i++) {
      const char = str.charCodeAt(i);
      hash = ((hash << 5) - hash) + char; // hash * 31 + char
      hash = hash & hash; // Convert to 32-bit integer
    }
    // NOTE: Math.abs maps +x and -x to the same key; kept as-is to preserve
    // the existing cache-key format.
    return Math.abs(hash).toString(16);
  }

  /**
   * Validate that a file matches its expected (hybrid) hash.
   * Useful for detecting file corruption or changes.
   *
   * @returns true when the recomputed hybrid hash equals expectedHash;
   *          false on mismatch or if hashing throws.
   */
  static async validateFileHash(file: File, expectedHash: string): Promise<boolean> {
    try {
      const actualHash = await this.generateHybridHash(file);
      return actualHash === expectedHash;
    } catch (error) {
      console.error('Hash validation failed:', error);
      return false;
    }
  }
}