Stirling-PDF/frontend/src/services/pdfMetadataService.ts

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

451 lines
15 KiB
TypeScript
Raw Normal View History

2025-09-02 17:24:26 +01:00
/**
* PDF Metadata Service - File History Tracking with pdf-lib
*
* Handles injection and extraction of file history metadata in PDFs using pdf-lib.
* This service embeds file history directly into PDF metadata, making it persistent
* across all tool operations and downloads.
*/
import { PDFDocument } from 'pdf-lib';
import { FileId } from '../types/file';
2025-09-02 18:15:13 +01:00
import { ContentCache, type CacheConfig } from '../utils/ContentCache';
2025-09-02 17:24:26 +01:00
// Gates the verbose console diagnostics emitted by this module: true only when
// NODE_ENV is exactly 'development'.
// NOTE(review): presumably `process.env.NODE_ENV` is substituted at build time by the
// bundler; confirm `process` is defined in the browser build before changing this expression.
const DEBUG = process.env.NODE_ENV === 'development';
/**
 * One entry in a file's tool chain: a record of a single tool that was applied
 * to the PDF, kept for history tracking.
 */
export interface ToolOperation {
  /** Name of the tool that was applied. */
  toolName: string;

  /** Numeric timestamp recorded for the operation. */
  timestamp: number;

  /** Optional free-form parameters the tool was invoked with. */
  parameters?: Record<string, any>;
}
/**
 * Complete file history metadata structure.
 *
 * Standard PDF metadata fields (Creator, Producer, CreationDate, ModificationDate)
 * are used as-is; the Stirling-specific history described here is embedded in the
 * PDF keywords field.
 */
export interface PDFHistoryMetadata {
  stirlingHistory: {
    /** Identifier of the file this version chain started from. */
    originalFileId: string;

    /** Identifier of the direct parent version, when there is one. */
    parentFileId?: string;

    /** Version counter for this file within its chain. */
    versionNumber: number;

    /** Tool operations recorded for this file, in the order they were appended. */
    toolChain: ToolOperation[];

    /** Schema version of this payload. */
    formatVersion: '1.0';
  };
}
/**
 * Service for managing PDF file history metadata.
 *
 * History is serialized as JSON and stored in the PDF keywords field as a single
 * entry of the form `stirling-history:<json>`. Extraction results are memoized in a
 * content-keyed cache.
 */
export class PDFMetadataService {
  private static readonly HISTORY_KEYWORD = 'stirling-history';
  private static readonly FORMAT_VERSION = '1.0';

  private readonly metadataCache: ContentCache<PDFHistoryMetadata | null>;

  /**
   * @param cacheConfig Optional overrides merged on top of the defaults
   *                    (5 minute TTL, 100 entries, warnings when DEBUG).
   */
  constructor(cacheConfig?: Partial<CacheConfig>) {
    const defaultConfig: CacheConfig = {
      ttl: 5 * 60 * 1000, // 5 minutes
      maxSize: 100, // 100 files
      enableWarnings: DEBUG
    };

    this.metadataCache = new ContentCache<PDFHistoryMetadata | null>({
      ...defaultConfig,
      ...cacheConfig
    });
  }

  /** Marker that precedes the serialized history inside the keywords field. */
  private static get historyPrefix(): string {
    return `${PDFMetadataService.HISTORY_KEYWORD}:`;
  }

  /**
   * Inject file history metadata into a PDF.
   *
   * Sets Creator/Producer to 'Stirling-PDF', removes any previous history entry from
   * the keywords, and appends the new one. Existing non-history keywords are kept.
   *
   * @param pdfBytes       Raw PDF content.
   * @param originalFileId Identifier of the file the version chain started from.
   * @param parentFileId   Identifier of the direct parent version, if any.
   * @param toolChain      Operations to record (copied, not retained by reference).
   * @param versionNumber  Version counter to record.
   * @returns The re-saved PDF bytes, or the untouched input bytes if anything fails.
   */
  async injectHistoryMetadata(
    pdfBytes: ArrayBuffer,
    originalFileId: string,
    parentFileId?: string,
    toolChain: ToolOperation[] = [],
    versionNumber: number = 1
  ): Promise<ArrayBuffer> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });

      const historyMetadata: PDFHistoryMetadata = {
        stirlingHistory: {
          originalFileId,
          parentFileId,
          versionNumber,
          toolChain: [...toolChain],
          formatVersion: PDFMetadataService.FORMAT_VERSION
        }
      };

      // Set Stirling-PDF identification fields only (don't touch dates)
      pdfDoc.setCreator('Stirling-PDF');
      pdfDoc.setProducer('Stirling-PDF');

      // Embed history metadata in keywords field (most compatible)
      const historyJson = JSON.stringify(historyMetadata);

      // pdf-lib's documented getKeywords() returns a single string; the array form is
      // still handled defensively in case a different shape is ever returned.
      const existingKeywords: unknown = pdfDoc.getKeywords();
      let keywordList: string[] = [];
      if (Array.isArray(existingKeywords)) {
        keywordList = existingKeywords
          .filter((keyword): keyword is string => typeof keyword === 'string')
          .map(keyword => this.stripHistoryFromKeywords(keyword))
          .filter(keyword => keyword.length > 0);
      } else if (typeof existingKeywords === 'string') {
        // Keep the user's own keywords even when there is no previous history to remove.
        const cleanKeywords = this.stripHistoryFromKeywords(existingKeywords);
        if (cleanKeywords) {
          keywordList = [cleanKeywords];
        }
      }

      // Add our new history metadata as a keyword (replacing any previous history)
      keywordList.push(`${PDFMetadataService.historyPrefix}${historyJson}`);
      pdfDoc.setKeywords(keywordList);

      if (DEBUG) {
        console.log('📄 Injected PDF history metadata:', {
          originalFileId,
          parentFileId,
          versionNumber,
          toolCount: toolChain.length
        });
      }

      const savedPdfBytes = await pdfDoc.save();

      // Copy into a standalone ArrayBuffer of exactly the saved length.
      const arrayBuffer = new ArrayBuffer(savedPdfBytes.byteLength);
      new Uint8Array(arrayBuffer).set(savedPdfBytes);
      return arrayBuffer;
    } catch (error) {
      if (DEBUG) console.error('📄 Failed to inject PDF metadata:', error);
      // Return original bytes if metadata injection fails
      return pdfBytes;
    }
  }

  /**
   * Extract file history metadata from a PDF, consulting the cache first.
   *
   * @param pdfBytes Raw PDF content.
   * @returns The history, or null when none is present or it cannot be read.
   */
  async extractHistoryMetadata(pdfBytes: ArrayBuffer): Promise<PDFHistoryMetadata | null> {
    const cacheKey = this.metadataCache.generateKeyFromBuffer(pdfBytes);

    // `!= null` treats both null and undefined as a miss, whichever the cache uses.
    // A cached "no history" result is therefore indistinguishable from a miss and is
    // simply re-extracted.
    const cached = this.metadataCache.get(cacheKey);
    if (cached != null) {
      return cached;
    }

    const metadata = await this.extractHistoryMetadataInternal(pdfBytes);
    this.metadataCache.set(cacheKey, metadata);
    return metadata;
  }

  /**
   * Internal method for actual PDF metadata extraction (no caching).
   *
   * When several history entries are present, the valid one with the highest
   * version number wins.
   */
  private async extractHistoryMetadataInternal(pdfBytes: ArrayBuffer): Promise<PDFHistoryMetadata | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });
      const keywords: unknown = pdfDoc.getKeywords();

      const payloads: string[] = [];
      if (Array.isArray(keywords)) {
        for (const keyword of keywords) {
          if (typeof keyword === 'string') {
            payloads.push(...this.collectHistoryPayloads(keyword));
          }
        }
      } else if (typeof keywords === 'string') {
        payloads.push(...this.collectHistoryPayloads(keywords));
      }

      return this.selectLatestHistory(payloads);
    } catch (error) {
      if (DEBUG) console.error('📄 Failed to extract PDF metadata:', error);
      return null;
    }
  }

  /**
   * Add a tool operation to existing PDF history, keeping the version number.
   *
   * @returns Updated PDF bytes, or the input bytes when there is no history or on failure.
   */
  async addToolOperation(
    pdfBytes: ArrayBuffer,
    toolOperation: ToolOperation
  ): Promise<ArrayBuffer> {
    try {
      const existingHistory = await this.extractHistoryMetadata(pdfBytes);
      if (!existingHistory) {
        if (DEBUG) console.warn('📄 No existing history found, cannot add tool operation');
        return pdfBytes;
      }

      const { originalFileId, parentFileId, toolChain, versionNumber } = existingHistory.stirlingHistory;

      // Re-inject with the new operation appended to the existing chain
      return await this.injectHistoryMetadata(
        pdfBytes,
        originalFileId,
        parentFileId,
        [...toolChain, toolOperation],
        versionNumber
      );
    } catch (error) {
      if (DEBUG) console.error('📄 Failed to add tool operation:', error);
      return pdfBytes;
    }
  }

  /**
   * Create a new version of a PDF with incremented version number.
   *
   * When the input has no history, the chain starts at version 1 with
   * `parentFileId` doubling as the original file id.
   *
   * @returns Updated PDF bytes, or the input bytes on failure.
   */
  async createNewVersion(
    pdfBytes: ArrayBuffer,
    parentFileId: string,
    toolOperation: ToolOperation
  ): Promise<ArrayBuffer> {
    try {
      const parentHistory = await this.extractHistoryMetadata(pdfBytes);

      const originalFileId = parentHistory?.stirlingHistory.originalFileId || parentFileId;
      const parentToolChain = parentHistory?.stirlingHistory.toolChain ?? [];
      const newVersionNumber = (parentHistory?.stirlingHistory.versionNumber || 0) + 1;

      return await this.injectHistoryMetadata(
        pdfBytes,
        originalFileId,
        parentFileId,
        [...parentToolChain, toolOperation],
        newVersionNumber
      );
    } catch (error) {
      if (DEBUG) console.error('📄 Failed to create new version:', error);
      return pdfBytes;
    }
  }

  /**
   * Extract standard PDF document metadata.
   *
   * Empty/absent fields are reported as undefined.
   *
   * @returns The metadata, or null if the PDF cannot be loaded.
   */
  async extractStandardMetadata(pdfBytes: ArrayBuffer): Promise<{
    title?: string;
    author?: string;
    subject?: string;
    creator?: string;
    producer?: string;
    creationDate?: Date;
    modificationDate?: Date;
  } | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });

      return {
        title: pdfDoc.getTitle() || undefined,
        author: pdfDoc.getAuthor() || undefined,
        subject: pdfDoc.getSubject() || undefined,
        creator: pdfDoc.getCreator() || undefined,
        producer: pdfDoc.getProducer() || undefined,
        creationDate: pdfDoc.getCreationDate() || undefined,
        modificationDate: pdfDoc.getModificationDate() || undefined
      };
    } catch (error) {
      if (DEBUG) console.warn('📄 Failed to extract standard PDF metadata:', error);
      return null;
    }
  }

  /**
   * Verify that tool preserved standard PDF metadata.
   * Logs warnings for tools that strip metadata; never throws.
   *
   * @param originalBytes  PDF bytes before the tool ran.
   * @param processedBytes PDF bytes after the tool ran.
   * @param toolName       Tool name used in the log messages.
   */
  async verifyMetadataPreservation(
    originalBytes: ArrayBuffer,
    processedBytes: ArrayBuffer,
    toolName: string
  ): Promise<void> {
    try {
      const [originalMetadata, processedMetadata] = await Promise.all([
        this.extractStandardMetadata(originalBytes),
        this.extractStandardMetadata(processedBytes)
      ]);

      if (!originalMetadata || !processedMetadata) return;

      const issues: string[] = [];

      // Fields that must still be present after processing if they were present before.
      const presenceChecks = [
        ['title', 'Title stripped'],
        ['author', 'Author stripped'],
        ['subject', 'Subject stripped'],
        ['creationDate', 'CreationDate stripped']
      ] as const;
      for (const [field, message] of presenceChecks) {
        if (originalMetadata[field] && !processedMetadata[field]) {
          issues.push(message);
        }
      }

      // Tolerate up to one second of drift in the creation date.
      const originalCreated = originalMetadata.creationDate;
      const processedCreated = processedMetadata.creationDate;
      if (originalCreated && processedCreated &&
          Math.abs(originalCreated.getTime() - processedCreated.getTime()) > 1000) {
        issues.push(`CreationDate modified (${originalCreated.toISOString()} → ${processedCreated.toISOString()})`);
      }

      // Note: We don't check ModificationDate preservation since we use File.lastModified as source of truth

      if (issues.length > 0) {
        console.warn(`⚠️ METADATA LOSS: Tool '${toolName}' did not preserve PDF metadata:`, issues.join(', '));
        console.warn(`⚠️ This backend tool should be updated to preserve standard PDF metadata fields.`);
      } else {
        console.log(`✅ METADATA PRESERVED: Tool '${toolName}' correctly preserved all PDF metadata`);
      }
    } catch (error) {
      if (DEBUG) console.warn(`📄 Failed to verify metadata preservation for ${toolName}:`, error);
    }
  }

  /**
   * Check if a PDF has Stirling history metadata.
   */
  async hasStirlingHistory(pdfBytes: ArrayBuffer): Promise<boolean> {
    const metadata = await this.extractHistoryMetadata(pdfBytes);
    return metadata !== null;
  }

  /**
   * Get version information from PDF.
   *
   * @returns A summary of the embedded history, or null when there is none.
   */
  async getVersionInfo(pdfBytes: ArrayBuffer): Promise<{
    originalFileId: string;
    versionNumber: number;
    toolCount: number;
    parentFileId?: string;
  } | null> {
    const metadata = await this.extractHistoryMetadata(pdfBytes);
    if (!metadata) return null;

    const { originalFileId, versionNumber, toolChain, parentFileId } = metadata.stirlingHistory;
    return {
      originalFileId,
      versionNumber,
      toolCount: toolChain.length,
      parentFileId
    };
  }

  /**
   * Embed history JSON in a keywords string, replacing any history already in it.
   */
  private embedHistoryInKeywords(existingKeywords: string, historyJson: string): string {
    // An empty result here means the keywords held nothing but old history; it must
    // not fall back to the original string or the old history would be duplicated.
    const cleanKeywords = this.stripHistoryFromKeywords(existingKeywords);
    const historyKeyword = `${PDFMetadataService.historyPrefix}${historyJson}`;

    return cleanKeywords ? `${cleanKeywords} ${historyKeyword}` : historyKeyword;
  }

  /**
   * Extract the first history JSON from a keywords string.
   *
   * @param keywords        Keywords string to search.
   * @param returnRemainder When true, return the keywords with that history entry
   *                        removed (whitespace-collapsed) instead of the JSON.
   * @returns The JSON payload or remainder, or null when no history entry exists.
   */
  private extractHistoryFromKeywords(keywords: string, returnRemainder = false): string | null {
    const historyPrefix = PDFMetadataService.historyPrefix;
    const historyIndex = keywords.indexOf(historyPrefix);
    if (historyIndex === -1) return null;

    const historyStart = historyIndex + historyPrefix.length;
    const historyEnd = PDFMetadataService.findHistoryEnd(keywords, historyStart);

    if (returnRemainder) {
      const before = keywords.substring(0, historyIndex);
      const after = keywords.substring(historyEnd);
      return `${before} ${after}`.replace(/\s+/g, ' ').trim();
    }

    return keywords.substring(historyStart, historyEnd).trim();
  }

  /**
   * Remove every history entry from a keywords string.
   *
   * @returns The remaining keywords, trimmed; the input (trimmed) when it holds no history.
   */
  private stripHistoryFromKeywords(keywords: string): string {
    let remaining = keywords;
    // Each pass removes one prefix occurrence, so the string shrinks and the loop ends.
    for (
      let stripped = this.extractHistoryFromKeywords(remaining, true);
      stripped !== null;
      stripped = this.extractHistoryFromKeywords(remaining, true)
    ) {
      remaining = stripped;
    }
    return remaining.trim();
  }

  /**
   * Collect every history JSON payload found in a keywords string, in order.
   */
  private collectHistoryPayloads(keywords: string): string[] {
    const historyPrefix = PDFMetadataService.historyPrefix;
    const payloads: string[] = [];

    let searchFrom = 0;
    for (;;) {
      const historyIndex = keywords.indexOf(historyPrefix, searchFrom);
      if (historyIndex === -1) break;

      const historyStart = historyIndex + historyPrefix.length;
      const historyEnd = PDFMetadataService.findHistoryEnd(keywords, historyStart);
      const payload = keywords.substring(historyStart, historyEnd).trim();
      if (payload) payloads.push(payload);

      // Resume after the payload so a prefix inside a JSON string value is not re-matched.
      searchFrom = historyEnd;
    }

    return payloads;
  }

  /**
   * Pick the valid history with the highest version number from raw JSON payloads.
   * Unparseable or structurally invalid payloads are skipped.
   */
  private selectLatestHistory(payloads: readonly string[]): PDFHistoryMetadata | null {
    let latest: PDFHistoryMetadata | null = null;

    for (const payload of payloads) {
      let parsed: unknown;
      try {
        parsed = JSON.parse(payload);
      } catch (error) {
        if (DEBUG) console.warn('📄 Skipping corrupted PDF history entry:', error);
        continue;
      }

      if (!this.isValidHistoryMetadata(parsed)) continue;

      if (latest === null ||
          parsed.stirlingHistory.versionNumber > latest.stirlingHistory.versionNumber) {
        latest = parsed;
      }
    }

    return latest;
  }

  /**
   * Find the index just past the history payload that begins at `start`.
   *
   * A payload starting with `{` ends at its matching closing brace; braces and
   * spaces inside JSON string values are ignored, so values such as
   * "add watermark" do not truncate the payload. Anything else runs to the next
   * whitespace. An unterminated object runs to the end of the string.
   */
  private static findHistoryEnd(keywords: string, start: number): number {
    if (keywords.charAt(start) !== '{') {
      const offset = keywords.substring(start).search(/\s/);
      return offset === -1 ? keywords.length : start + offset;
    }

    let depth = 0;
    let inString = false;
    let escaped = false;

    for (let i = start; i < keywords.length; i++) {
      const ch = keywords.charAt(i);

      if (inString) {
        if (escaped) {
          escaped = false;
        } else if (ch === '\\') {
          escaped = true;
        } else if (ch === '"') {
          inString = false;
        }
        continue;
      }

      if (ch === '"') {
        inString = true;
      } else if (ch === '{') {
        depth++;
      } else if (ch === '}') {
        depth--;
        if (depth === 0) return i + 1;
      }
    }

    return keywords.length;
  }

  /**
   * Validate metadata structure: required field types plus a matching format version.
   */
  private isValidHistoryMetadata(metadata: unknown): metadata is PDFHistoryMetadata {
    if (typeof metadata !== 'object' || metadata === null) return false;

    const history = (metadata as { stirlingHistory?: unknown }).stirlingHistory;
    if (typeof history !== 'object' || history === null) return false;

    const fields = history as Record<string, unknown>;
    return typeof fields.originalFileId === 'string' &&
      typeof fields.versionNumber === 'number' &&
      Array.isArray(fields.toolChain) &&
      fields.formatVersion === PDFMetadataService.FORMAT_VERSION;
  }
}
2025-09-02 18:15:13 +01:00
/**
 * Shared service instance. Its cache overrides the class defaults with a
 * 10-minute TTL and a 50-entry cap; warnings follow the DEBUG flag.
 */
export const pdfMetadataService = new PDFMetadataService({
  enableWarnings: DEBUG,
  maxSize: 50, // Smaller cache for memory efficiency
  ttl: 10 * 60 * 1000 // 10 minutes for PDF metadata (longer than default)
});