Stirling-PDF/frontend/src/services/pdfMetadataService.ts

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

331 lines
11 KiB
TypeScript
Raw Normal View History

2025-09-02 17:24:26 +01:00
/**
* PDF Metadata Service - File History Tracking with pdf-lib
*
* Handles injection and extraction of file history metadata in PDFs using pdf-lib.
* This service embeds file history directly into PDF metadata, making it persistent
* across all tool operations and downloads.
*/
import { PDFDocument } from 'pdf-lib';
import { FileId } from '../types/file';
// Development-only diagnostics switch: true only when NODE_ENV is exactly 'development'.
// Every console.log/warn/error call in this module is gated on it.
// NOTE(review): presumably the bundler substitutes `process.env.NODE_ENV` at build time,
// so the expression should stay in this literal form — confirm against the build config.
const DEBUG = process.env.NODE_ENV === 'development';
/**
* Tool operation metadata for history tracking
*
* One entry of the `toolChain` array stored in the PDF history. Entries are
* serialised with JSON.stringify as part of the history payload, so every value
* placed here must survive a JSON round trip.
*/
export interface ToolOperation {
/** Name identifying the tool that was applied; treated as an opaque string by this module. */
toolName: string;
/** When the operation happened. NOTE(review): presumably epoch milliseconds like `createdAt` — confirm against callers. */
timestamp: number;
/** Optional tool settings; values must be JSON-serialisable because they are embedded in the PDF keywords. */
parameters?: Record<string, any>;
}
/**
* Complete file history metadata structure
*
* This is the object that is JSON-serialised and stored in the PDF keywords
* behind the `stirling-history:` prefix, and the shape that
* `isValidHistoryMetadata` checks when history is read back.
*/
export interface PDFHistoryMetadata {
stirlingHistory: {
/** Id of the file the history started from; `createNewVersion` falls back to the parent id when the parent carries no history. */
originalFileId: string;
/** Id of the file this version was derived from; absent when the caller did not supply one. */
parentFileId?: string;
/** Version counter: `injectHistoryMetadata` defaults it to 1 and `createNewVersion` writes the parent's value plus one. */
versionNumber: number;
/** Operations applied so far; new operations are appended at the end. */
toolChain: ToolOperation[];
/** Fixed marker; extraction rejects metadata whose value differs. */
createdBy: 'Stirling-PDF';
/** Schema version; extraction rejects metadata whose value differs from the service's FORMAT_VERSION. */
formatVersion: '1.0';
/** Taken from Date.now() (epoch milliseconds) each time metadata is injected. */
createdAt: number;
/** Taken from Date.now() (epoch milliseconds) each time metadata is injected. */
lastModified: number;
};
}
/**
 * Service for managing PDF file history metadata.
 *
 * History is serialised as JSON and stored in the PDF Keywords field as a single
 * `stirling-history:<json>` entry next to whatever keywords the document already
 * has. Every public method is best-effort: failures are logged in development
 * builds and the caller gets the untouched input bytes (or `null`) back instead
 * of an exception.
 */
export class PDFMetadataService {
  private static readonly HISTORY_KEYWORD = 'stirling-history';
  private static readonly FORMAT_VERSION = '1.0';
  /** Marker that introduces the serialised history inside the Keywords field. */
  private static readonly HISTORY_PREFIX = `${PDFMetadataService.HISTORY_KEYWORD}:`;

  /**
   * Inject file history metadata into a PDF.
   *
   * Existing keywords are preserved; any previous history entry is replaced.
   *
   * @param pdfBytes - Bytes of the PDF to annotate.
   * @param originalFileId - Id of the file the history chain started from.
   * @param parentFileId - Id of the file this version was derived from, if any.
   * @param toolChain - Operations applied so far (copied, never mutated).
   * @param versionNumber - Version counter to record.
   * @returns The re-saved PDF as an `ArrayBuffer`, or `pdfBytes` unchanged when
   *   the document cannot be loaded or saved.
   */
  async injectHistoryMetadata(
    pdfBytes: ArrayBuffer,
    originalFileId: string,
    parentFileId?: string,
    toolChain: ToolOperation[] = [],
    versionNumber: number = 1
  ): Promise<ArrayBuffer> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBytes);

      // One clock read so the JSON timestamps and the PDF modification date agree.
      const now = Date.now();
      const historyMetadata: PDFHistoryMetadata = {
        stirlingHistory: {
          originalFileId,
          parentFileId,
          versionNumber,
          toolChain: [...toolChain],
          createdBy: 'Stirling-PDF',
          formatVersion: PDFMetadataService.FORMAT_VERSION,
          createdAt: now,
          lastModified: now
        }
      };

      // Set basic metadata
      pdfDoc.setCreator('Stirling-PDF');
      pdfDoc.setProducer('Stirling-PDF');
      pdfDoc.setModificationDate(new Date(now));

      // Embed history metadata in keywords field (most compatible)
      const keywordList = this.buildKeywordList(
        pdfDoc.getKeywords(),
        JSON.stringify(historyMetadata)
      );
      pdfDoc.setKeywords(keywordList);

      if (DEBUG) {
        console.log('📄 Injected PDF history metadata:', {
          originalFileId,
          parentFileId,
          versionNumber,
          toolCount: toolChain.length
        });
      }

      // save() yields a Uint8Array; hand back a real ArrayBuffer as declared.
      return PDFMetadataService.toArrayBuffer(await pdfDoc.save());
    } catch (error) {
      if (DEBUG) console.error('📄 Failed to inject PDF metadata:', error);
      // Return original bytes if metadata injection fails
      return pdfBytes;
    }
  }

  /**
   * Extract file history metadata from a PDF.
   *
   * @param pdfBytes - Bytes of the PDF to inspect.
   * @returns The valid history entry with the highest version number, or `null`
   *   when the document has no (valid) history or cannot be loaded.
   */
  async extractHistoryMetadata(pdfBytes: ArrayBuffer): Promise<PDFHistoryMetadata | null> {
    try {
      const pdfDoc = await PDFDocument.load(pdfBytes);
      const candidates = this.collectHistoryJson(pdfDoc.getKeywords());
      return this.selectLatestHistory(candidates);
    } catch (error) {
      if (DEBUG) console.error('📄 pdfMetadataService.extractHistoryMetadata: Failed to extract:', error);
      return null;
    }
  }

  /**
   * Add a tool operation to existing PDF history without changing the version.
   *
   * @param pdfBytes - Bytes of a PDF that already carries history.
   * @param toolOperation - Operation to append to the tool chain.
   * @returns The updated PDF, or `pdfBytes` unchanged when no history exists.
   */
  async addToolOperation(
    pdfBytes: ArrayBuffer,
    toolOperation: ToolOperation
  ): Promise<ArrayBuffer> {
    try {
      const existingHistory = await this.extractHistoryMetadata(pdfBytes);
      if (!existingHistory) {
        if (DEBUG) console.warn('📄 No existing history found, cannot add tool operation');
        return pdfBytes;
      }

      const { originalFileId, parentFileId, toolChain, versionNumber } =
        existingHistory.stirlingHistory;

      // Re-inject with the operation appended
      return await this.injectHistoryMetadata(
        pdfBytes,
        originalFileId,
        parentFileId,
        [...toolChain, toolOperation],
        versionNumber
      );
    } catch (error) {
      if (DEBUG) console.error('📄 Failed to add tool operation:', error);
      return pdfBytes;
    }
  }

  /**
   * Create a new version of a PDF with incremented version number.
   *
   * @param pdfBytes - Bytes of the PDF produced by the tool.
   * @param parentFileId - Id of the file the tool was applied to.
   * @param toolOperation - Operation that produced this version.
   * @returns The PDF carrying the extended history, or `pdfBytes` on failure.
   */
  async createNewVersion(
    pdfBytes: ArrayBuffer,
    parentFileId: string,
    toolOperation: ToolOperation
  ): Promise<ArrayBuffer> {
    try {
      const parentHistory = await this.extractHistoryMetadata(pdfBytes);

      // `||` on purpose: an empty stored id also falls back to the parent id.
      const originalFileId = parentHistory?.stirlingHistory.originalFileId || parentFileId;
      const parentToolChain = parentHistory?.stirlingHistory.toolChain ?? [];
      // A PDF without history starts the chain at version 1.
      const newVersionNumber = (parentHistory?.stirlingHistory.versionNumber || 0) + 1;

      return await this.injectHistoryMetadata(
        pdfBytes,
        originalFileId,
        parentFileId,
        [...parentToolChain, toolOperation],
        newVersionNumber
      );
    } catch (error) {
      if (DEBUG) console.error('📄 Failed to create new version:', error);
      return pdfBytes;
    }
  }

  /**
   * Check if a PDF has Stirling history metadata.
   *
   * @returns `true` only when valid history can be extracted.
   */
  async hasStirlingHistory(pdfBytes: ArrayBuffer): Promise<boolean> {
    const metadata = await this.extractHistoryMetadata(pdfBytes);
    return metadata !== null;
  }

  /**
   * Get version information from PDF.
   *
   * @returns A summary of the stored history, or `null` when there is none.
   */
  async getVersionInfo(pdfBytes: ArrayBuffer): Promise<{
    originalFileId: string;
    versionNumber: number;
    toolCount: number;
    parentFileId?: string;
  } | null> {
    const metadata = await this.extractHistoryMetadata(pdfBytes);
    if (!metadata) return null;

    const { originalFileId, versionNumber, toolChain, parentFileId } = metadata.stirlingHistory;
    return {
      originalFileId,
      versionNumber,
      toolCount: toolChain.length,
      parentFileId
    };
  }

  /**
   * Embed history JSON in a keywords string, replacing any history already in it.
   *
   * @returns The remaining keywords followed by the new history entry.
   */
  private embedHistoryInKeywords(existingKeywords: string, historyJson: string): string {
    return this.buildKeywordList(existingKeywords, historyJson).join(' ');
  }

  /**
   * Build the keyword list to write back: the document's own keywords with any
   * previous history removed, followed by the new history entry.
   *
   * Keywords that contain no history are kept as they are (only trimmed).
   */
  private buildKeywordList(
    existingKeywords: string | string[] | undefined,
    historyJson: string
  ): string[] {
    const prefix = PDFMetadataService.HISTORY_PREFIX;
    let kept: string[] = [];

    if (Array.isArray(existingKeywords)) {
      kept = existingKeywords.filter(
        keyword => typeof keyword === 'string' && !keyword.startsWith(prefix)
      );
    } else if (typeof existingKeywords === 'string') {
      const cleaned = this.stripHistoryFromKeywords(existingKeywords);
      if (cleaned) kept = [cleaned];
    }

    return [...kept, `${prefix}${historyJson}`];
  }

  /**
   * Remove every history entry from a keywords string.
   *
   * @returns The trimmed remainder; the input (trimmed) when it holds no history.
   */
  private stripHistoryFromKeywords(keywords: string): string {
    let remaining = keywords;
    // Each pass removes at least the prefix, so the loop always terminates.
    for (;;) {
      const stripped = this.extractHistoryFromKeywords(remaining, true);
      if (stripped === null) return remaining.trim();
      remaining = stripped;
    }
  }

  /**
   * Gather the raw history JSON candidates from a Keywords value, whether it is
   * a single string or an array of entries.
   */
  private collectHistoryJson(keywords: string | string[] | undefined): string[] {
    const prefix = PDFMetadataService.HISTORY_PREFIX;

    if (Array.isArray(keywords)) {
      return keywords
        .filter(keyword => typeof keyword === 'string' && keyword.startsWith(prefix))
        .map(keyword => keyword.substring(prefix.length));
    }
    if (typeof keywords === 'string') {
      const json = this.extractHistoryFromKeywords(keywords);
      return json ? [json] : [];
    }
    return [];
  }

  /**
   * Parse the candidates and return the valid one with the highest version
   * number (the first one wins on a tie). Corrupted or invalid entries are skipped.
   */
  private selectLatestHistory(candidates: string[]): PDFHistoryMetadata | null {
    let latest: PDFHistoryMetadata | null = null;

    for (const json of candidates) {
      let parsed: unknown;
      try {
        parsed = JSON.parse(json);
      } catch (error) {
        // Best-effort: a corrupted entry must not hide a healthy one.
        if (DEBUG) console.warn('📄 Skipping corrupted PDF history entry:', error);
        continue;
      }
      if (!this.isValidHistoryMetadata(parsed)) continue;

      if (
        latest === null ||
        parsed.stirlingHistory.versionNumber > latest.stirlingHistory.versionNumber
      ) {
        latest = parsed;
      }
    }
    return latest;
  }

  /**
   * Extract history JSON from keywords field.
   *
   * Works on the first history entry found in `keywords`.
   *
   * @param keywords - The Keywords value as one string.
   * @param returnRemainder - When true, return the keywords with that entry
   *   removed (whitespace collapsed) instead of the JSON payload.
   * @returns The payload or remainder, or `null` when there is no history entry.
   */
  private extractHistoryFromKeywords(keywords: string, returnRemainder = false): string | null {
    const historyPrefix = PDFMetadataService.HISTORY_PREFIX;
    const historyIndex = keywords.indexOf(historyPrefix);
    if (historyIndex === -1) return null;

    const historyStart = historyIndex + historyPrefix.length;

    // Preferred: follow the JSON object to its closing brace, so spaces inside
    // string values do not cut the payload short.
    let historyEnd = this.findJsonObjectEnd(keywords, historyStart);

    if (historyEnd === -1) {
      // Malformed payload: fall back to the original heuristic and stop at the
      // first space unless what follows looks like more JSON.
      historyEnd = keywords.length;
      const afterHistory = keywords.substring(historyStart);
      const nextSpaceIndex = afterHistory.indexOf(' ');
      if (nextSpaceIndex > 0) {
        const afterSpace = afterHistory.substring(nextSpaceIndex + 1);
        if (afterSpace && !afterSpace.trim().startsWith('{')) {
          historyEnd = historyStart + nextSpaceIndex;
        }
      }
    }

    if (returnRemainder) {
      // Return keywords with history removed
      const before = keywords.substring(0, historyIndex);
      const after = keywords.substring(historyEnd);
      return `${before}${after}`.replace(/\s+/g, ' ').trim();
    }
    return keywords.substring(historyStart, historyEnd).trim();
  }

  /**
   * Find the end of the JSON object that starts at `start`.
   *
   * Braces inside string literals (including escaped quotes) are ignored.
   *
   * @returns Index just past the matching `}`, or -1 when `text[start]` is not
   *   `{` or the object is never closed.
   */
  private findJsonObjectEnd(text: string, start: number): number {
    if (text[start] !== '{') return -1;

    let depth = 0;
    let inString = false;
    let escaped = false;

    for (let i = start; i < text.length; i++) {
      const ch = text[i];
      if (inString) {
        if (escaped) {
          escaped = false;
        } else if (ch === '\\') {
          escaped = true;
        } else if (ch === '"') {
          inString = false;
        }
        continue;
      }
      if (ch === '"') {
        inString = true;
      } else if (ch === '{') {
        depth++;
      } else if (ch === '}') {
        depth--;
        if (depth === 0) return i + 1;
      }
    }
    return -1;
  }

  /**
   * Validate metadata structure.
   *
   * @returns `true` only for an object whose `stirlingHistory` has the expected
   *   field types, creator marker and format version; always a strict boolean.
   */
  private isValidHistoryMetadata(metadata: unknown): metadata is PDFHistoryMetadata {
    if (typeof metadata !== 'object' || metadata === null) return false;

    const history = (metadata as { stirlingHistory?: unknown }).stirlingHistory;
    if (typeof history !== 'object' || history === null) return false;

    const fields = history as Record<string, unknown>;
    return (
      typeof fields.originalFileId === 'string' &&
      typeof fields.versionNumber === 'number' &&
      Array.isArray(fields.toolChain) &&
      fields.createdBy === 'Stirling-PDF' &&
      fields.formatVersion === PDFMetadataService.FORMAT_VERSION
    );
  }

  /**
   * Convert a byte view into a standalone `ArrayBuffer`.
   *
   * The backing buffer is reused when the view spans all of it; otherwise the
   * viewed bytes are copied so no unrelated data leaks through.
   */
  private static toArrayBuffer(bytes: Uint8Array): ArrayBuffer {
    const backing = bytes.buffer;
    if (
      backing instanceof ArrayBuffer &&
      bytes.byteOffset === 0 &&
      bytes.byteLength === backing.byteLength
    ) {
      return backing;
    }
    const copy = new Uint8Array(bytes.byteLength);
    copy.set(bytes);
    return copy.buffer;
  }
}
// Shared module-level instance of the service, created once when the module loads.
export const pdfMetadataService: PDFMetadataService = new PDFMetadataService();