Properly type PDFs

This commit is contained in:
James Brunton 2025-09-12 12:14:54 +01:00
parent e31e6461e4
commit 6a1caf0904
2 changed files with 31 additions and 21 deletions

View File

@ -1,6 +1,7 @@
import { pdfWorkerManager } from './pdfWorkerManager'; import { pdfWorkerManager } from './pdfWorkerManager';
import { FileAnalyzer } from './fileAnalyzer'; import { FileAnalyzer } from './fileAnalyzer';
import { TrappedStatus, CustomMetadataEntry, ExtractedPDFMetadata } from '../types/metadata'; import { TrappedStatus, CustomMetadataEntry, ExtractedPDFMetadata } from '../types/metadata';
import { PDFDocumentProxy } from 'pdfjs-dist/types/src/display/api';
export interface MetadataExtractionResult { export interface MetadataExtractionResult {
success: true; success: true;
@ -18,8 +19,8 @@ export type MetadataExtractionResponse = MetadataExtractionResult | MetadataExtr
* Utility to format PDF date strings to required format (yyyy/MM/dd HH:mm:ss) * Utility to format PDF date strings to required format (yyyy/MM/dd HH:mm:ss)
* Handles PDF date format: "D:YYYYMMDDHHmmSSOHH'mm'" or standard date strings * Handles PDF date format: "D:YYYYMMDDHHmmSSOHH'mm'" or standard date strings
*/ */
function formatPDFDate(dateString: unknown): string { function formatPDFDate(dateString: string): string {
if (!dateString || typeof dateString !== 'string') { if (!dateString) {
return ''; return '';
} }
@ -80,14 +81,14 @@ function convertTrappedStatus(trapped: unknown): TrappedStatus {
* Extract custom metadata fields from PDF.js info object * Extract custom metadata fields from PDF.js info object
* Custom metadata is nested under the "Custom" key * Custom metadata is nested under the "Custom" key
*/ */
function extractCustomMetadata(info: Record<string, unknown>): CustomMetadataEntry[] { function extractCustomMetadata(custom: unknown): CustomMetadataEntry[] {
const customMetadata: CustomMetadataEntry[] = []; const customMetadata: CustomMetadataEntry[] = [];
let customIdCounter = 1; let customIdCounter = 1;
// Check if there's a Custom object containing the custom metadata // Check if there's a Custom object containing the custom metadata
if (info.Custom && typeof info.Custom === 'object' && info.Custom !== null) { if (typeof custom === 'object' && custom !== null) {
const customObj = info.Custom as Record<string, unknown>; const customObj = custom as Record<string, unknown>;
Object.entries(customObj).forEach(([key, value]) => { Object.entries(customObj).forEach(([key, value]) => {
if (value != null && value !== '') { if (value != null && value !== '') {
@ -107,7 +108,7 @@ function extractCustomMetadata(info: Record<string, unknown>): CustomMetadataEnt
/** /**
* Safely cleanup PDF document with error handling * Safely cleanup PDF document with error handling
*/ */
function cleanupPdfDocument(pdfDoc: any): void { function cleanupPdfDocument(pdfDoc: PDFDocumentProxy | null): void {
if (pdfDoc) { if (pdfDoc) {
try { try {
pdfWorkerManager.destroyDocument(pdfDoc); pdfWorkerManager.destroyDocument(pdfDoc);
@ -117,6 +118,14 @@ function cleanupPdfDocument(pdfDoc: any): void {
} }
} }
/**
 * Read a single field from a metadata info record as a string.
 *
 * @param info - Record of metadata fields with values of unknown type.
 * @param key - Name of the field to read.
 * @returns The value stored under `key` when it is a string, otherwise ''.
 */
function getStringMetadata(info: Record<string, unknown>, key: string): string {
  // Read once into a local so the typeof check narrows the value being returned.
  const value = info[key];
  return typeof value === 'string' ? value : '';
}
/** /**
* Extract all metadata from a PDF file * Extract all metadata from a PDF file
* Returns a result object with success/error state * Returns a result object with success/error state
@ -131,9 +140,9 @@ export async function extractPDFMetadata(file: File): Promise<MetadataExtraction
}; };
} }
let pdfDoc: any = null; let pdfDoc: PDFDocumentProxy | null = null;
let arrayBuffer: ArrayBuffer; let arrayBuffer: ArrayBuffer;
let metadata: any; let metadata;
try { try {
arrayBuffer = await file.arrayBuffer(); arrayBuffer = await file.arrayBuffer();
@ -151,20 +160,20 @@ export async function extractPDFMetadata(file: File): Promise<MetadataExtraction
}; };
} }
const info = metadata.info || {}; const info = metadata.info as Record<string, unknown>;
// Safely extract metadata with proper type checking // Safely extract metadata with proper type checking
const extractedMetadata: ExtractedPDFMetadata = { const extractedMetadata: ExtractedPDFMetadata = {
title: typeof info.Title === 'string' ? info.Title : '', title: getStringMetadata(info, 'Title'),
author: typeof info.Author === 'string' ? info.Author : '', author: getStringMetadata(info, 'Author'),
subject: typeof info.Subject === 'string' ? info.Subject : '', subject: getStringMetadata(info, 'Subject'),
keywords: typeof info.Keywords === 'string' ? info.Keywords : '', keywords: getStringMetadata(info, 'Keywords'),
creator: typeof info.Creator === 'string' ? info.Creator : '', creator: getStringMetadata(info, 'Creator'),
producer: typeof info.Producer === 'string' ? info.Producer : '', producer: getStringMetadata(info, 'Producer'),
creationDate: formatPDFDate(info.CreationDate), creationDate: formatPDFDate(getStringMetadata(info, 'CreationDate')),
modificationDate: formatPDFDate(info.ModDate), modificationDate: formatPDFDate(getStringMetadata(info, 'ModDate')),
trapped: convertTrappedStatus(info.Trapped), trapped: convertTrappedStatus(info.Trapped),
customMetadata: extractCustomMetadata(info) customMetadata: extractCustomMetadata(info.Custom),
}; };
cleanupPdfDocument(pdfDoc); cleanupPdfDocument(pdfDoc);

View File

@ -6,11 +6,12 @@
*/ */
import * as pdfjsLib from 'pdfjs-dist'; import * as pdfjsLib from 'pdfjs-dist';
import { PDFDocumentProxy } from 'pdfjs-dist/types/src/display/api';
const { getDocument, GlobalWorkerOptions } = pdfjsLib; const { getDocument, GlobalWorkerOptions } = pdfjsLib;
class PDFWorkerManager { class PDFWorkerManager {
private static instance: PDFWorkerManager; private static instance: PDFWorkerManager;
private activeDocuments = new Set<any>(); private activeDocuments = new Set<PDFDocumentProxy>();
private workerCount = 0; private workerCount = 0;
private maxWorkers = 10; // Limit concurrent workers private maxWorkers = 10; // Limit concurrent workers
private isInitialized = false; private isInitialized = false;
@ -48,7 +49,7 @@ class PDFWorkerManager {
stopAtErrors?: boolean; stopAtErrors?: boolean;
verbosity?: number; verbosity?: number;
} = {} } = {}
): Promise<any> { ): Promise<PDFDocumentProxy> {
// Wait if we've hit the worker limit // Wait if we've hit the worker limit
if (this.activeDocuments.size >= this.maxWorkers) { if (this.activeDocuments.size >= this.maxWorkers) {
await this.waitForAvailableWorker(); await this.waitForAvailableWorker();
@ -104,7 +105,7 @@ class PDFWorkerManager {
/** /**
* Properly destroy a PDF document and clean up resources * Properly destroy a PDF document and clean up resources
*/ */
destroyDocument(pdf: any): void { destroyDocument(pdf: PDFDocumentProxy): void {
if (this.activeDocuments.has(pdf)) { if (this.activeDocuments.has(pdf)) {
try { try {
pdf.destroy(); pdf.destroy();