Properly type PDFs

2025-09-18 01:19:24 +00:00 · 2025-09-12 12:14:54 +01:00 · 2025-09-12 12:14:54 +01:00 · 6a1caf0904
commit 6a1caf0904
parent e31e6461e4
2 changed files with 31 additions and 21 deletions
--- a/frontend/src/services/pdfMetadataService.ts
+++ b/frontend/src/services/pdfMetadataService.ts
@ -1,6 +1,7 @@
 import { pdfWorkerManager } from './pdfWorkerManager';
 import { FileAnalyzer } from './fileAnalyzer';
 import { TrappedStatus, CustomMetadataEntry, ExtractedPDFMetadata } from '../types/metadata';
+import { PDFDocumentProxy } from 'pdfjs-dist/types/src/display/api';

 export interface MetadataExtractionResult {
  success: true;
@ -18,8 +19,8 @@ export type MetadataExtractionResponse = MetadataExtractionResult | MetadataExtr
 * Utility to format PDF date strings to required format (yyyy/MM/dd HH:mm:ss)
 * Handles PDF date format: "D:YYYYMMDDHHmmSSOHH'mm'" or standard date strings
 */
-function formatPDFDate(dateString: unknown): string {
-  if (!dateString || typeof dateString !== 'string') {
+function formatPDFDate(dateString: string): string {
+  if (!dateString) {
    return '';
  }

@ -80,14 +81,14 @@ function convertTrappedStatus(trapped: unknown): TrappedStatus {
 * Extract custom metadata fields from PDF.js info object
 * Custom metadata is nested under the "Custom" key
 */
-function extractCustomMetadata(info: Record<string, unknown>): CustomMetadataEntry[] {
+function extractCustomMetadata(custom: unknown): CustomMetadataEntry[] {
  const customMetadata: CustomMetadataEntry[] = [];
  let customIdCounter = 1;


  // Check if there's a Custom object containing the custom metadata
-  if (info.Custom && typeof info.Custom === 'object' && info.Custom !== null) {
-    const customObj = info.Custom as Record<string, unknown>;
+  if (typeof custom === 'object' && custom !== null) {
+    const customObj = custom as Record<string, unknown>;

    Object.entries(customObj).forEach(([key, value]) => {
      if (value != null && value !== '') {
@ -107,7 +108,7 @@ function extractCustomMetadata(info: Record<string, unknown>): CustomMetadataEnt
 /**
 * Safely cleanup PDF document with error handling
 */
-function cleanupPdfDocument(pdfDoc: any): void {
+function cleanupPdfDocument(pdfDoc: PDFDocumentProxy | null): void {
  if (pdfDoc) {
    try {
      pdfWorkerManager.destroyDocument(pdfDoc);
@ -117,6 +118,14 @@ function cleanupPdfDocument(pdfDoc: any): void {
  }
 }

+/** Returns `info[key]` when that entry is a string; otherwise returns ''. */
+function getStringMetadata(info: Record<string, unknown>, key: string): string {
+  // Read the entry once: `typeof info[key]` with a non-literal key is only
+  // narrowed on TypeScript 5.5+, whereas a local const narrows on any version.
+  const value = info[key];
+  return typeof value === 'string' ? value : '';
+}
+
 /**
 * Extract all metadata from a PDF file
 * Returns a result object with success/error state
@ -131,9 +140,9 @@ export async function extractPDFMetadata(file: File): Promise<MetadataExtraction
    };
  }

-  let pdfDoc: any = null;
+  let pdfDoc: PDFDocumentProxy | null = null;
  let arrayBuffer: ArrayBuffer;
-  let metadata: any;
+  let metadata;

  try {
    arrayBuffer = await file.arrayBuffer();
@ -151,20 +160,20 @@ export async function extractPDFMetadata(file: File): Promise<MetadataExtraction
    };
  }

-  const info = metadata.info || {};
+  const info = metadata.info as Record<string, unknown>;

  // Safely extract metadata with proper type checking
  const extractedMetadata: ExtractedPDFMetadata = {
-    title: typeof info.Title === 'string' ? info.Title : '',
-    author: typeof info.Author === 'string' ? info.Author : '',
-    subject: typeof info.Subject === 'string' ? info.Subject : '',
-    keywords: typeof info.Keywords === 'string' ? info.Keywords : '',
-    creator: typeof info.Creator === 'string' ? info.Creator : '',
-    producer: typeof info.Producer === 'string' ? info.Producer : '',
-    creationDate: formatPDFDate(info.CreationDate),
-    modificationDate: formatPDFDate(info.ModDate),
+    title: getStringMetadata(info, 'Title'),
+    author: getStringMetadata(info, 'Author'),
+    subject: getStringMetadata(info, 'Subject'),
+    keywords: getStringMetadata(info, 'Keywords'),
+    creator: getStringMetadata(info, 'Creator'),
+    producer: getStringMetadata(info, 'Producer'),
+    creationDate: formatPDFDate(getStringMetadata(info, 'CreationDate')),
+    modificationDate: formatPDFDate(getStringMetadata(info, 'ModDate')),
    trapped: convertTrappedStatus(info.Trapped),
-    customMetadata: extractCustomMetadata(info)
+    customMetadata: extractCustomMetadata(info.Custom),
  };

  cleanupPdfDocument(pdfDoc);
--- a/frontend/src/services/pdfWorkerManager.ts
+++ b/frontend/src/services/pdfWorkerManager.ts
@ -6,11 +6,12 @@
 */

 import * as pdfjsLib from 'pdfjs-dist';
+import { PDFDocumentProxy } from 'pdfjs-dist/types/src/display/api';
 const { getDocument, GlobalWorkerOptions } = pdfjsLib;

 class PDFWorkerManager {
  private static instance: PDFWorkerManager;
-  private activeDocuments = new Set<any>();
+  private activeDocuments = new Set<PDFDocumentProxy>();
  private workerCount = 0;
  private maxWorkers = 10; // Limit concurrent workers
  private isInitialized = false;
@ -48,7 +49,7 @@ class PDFWorkerManager {
      stopAtErrors?: boolean;
      verbosity?: number;
    } = {}
-  ): Promise<any> {
+  ): Promise<PDFDocumentProxy> {
    // Wait if we've hit the worker limit
    if (this.activeDocuments.size >= this.maxWorkers) {
      await this.waitForAvailableWorker();
@ -104,7 +105,7 @@ class PDFWorkerManager {
  /**
   * Properly destroy a PDF document and clean up resources
   */
-  destroyDocument(pdf: any): void {
+  destroyDocument(pdf: PDFDocumentProxy): void {
    if (this.activeDocuments.has(pdf)) {
      try {
        pdf.destroy();