From 8c680b639eb62e28397823ca808d33503aa2294a Mon Sep 17 00:00:00 2001 From: EthanHealy01 Date: Thu, 31 Jul 2025 19:29:06 +0100 Subject: [PATCH] merge language files to one object --- .../components/tools/ocr/LanguagePicker.tsx | 3 +- frontend/src/utils/languageMapping.ts | 1045 +++++++++++++---- frontend/src/utils/tempOcrLanguages.ts | 135 --- 3 files changed, 842 insertions(+), 341 deletions(-) delete mode 100644 frontend/src/utils/tempOcrLanguages.ts diff --git a/frontend/src/components/tools/ocr/LanguagePicker.tsx b/frontend/src/components/tools/ocr/LanguagePicker.tsx index 8d425d1ed..31f0fe301 100644 --- a/frontend/src/components/tools/ocr/LanguagePicker.tsx +++ b/frontend/src/components/tools/ocr/LanguagePicker.tsx @@ -1,8 +1,7 @@ import React, { useState, useEffect } from 'react'; import { Text, Loader } from '@mantine/core'; import { useTranslation } from 'react-i18next'; -import { tempOcrLanguages } from '../../../utils/tempOcrLanguages'; -import { getAutoOcrLanguage } from '../../../utils/languageMapping'; +import { tempOcrLanguages, getAutoOcrLanguage } from '../../../utils/languageMapping'; import DropdownListWithFooter, { DropdownItem } from '../../shared/DropdownListWithFooter'; export interface LanguageOption { diff --git a/frontend/src/utils/languageMapping.ts b/frontend/src/utils/languageMapping.ts index c716c8843..687c63258 100644 --- a/frontend/src/utils/languageMapping.ts +++ b/frontend/src/utils/languageMapping.ts @@ -1,336 +1,899 @@ -// Mapping from browser language codes to OCR language codes -// Handles exact matches and similar language fallbacks +// Unified Language System - Tri-directional mapping between browser languages, OCR codes, and display names +// Replaces both languageMapping.ts and tempOcrLanguages.ts -interface LanguageMapping { - [browserCode: string]: string; +interface LanguageDefinition { + ocrCode: string; + displayName: string; + browserCodes: string[]; } -// Primary mapping from browser language codes to OCR language codes -const browserToOcrMapping: LanguageMapping = { - // English variants - 'en': 'eng', - 'en-US': 'eng', - 'en-GB': 'eng', - 'en-AU': 'eng', - 'en-CA': 'eng', - 'en-IE': 'eng', - 'en-NZ': 'eng', - 'en-ZA': 'eng', +// Comprehensive language definitions with all mappings +const languageDefinitions: LanguageDefinition[] = [ + // English + { + ocrCode: 'eng', + displayName: 'English', + browserCodes: ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IE', 'en-NZ', 'en-ZA'] + }, - // Spanish variants - 'es': 'spa', - 'es-ES': 'spa', - 'es-MX': 'spa', - 'es-AR': 'spa', - 'es-CO': 'spa', - 'es-CL': 'spa', - 'es-PE': 'spa', - 'es-VE': 'spa', + // Spanish + { + ocrCode: 'spa', + displayName: 'Spanish', + browserCodes: ['es', 'es-ES', 'es-MX', 'es-AR', 'es-CO', 'es-CL', 'es-PE', 'es-VE'] + }, - // French variants - 'fr': 'fra', - 'fr-FR': 'fra', - 'fr-CA': 'fra', - 'fr-BE': 'fra', - 'fr-CH': 'fra', + // French + { + ocrCode: 'fra', + displayName: 'French', + browserCodes: ['fr', 'fr-FR', 'fr-CA', 'fr-BE', 'fr-CH'] + }, - // German variants - 'de': 'deu', - 'de-DE': 'deu', - 'de-AT': 'deu', - 'de-CH': 'deu', + // German + { + ocrCode: 'deu', + displayName: 'German', + browserCodes: ['de', 'de-DE', 'de-AT', 'de-CH'] + }, - // Portuguese variants - 'pt': 'por', - 'pt-PT': 'por', - 'pt-BR': 'por', + // Portuguese + { + ocrCode: 'por', + displayName: 'Portuguese', + browserCodes: ['pt', 'pt-PT', 'pt-BR'] + }, - // Italian variants - 'it': 'ita', - 'it-IT': 'ita', - 'it-CH': 'ita', + // Italian + { + ocrCode: 'ita', + displayName: 'Italian', + browserCodes: ['it', 'it-IT', 'it-CH'] + }, - // Chinese variants - 'zh': 'chi_sim', - 'zh-CN': 'chi_sim', - 'zh-Hans': 'chi_sim', - 'zh-TW': 'chi_tra', - 'zh-HK': 'chi_tra', - 'zh-Hant': 'chi_tra', - 'zh-BO': 'bod', + // Chinese Simplified + { + ocrCode: 'chi_sim', + displayName: 'Chinese (Simplified)', + browserCodes: ['zh', 'zh-CN', 'zh-Hans'] + }, + + // Chinese Traditional + { + ocrCode: 'chi_tra', + displayName: 'Chinese (Traditional)', + browserCodes: ['zh-TW', 'zh-HK', 'zh-Hant'] + }, + + // Tibetan + { + ocrCode: 'bod', + displayName: 'Tibetan', + browserCodes: ['bo', 'zh-BO'] + }, // Japanese - 'ja': 'jpn', - 'ja-JP': 'jpn', + { + ocrCode: 'jpn', + displayName: 'Japanese', + browserCodes: ['ja', 'ja-JP'] + }, // Korean - 'ko': 'kor', - 'ko-KR': 'kor', + { + ocrCode: 'kor', + displayName: 'Korean', + browserCodes: ['ko', 'ko-KR'] + }, - // Russian variants - 'ru': 'rus', - 'ru-RU': 'rus', + // Russian + { + ocrCode: 'rus', + displayName: 'Russian', + browserCodes: ['ru', 'ru-RU'] + }, - // Arabic variants - 'ar': 'ara', - 'ar-SA': 'ara', - 'ar-EG': 'ara', - 'ar-AE': 'ara', - 'ar-MA': 'ara', + // Arabic + { + ocrCode: 'ara', + displayName: 'Arabic', + browserCodes: ['ar', 'ar-SA', 'ar-EG', 'ar-AE', 'ar-MA'] + }, - // Dutch variants - 'nl': 'nld', - 'nl-NL': 'nld', - 'nl-BE': 'nld', + // Dutch + { + ocrCode: 'nld', + displayName: 'Dutch; Flemish', + browserCodes: ['nl', 'nl-NL', 'nl-BE'] + }, // Polish - 'pl': 'pol', - 'pl-PL': 'pol', + { + ocrCode: 'pol', + displayName: 'Polish', + browserCodes: ['pl', 'pl-PL'] + }, // Czech - 'cs': 'ces', - 'cs-CZ': 'ces', + { + ocrCode: 'ces', + displayName: 'Czech', + browserCodes: ['cs', 'cs-CZ'] + }, // Slovak - 'sk': 'slk', - 'sk-SK': 'slk', + { + ocrCode: 'slk', + displayName: 'Slovak', + browserCodes: ['sk', 'sk-SK'] + }, // Hungarian - 'hu': 'hun', - 'hu-HU': 'hun', + { + ocrCode: 'hun', + displayName: 'Hungarian', + browserCodes: ['hu', 'hu-HU'] + }, // Romanian - 'ro': 'ron', - 'ro-RO': 'ron', + { + ocrCode: 'ron', + displayName: 'Romanian, Moldavian, Moldovan', + browserCodes: ['ro', 'ro-RO'] + }, // Bulgarian - 'bg': 'bul', - 'bg-BG': 'bul', + { + ocrCode: 'bul', + displayName: 'Bulgarian', + browserCodes: ['bg', 'bg-BG'] + }, // Croatian - 'hr': 'hrv', - 'hr-HR': 'hrv', + { + ocrCode: 'hrv', + displayName: 'Croatian', + browserCodes: ['hr', 'hr-HR'] + }, // Serbian - 'sr': 'srp', - 'sr-RS': 'srp', - 'sr-Latn': 'srp_latn', + { + ocrCode: 'srp', + displayName: 'Serbian', + browserCodes: ['sr', 'sr-RS'] + }, + + // Serbian Latin + { + ocrCode: 'srp_latn', + displayName: 'Serbian (Latin)', + browserCodes: ['sr-Latn'] + }, // Slovenian - 'sl': 'slv', - 'sl-SI': 'slv', + { + ocrCode: 'slv', + displayName: 'Slovenian', + browserCodes: ['sl', 'sl-SI'] + }, // Estonian - 'et': 'est', - 'et-EE': 'est', + { + ocrCode: 'est', + displayName: 'Estonian', + browserCodes: ['et', 'et-EE'] + }, // Latvian - 'lv': 'lav', - 'lv-LV': 'lav', + { + ocrCode: 'lav', + displayName: 'Latvian', + browserCodes: ['lv', 'lv-LV'] + }, // Lithuanian - 'lt': 'lit', - 'lt-LT': 'lit', + { + ocrCode: 'lit', + displayName: 'Lithuanian', + browserCodes: ['lt', 'lt-LT'] + }, // Finnish - 'fi': 'fin', - 'fi-FI': 'fin', + { + ocrCode: 'fin', + displayName: 'Finnish', + browserCodes: ['fi', 'fi-FI'] + }, // Swedish - 'sv': 'swe', - 'sv-SE': 'swe', + { + ocrCode: 'swe', + displayName: 'Swedish', + browserCodes: ['sv', 'sv-SE'] + }, // Norwegian - 'no': 'nor', - 'nb': 'nor', - 'nn': 'nor', - 'no-NO': 'nor', - 'nb-NO': 'nor', - 'nn-NO': 'nor', + { + ocrCode: 'nor', + displayName: 'Norwegian', + browserCodes: ['no', 'nb', 'nn', 'no-NO', 'nb-NO', 'nn-NO'] + }, // Danish - 'da': 'dan', - 'da-DK': 'dan', + { + ocrCode: 'dan', + displayName: 'Danish', + browserCodes: ['da', 'da-DK'] + }, // Icelandic - 'is': 'isl', - 'is-IS': 'isl', + { + ocrCode: 'isl', + displayName: 'Icelandic', + browserCodes: ['is', 'is-IS'] + }, // Greek - 'el': 'ell', - 'el-GR': 'ell', + { + ocrCode: 'ell', + displayName: 'Greek', + browserCodes: ['el', 'el-GR'] + }, // Turkish - 'tr': 'tur', - 'tr-TR': 'tur', + { + ocrCode: 'tur', + displayName: 'Turkish', + browserCodes: ['tr', 'tr-TR'] + }, // Hebrew - 'he': 'heb', - 'he-IL': 'heb', + { + ocrCode: 'heb', + displayName: 'Hebrew', + browserCodes: ['he', 'he-IL'] + }, // Hindi - 'hi': 'hin', - 'hi-IN': 'hin', + { + ocrCode: 'hin', + displayName: 'Hindi', + browserCodes: ['hi', 'hi-IN'] + }, // Thai - 'th': 'tha', - 'th-TH': 'tha', + { + ocrCode: 'tha', + displayName: 'Thai', + browserCodes: ['th', 'th-TH'] + }, // Vietnamese - 'vi': 'vie', - 'vi-VN': 'vie', + { + ocrCode: 'vie', + displayName: 'Vietnamese', + browserCodes: ['vi', 'vi-VN'] + }, // Indonesian - 'id': 'ind', - 'id-ID': 'ind', + { + ocrCode: 'ind', + displayName: 'Indonesian', + browserCodes: ['id', 'id-ID'] + }, // Malay - 'ms': 'msa', - 'ms-MY': 'msa', + { + ocrCode: 'msa', + displayName: 'Malay', + browserCodes: ['ms', 'ms-MY'] + }, - // Filipino/Tagalog - 'fil': 'fil', - 'tl': 'tgl', + // Filipino + { + ocrCode: 'fil', + displayName: 'Filipino', + browserCodes: ['fil'] + }, + + // Tagalog + { + ocrCode: 'tgl', + displayName: 'Tagalog', + browserCodes: ['tl'] + }, // Ukrainian - 'uk': 'ukr', - 'uk-UA': 'ukr', + { + ocrCode: 'ukr', + displayName: 'Ukrainian', + browserCodes: ['uk', 'uk-UA'] + }, // Belarusian - 'be': 'bel', - 'be-BY': 'bel', + { + ocrCode: 'bel', + displayName: 'Belarusian', + browserCodes: ['be', 'be-BY'] + }, // Kazakh - 'kk': 'kaz', - 'kk-KZ': 'kaz', + { + ocrCode: 'kaz', + displayName: 'Kazakh', + browserCodes: ['kk', 'kk-KZ'] + }, // Uzbek - 'uz': 'uzb', - 'uz-UZ': 'uzb', + { + ocrCode: 'uzb', + displayName: 'Uzbek', + browserCodes: ['uz', 'uz-UZ'] + }, // Georgian - 'ka': 'kat', - 'ka-GE': 'kat', + { + ocrCode: 'kat', + displayName: 'Georgian', + browserCodes: ['ka', 'ka-GE'] + }, // Armenian - 'hy': 'hye', - 'hy-AM': 'hye', + { + ocrCode: 'hye', + displayName: 'Armenian', + browserCodes: ['hy', 'hy-AM'] + }, // Azerbaijani - 'az': 'aze', - 'az-AZ': 'aze', + { + ocrCode: 'aze', + displayName: 'Azerbaijani', + browserCodes: ['az', 'az-AZ'] + }, // Persian/Farsi - 'fa': 'fas', - 'fa-IR': 'fas', + { + ocrCode: 'fas', + displayName: 'Persian', + browserCodes: ['fa', 'fa-IR'] + }, // Urdu - 'ur': 'urd', - 'ur-PK': 'urd', + { + ocrCode: 'urd', + displayName: 'Urdu', + browserCodes: ['ur', 'ur-PK'] + }, // Bengali - 'bn': 'ben', - 'bn-BD': 'ben', - 'bn-IN': 'ben', + { + ocrCode: 'ben', + displayName: 'Bengali', + browserCodes: ['bn', 'bn-BD', 'bn-IN'] + }, // Tamil - 'ta': 'tam', - 'ta-IN': 'tam', - 'ta-LK': 'tam', + { + ocrCode: 'tam', + displayName: 'Tamil', + browserCodes: ['ta', 'ta-IN', 'ta-LK'] + }, // Telugu - 'te': 'tel', - 'te-IN': 'tel', + { + ocrCode: 'tel', + displayName: 'Telugu', + browserCodes: ['te', 'te-IN'] + }, // Kannada - 'kn': 'kan', - 'kn-IN': 'kan', + { + ocrCode: 'kan', + displayName: 'Kannada', + browserCodes: ['kn', 'kn-IN'] + }, // Malayalam - 'ml': 'mal', - 'ml-IN': 'mal', + { + ocrCode: 'mal', + displayName: 'Malayalam', + browserCodes: ['ml', 'ml-IN'] + }, // Gujarati - 'gu': 'guj', - 'gu-IN': 'guj', + { + ocrCode: 'guj', + displayName: 'Gujarati', + browserCodes: ['gu', 'gu-IN'] + }, // Marathi - 'mr': 'mar', - 'mr-IN': 'mar', + { + ocrCode: 'mar', + displayName: 'Marathi', + browserCodes: ['mr', 'mr-IN'] + }, // Punjabi - 'pa': 'pan', - 'pa-IN': 'pan', + { + ocrCode: 'pan', + displayName: 'Panjabi, Punjabi', + browserCodes: ['pa', 'pa-IN'] + }, // Nepali - 'ne': 'nep', - 'ne-NP': 'nep', + { + ocrCode: 'nep', + displayName: 'Nepali', + browserCodes: ['ne', 'ne-NP'] + }, // Sinhala - 'si': 'sin', - 'si-LK': 'sin', + { + ocrCode: 'sin', + displayName: 'Sinhala, Sinhalese', + browserCodes: ['si', 'si-LK'] + }, // Burmese - 'my': 'mya', - 'my-MM': 'mya', + { + ocrCode: 'mya', + displayName: 'Burmese', + browserCodes: ['my', 'my-MM'] + }, // Khmer - 'km': 'khm', - 'km-KH': 'khm', + { + ocrCode: 'khm', + displayName: 'Central Khmer', + browserCodes: ['km', 'km-KH'] + }, // Lao - 'lo': 'lao', - 'lo-LA': 'lao', + { + ocrCode: 'lao', + displayName: 'Lao', + browserCodes: ['lo', 'lo-LA'] + }, // Mongolian - 'mn': 'mon', - 'mn-MN': 'mon', + { + ocrCode: 'mon', + displayName: 'Mongolian', + browserCodes: ['mn', 'mn-MN'] + }, // Welsh - 'cy': 'cym', - 'cy-GB': 'cym', + { + ocrCode: 'cym', + displayName: 'Welsh', + browserCodes: ['cy', 'cy-GB'] + }, // Irish - 'ga': 'gle', - 'ga-IE': 'gle', + { + ocrCode: 'gle', + displayName: 'Irish', + browserCodes: ['ga', 'ga-IE'] + }, // Scottish Gaelic - 'gd': 'gla', - 'gd-GB': 'gla', + { + ocrCode: 'gla', + displayName: 'Scottish Gaelic', + browserCodes: ['gd', 'gd-GB'] + }, // Basque - 'eu': 'eus', - 'eu-ES': 'eus', + { + ocrCode: 'eus', + displayName: 'Basque', + browserCodes: ['eu', 'eu-ES'] + }, // Catalan - 'ca': 'cat', - 'ca-ES': 'cat', + { + ocrCode: 'cat', + displayName: 'Catalan', + browserCodes: ['ca', 'ca-ES'] + }, // Galician - 'gl': 'glg', - 'gl-ES': 'glg', + { + ocrCode: 'glg', + displayName: 'Galician', + browserCodes: ['gl', 'gl-ES'] + }, // Macedonian - 'mk': 'mkd', - 'mk-MK': 'mkd', + { + ocrCode: 'mkd', + displayName: 'Macedonian', + browserCodes: ['mk', 'mk-MK'] + }, // Albanian - 'sq': 'sqi', - 'sq-AL': 'sqi', + { + ocrCode: 'sqi', + displayName: 'Albanian', + browserCodes: ['sq', 'sq-AL'] + }, // Maltese - 'mt': 'mlt', - 'mt-MT': 'mlt', + { + ocrCode: 'mlt', + displayName: 'Maltese', + browserCodes: ['mt', 'mt-MT'] + }, // Afrikaans - 'af': 'afr', - 'af-ZA': 'afr', + { + ocrCode: 'afr', + displayName: 'Afrikaans', + browserCodes: ['af', 'af-ZA'] + }, // Swahili - 'sw': 'swa', - 'sw-KE': 'swa', - 'sw-TZ': 'swa', -}; + { + ocrCode: 'swa', + displayName: 'Swahili', + browserCodes: ['sw', 'sw-KE', 'sw-TZ'] + }, + + // Amharic + { + ocrCode: 'amh', + displayName: 'Amharic', + browserCodes: ['am'] + }, + + // Assamese + { + ocrCode: 'asm', + displayName: 'Assamese', + browserCodes: ['as'] + }, + + // Azerbaijani (Cyrillic) + { + ocrCode: 'aze_cyrl', + displayName: 'Azerbaijani (Cyrillic)', + browserCodes: [] + }, + + // Bosnian + { + ocrCode: 'bos', + displayName: 'Bosnian', + browserCodes: ['bs'] + }, + + // Breton + { + ocrCode: 'bre', + displayName: 'Breton', + browserCodes: ['br'] + }, + + // Bambara + { + ocrCode: 'bam', + displayName: 'Bambara', + browserCodes: ['bm'] + }, + + // Bashkir + { + ocrCode: 'bak', + displayName: 'Bashkir', + browserCodes: ['ba'] + }, + + // Cornish + { + ocrCode: 'cor', + displayName: 'Cornish', + browserCodes: ['kw'] + }, + + // Corsican + { + ocrCode: 'cos', + displayName: 'Corsican', + browserCodes: ['co'] + }, + + // Ewe + { + ocrCode: 'ewe', + displayName: 'Ewe', + browserCodes: ['ee'] + }, + + // Faroese + { + ocrCode: 'fao', + displayName: 'Faroese', + browserCodes: ['fo'] + }, + + // Fijian + { + ocrCode: 'fij', + displayName: 'Fijian', + browserCodes: ['fj'] + }, + + // Haitian Creole + { + ocrCode: 'hat', + displayName: 'Haitian, Haitian Creole', + browserCodes: ['ht'] + }, + + // Javanese + { + ocrCode: 'jav', + displayName: 'Javanese', + browserCodes: ['jv'] + }, + + // Kirghiz + { + ocrCode: 'kir', + displayName: 'Kirghiz, Kyrgyz', + browserCodes: ['ky'] + }, + + // Quechua + { + ocrCode: 'que', + displayName: 'Quechua', + browserCodes: ['qu'] + }, + + // Sindhi + { + ocrCode: 'snd', + displayName: 'Sindhi', + browserCodes: ['sd'] + }, + + // Yiddish + { + ocrCode: 'yid', + displayName: 'Yiddish', + browserCodes: ['yi'] + }, + + // Yoruba + { + ocrCode: 'yor', + displayName: 'Yoruba', + browserCodes: ['yo'] + }, + + // Additional OCR languages without browser mappings or with very specific/rare codes + { + ocrCode: 'ceb', + displayName: 'Cebuano', + browserCodes: [] + }, + { + ocrCode: 'chi_sim_vert', + displayName: 'Chinese (Simplified, Vertical)', + browserCodes: [] + }, + { + ocrCode: 'chi_tra_vert', + displayName: 'Chinese (Traditional, Vertical)', + browserCodes: [] + }, + { + ocrCode: 'chr', + displayName: 'Cherokee', + browserCodes: [] + }, + { + ocrCode: 'dan_frak', + displayName: 'Danish (Fraktur)', + browserCodes: [] + }, + { + ocrCode: 'deu_frak', + displayName: 'German (Fraktur)', + browserCodes: [] + }, + { + ocrCode: 'div', + displayName: 'Divehi', + browserCodes: ['dv'] + }, + { + ocrCode: 'dzo', + displayName: 'Dzongkha', + browserCodes: ['dz'] + }, + { + ocrCode: 'enm', + displayName: 'English, Middle (1100-1500)', + browserCodes: [] + }, + { + ocrCode: 'epo', + displayName: 'Esperanto', + browserCodes: ['eo'] + }, + { + ocrCode: 'equ', + displayName: 'Math / equation detection module', + browserCodes: [] + }, + { + ocrCode: 'frk', + displayName: 'Frankish', + browserCodes: [] + }, + { + ocrCode: 'frm', + displayName: 'French, Middle (ca.1400-1600)', + browserCodes: [] + }, + { + ocrCode: 'fry', + displayName: 'Western Frisian', + browserCodes: ['fy'] + }, + { + ocrCode: 'grc', + displayName: 'Ancient Greek', + browserCodes: [] + }, + { + ocrCode: 'iku', + displayName: 'Inuktitut', + browserCodes: ['iu'] + }, + { + ocrCode: 'ita_old', + displayName: 'Italian (Old)', + browserCodes: [] + }, + { + ocrCode: 'jpn_vert', + displayName: 'Japanese (Vertical)', + browserCodes: [] + }, + { + ocrCode: 'kat_old', + displayName: 'Georgian (Old)', + browserCodes: [] + }, + { + ocrCode: 'kmr', + displayName: 'Northern Kurdish', + browserCodes: ['ku'] + }, + { + ocrCode: 'kor_vert', + displayName: 'Korean (Vertical)', + browserCodes: [] + }, + { + ocrCode: 'lat', + displayName: 'Latin', + browserCodes: ['la'] + }, + { + ocrCode: 'ltz', + displayName: 'Luxembourgish', + browserCodes: ['lb'] + }, + { + ocrCode: 'mri', + displayName: 'Maori', + browserCodes: ['mi'] + }, + { + ocrCode: 'oci', + displayName: 'Occitan (post 1500)', + browserCodes: ['oc'] + }, + { + ocrCode: 'ori', + displayName: 'Oriya', + browserCodes: ['or'] + }, + { + ocrCode: 'osd', + displayName: 'Orientation and script detection module', + browserCodes: [] + }, + { + ocrCode: 'pus', + displayName: 'Pushto, Pashto', + browserCodes: ['ps'] + }, + { + ocrCode: 'san', + displayName: 'Sanskrit', + browserCodes: ['sa'] + }, + { + ocrCode: 'slk_frak', + displayName: 'Slovak (Fraktur)', + browserCodes: [] + }, + { + ocrCode: 'spa_old', + displayName: 'Spanish (Old)', + browserCodes: [] + }, + { + ocrCode: 'sun', + displayName: 'Sundanese', + browserCodes: ['su'] + }, + { + ocrCode: 'syr', + displayName: 'Syriac', + browserCodes: [] + }, + { + ocrCode: 'tat', + displayName: 'Tatar', + browserCodes: ['tt'] + }, + { + ocrCode: 'tgk', + displayName: 'Tajik', + browserCodes: ['tg'] + }, + { + ocrCode: 'tir', + displayName: 'Tigrinya', + browserCodes: ['ti'] + }, + { + ocrCode: 'ton', + displayName: 'Tonga (Tonga Islands)', + browserCodes: ['to'] + }, + { + ocrCode: 'uig', + displayName: 'Uighur, Uyghur', + browserCodes: ['ug'] + }, + { + ocrCode: 'uzb_cyrl', + displayName: 'Uzbek (Cyrillic)', + browserCodes: [] + } +]; + +// Build lookup maps for efficient access +const browserToOcrMap = new Map(); +const ocrToDisplayMap = new Map(); +const displayToOcrMap = new Map(); +const ocrToBrowserMap = new Map(); + +// Populate lookup maps +languageDefinitions.forEach(lang => { + // OCR code to display name + ocrToDisplayMap.set(lang.ocrCode, lang.displayName); + + // Display name to OCR code + displayToOcrMap.set(lang.displayName.toLowerCase(), lang.ocrCode); + + // OCR code to browser codes + ocrToBrowserMap.set(lang.ocrCode, lang.browserCodes); + + // Browser codes to OCR code + lang.browserCodes.forEach(browserCode => { + browserToOcrMap.set(browserCode.toLowerCase(), lang.ocrCode); + }); +}); /** * Maps a browser language code to an OCR language code @@ -338,6 +901,11 @@ const browserToOcrMapping: LanguageMapping = { * * @param browserLanguage - The browser language code (e.g., 'en-GB', 'fr-FR') * @returns OCR language code if found, null if no match + * + * @example + * mapBrowserLanguageToOcr('de-DE') // Returns 'deu' + * mapBrowserLanguageToOcr('en-GB') // Returns 'eng' + * mapBrowserLanguageToOcr('zh-CN') // Returns 'chi_sim' */ export function mapBrowserLanguageToOcr(browserLanguage: string): string | null { if (!browserLanguage) return null; @@ -346,38 +914,107 @@ export function mapBrowserLanguageToOcr(browserLanguage: string): string | null const normalizedInput = browserLanguage.toLowerCase().replace('_', '-'); // Try exact match first - const exactMatch = browserToOcrMapping[normalizedInput]; + const exactMatch = browserToOcrMap.get(normalizedInput); if (exactMatch) return exactMatch; // Try with different casing variations const variations = [ - browserLanguage, browserLanguage.toLowerCase(), - browserLanguage.toUpperCase(), + browserLanguage.toUpperCase().toLowerCase(), normalizedInput, ]; for (const variant of variations) { - const match = browserToOcrMapping[variant]; + const match = browserToOcrMap.get(variant); if (match) return match; } // Try base language code (e.g., 'en' from 'en-GB') const baseLanguage = normalizedInput.split('-')[0]; - const baseMatch = browserToOcrMapping[baseLanguage]; + const baseMatch = browserToOcrMap.get(baseLanguage); if (baseMatch) return baseMatch; // No match found return null; } +/** + * Gets the display name for an OCR language code + * + * @param ocrCode - The OCR language code (e.g., 'eng', 'deu') + * @returns Display name if found, the original code if not found + * + * @example + * getOcrDisplayName('deu') // Returns 'German' + * getOcrDisplayName('eng') // Returns 'English' + * getOcrDisplayName('chi_sim') // Returns 'Chinese (Simplified)' + */ +export function getOcrDisplayName(ocrCode: string): string { + return ocrToDisplayMap.get(ocrCode) || ocrCode; +} + +/** + * Gets the OCR code from a display name + * + * @param displayName - The display name (e.g., 'English', 'German') + * @returns OCR code if found, null if no match + * + * @example + * getOcrCodeFromDisplayName('German') // Returns 'deu' + * getOcrCodeFromDisplayName('English') // Returns 'eng' + * getOcrCodeFromDisplayName('chinese (simplified)') // Returns 'chi_sim' (case insensitive) + */ +export function getOcrCodeFromDisplayName(displayName: string): string | null { + return displayToOcrMap.get(displayName.toLowerCase()) || null; +} + +/** + * Gets the browser language codes for an OCR language code + * + * @param ocrCode - The OCR language code (e.g., 'eng', 'deu') + * @returns Array of browser language codes + * + * @example + * getBrowserLanguagesForOcr('deu') // Returns ['de', 'de-DE', 'de-AT', 'de-CH'] + * getBrowserLanguagesForOcr('eng') // Returns ['en', 'en-US', 'en-GB', 'en-AU', ...] + * getBrowserLanguagesForOcr('nor') // Returns ['no', 'nb', 'nn', 'no-NO', 'nb-NO', 'nn-NO'] + */ +export function getBrowserLanguagesForOcr(ocrCode: string): string[] { + return ocrToBrowserMap.get(ocrCode) || []; +} + /** * Gets the OCR language code for the current browser language * * @param currentLanguage - Current i18n language * @returns OCR language code array (empty if no match) + * + * @example + * getAutoOcrLanguage('de-DE') // Returns ['deu'] + * getAutoOcrLanguage('en-GB') // Returns ['eng'] + * getAutoOcrLanguage('unknown') // Returns [] */ export function getAutoOcrLanguage(currentLanguage: string): string[] { const ocrLanguage = mapBrowserLanguageToOcr(currentLanguage); return ocrLanguage ? [ocrLanguage] : []; -} \ No newline at end of file +} + +/** + * Gets all available language definitions + * + * @returns Array of all language definitions + * + * @example + * const allLanguages = getAllLanguageDefinitions(); + * // Returns: [{ ocrCode: 'eng', displayName: 'English', browserCodes: ['en', 'en-US', ...] }, ...] + */ +export function getAllLanguageDefinitions(): LanguageDefinition[] { + return [...languageDefinitions]; +} + +/** + * Legacy compatibility - provides the same interface as tempOcrLanguages.ts + */ +export const tempOcrLanguages = { + lang: Object.fromEntries(ocrToDisplayMap) +} as const; \ No newline at end of file diff --git a/frontend/src/utils/tempOcrLanguages.ts b/frontend/src/utils/tempOcrLanguages.ts deleted file mode 100644 index b42c54985..000000000 --- a/frontend/src/utils/tempOcrLanguages.ts +++ /dev/null @@ -1,135 +0,0 @@ -// TODO: Use actual language translations when they become available -// Temporary OCR language translations for development -export const tempOcrLanguages = { - "lang": { - "afr": "Afrikaans", - "amh": "Amharic", - "ara": "Arabic", - "asm": "Assamese", - "aze": "Azerbaijani", - "aze_cyrl": "Azerbaijani (Cyrillic)", - "bel": "Belarusian", - "ben": "Bengali", - "bod": "Tibetan", - "bos": "Bosnian", - "bre": "Breton", - "bul": "Bulgarian", - "cat": "Catalan", - "ceb": "Cebuano", - "ces": "Czech", - "chi_sim": "Chinese (Simplified)", - "chi_sim_vert": "Chinese (Simplified, Vertical)", - "chi_tra": "Chinese (Traditional)", - "chi_tra_vert": "Chinese (Traditional, Vertical)", - "chr": "Cherokee", - "cos": "Corsican", - "cym": "Welsh", - "dan": "Danish", - "dan_frak": "Danish (Fraktur)", - "deu": "German", - "deu_frak": "German (Fraktur)", - "div": "Divehi", - "dzo": "Dzongkha", - "ell": "Greek", - "eng": "English", - "enm": "English, Middle (1100-1500)", - "epo": "Esperanto", - "equ": "Math / equation detection module", - "est": "Estonian", - "eus": "Basque", - "fao": "Faroese", - "fas": "Persian", - "fil": "Filipino", - "fin": "Finnish", - "fra": "French", - "frk": "Frankish", - "frm": "French, Middle (ca.1400-1600)", - "fry": "Western Frisian", - "gla": "Scottish Gaelic", - "gle": "Irish", - "glg": "Galician", - "grc": "Ancient Greek", - "guj": "Gujarati", - "hat": "Haitian, Haitian Creole", - "heb": "Hebrew", - "hin": "Hindi", - "hrv": "Croatian", - "hun": "Hungarian", - "hye": "Armenian", - "iku": "Inuktitut", - "ind": "Indonesian", - "isl": "Icelandic", - "ita": "Italian", - "ita_old": "Italian (Old)", - "jav": "Javanese", - "jpn": "Japanese", - "jpn_vert": "Japanese (Vertical)", - "kan": "Kannada", - "kat": "Georgian", - "kat_old": "Georgian (Old)", - "kaz": "Kazakh", - "khm": "Central Khmer", - "kir": "Kirghiz, Kyrgyz", - "kmr": "Northern Kurdish", - "kor": "Korean", - "kor_vert": "Korean (Vertical)", - "lao": "Lao", - "lat": "Latin", - "lav": "Latvian", - "lit": "Lithuanian", - "ltz": "Luxembourgish", - "mal": "Malayalam", - "mar": "Marathi", - "mkd": "Macedonian", - "mlt": "Maltese", - "mon": "Mongolian", - "mri": "Maori", - "msa": "Malay", - "mya": "Burmese", - "nep": "Nepali", - "nld": "Dutch; Flemish", - "nor": "Norwegian", - "oci": "Occitan (post 1500)", - "ori": "Oriya", - "osd": "Orientation and script detection module", - "pan": "Panjabi, Punjabi", - "pol": "Polish", - "por": "Portuguese", - "pus": "Pushto, Pashto", - "que": "Quechua", - "ron": "Romanian, Moldavian, Moldovan", - "rus": "Russian", - "san": "Sanskrit", - "sin": "Sinhala, Sinhalese", - "slk": "Slovak", - "slk_frak": "Slovak (Fraktur)", - "slv": "Slovenian", - "snd": "Sindhi", - "spa": "Spanish", - "spa_old": "Spanish (Old)", - "sqi": "Albanian", - "srp": "Serbian", - "srp_latn": "Serbian (Latin)", - "sun": "Sundanese", - "swa": "Swahili", - "swe": "Swedish", - "syr": "Syriac", - "tam": "Tamil", - "tat": "Tatar", - "tel": "Telugu", - "tgk": "Tajik", - "tgl": "Tagalog", - "tha": "Thai", - "tir": "Tigrinya", - "ton": "Tonga (Tonga Islands)", - "tur": "Turkish", - "uig": "Uighur, Uyghur", - "ukr": "Ukrainian", - "urd": "Urdu", - "uzb": "Uzbek", - "uzb_cyrl": "Uzbek (Cyrillic)", - "vie": "Vietnamese", - "yid": "Yiddish", - "yor": "Yoruba" - } -} as const; \ No newline at end of file