mirror of
https://github.com/Stirling-Tools/Stirling-PDF.git
synced 2025-08-02 18:45:21 +00:00
1020 lines
19 KiB
TypeScript
1020 lines
19 KiB
TypeScript
// Unified Language System - Tri-directional mapping between browser languages, OCR codes, and display names
|
|
// Replaces both languageMapping.ts and tempOcrLanguages.ts
|
|
|
|
interface LanguageDefinition {
|
|
ocrCode: string;
|
|
displayName: string;
|
|
browserCodes: string[];
|
|
}
|
|
|
|
// Comprehensive language definitions with all mappings
|
|
const languageDefinitions: LanguageDefinition[] = [
|
|
// English
|
|
{
|
|
ocrCode: 'eng',
|
|
displayName: 'English',
|
|
browserCodes: ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IE', 'en-NZ', 'en-ZA']
|
|
},
|
|
|
|
// Spanish
|
|
{
|
|
ocrCode: 'spa',
|
|
displayName: 'Spanish',
|
|
browserCodes: ['es', 'es-ES', 'es-MX', 'es-AR', 'es-CO', 'es-CL', 'es-PE', 'es-VE']
|
|
},
|
|
|
|
// French
|
|
{
|
|
ocrCode: 'fra',
|
|
displayName: 'French',
|
|
browserCodes: ['fr', 'fr-FR', 'fr-CA', 'fr-BE', 'fr-CH']
|
|
},
|
|
|
|
// German
|
|
{
|
|
ocrCode: 'deu',
|
|
displayName: 'German',
|
|
browserCodes: ['de', 'de-DE', 'de-AT', 'de-CH']
|
|
},
|
|
|
|
// Portuguese
|
|
{
|
|
ocrCode: 'por',
|
|
displayName: 'Portuguese',
|
|
browserCodes: ['pt', 'pt-PT', 'pt-BR']
|
|
},
|
|
|
|
// Italian
|
|
{
|
|
ocrCode: 'ita',
|
|
displayName: 'Italian',
|
|
browserCodes: ['it', 'it-IT', 'it-CH']
|
|
},
|
|
|
|
// Chinese Simplified
|
|
{
|
|
ocrCode: 'chi_sim',
|
|
displayName: 'Chinese (Simplified)',
|
|
browserCodes: ['zh', 'zh-CN', 'zh-Hans']
|
|
},
|
|
|
|
// Chinese Traditional
|
|
{
|
|
ocrCode: 'chi_tra',
|
|
displayName: 'Chinese (Traditional)',
|
|
browserCodes: ['zh-TW', 'zh-HK', 'zh-Hant']
|
|
},
|
|
|
|
// Tibetan
|
|
{
|
|
ocrCode: 'bod',
|
|
displayName: 'Tibetan',
|
|
browserCodes: ['bo', 'zh-BO']
|
|
},
|
|
|
|
// Japanese
|
|
{
|
|
ocrCode: 'jpn',
|
|
displayName: 'Japanese',
|
|
browserCodes: ['ja', 'ja-JP']
|
|
},
|
|
|
|
// Korean
|
|
{
|
|
ocrCode: 'kor',
|
|
displayName: 'Korean',
|
|
browserCodes: ['ko', 'ko-KR']
|
|
},
|
|
|
|
// Russian
|
|
{
|
|
ocrCode: 'rus',
|
|
displayName: 'Russian',
|
|
browserCodes: ['ru', 'ru-RU']
|
|
},
|
|
|
|
// Arabic
|
|
{
|
|
ocrCode: 'ara',
|
|
displayName: 'Arabic',
|
|
browserCodes: ['ar', 'ar-SA', 'ar-EG', 'ar-AE', 'ar-MA']
|
|
},
|
|
|
|
// Dutch
|
|
{
|
|
ocrCode: 'nld',
|
|
displayName: 'Dutch; Flemish',
|
|
browserCodes: ['nl', 'nl-NL', 'nl-BE']
|
|
},
|
|
|
|
// Polish
|
|
{
|
|
ocrCode: 'pol',
|
|
displayName: 'Polish',
|
|
browserCodes: ['pl', 'pl-PL']
|
|
},
|
|
|
|
// Czech
|
|
{
|
|
ocrCode: 'ces',
|
|
displayName: 'Czech',
|
|
browserCodes: ['cs', 'cs-CZ']
|
|
},
|
|
|
|
// Slovak
|
|
{
|
|
ocrCode: 'slk',
|
|
displayName: 'Slovak',
|
|
browserCodes: ['sk', 'sk-SK']
|
|
},
|
|
|
|
// Hungarian
|
|
{
|
|
ocrCode: 'hun',
|
|
displayName: 'Hungarian',
|
|
browserCodes: ['hu', 'hu-HU']
|
|
},
|
|
|
|
// Romanian
|
|
{
|
|
ocrCode: 'ron',
|
|
displayName: 'Romanian, Moldavian, Moldovan',
|
|
browserCodes: ['ro', 'ro-RO']
|
|
},
|
|
|
|
// Bulgarian
|
|
{
|
|
ocrCode: 'bul',
|
|
displayName: 'Bulgarian',
|
|
browserCodes: ['bg', 'bg-BG']
|
|
},
|
|
|
|
// Croatian
|
|
{
|
|
ocrCode: 'hrv',
|
|
displayName: 'Croatian',
|
|
browserCodes: ['hr', 'hr-HR']
|
|
},
|
|
|
|
// Serbian
|
|
{
|
|
ocrCode: 'srp',
|
|
displayName: 'Serbian',
|
|
browserCodes: ['sr', 'sr-RS']
|
|
},
|
|
|
|
// Serbian Latin
|
|
{
|
|
ocrCode: 'srp_latn',
|
|
displayName: 'Serbian (Latin)',
|
|
browserCodes: ['sr-Latn']
|
|
},
|
|
|
|
// Slovenian
|
|
{
|
|
ocrCode: 'slv',
|
|
displayName: 'Slovenian',
|
|
browserCodes: ['sl', 'sl-SI']
|
|
},
|
|
|
|
// Estonian
|
|
{
|
|
ocrCode: 'est',
|
|
displayName: 'Estonian',
|
|
browserCodes: ['et', 'et-EE']
|
|
},
|
|
|
|
// Latvian
|
|
{
|
|
ocrCode: 'lav',
|
|
displayName: 'Latvian',
|
|
browserCodes: ['lv', 'lv-LV']
|
|
},
|
|
|
|
// Lithuanian
|
|
{
|
|
ocrCode: 'lit',
|
|
displayName: 'Lithuanian',
|
|
browserCodes: ['lt', 'lt-LT']
|
|
},
|
|
|
|
// Finnish
|
|
{
|
|
ocrCode: 'fin',
|
|
displayName: 'Finnish',
|
|
browserCodes: ['fi', 'fi-FI']
|
|
},
|
|
|
|
// Swedish
|
|
{
|
|
ocrCode: 'swe',
|
|
displayName: 'Swedish',
|
|
browserCodes: ['sv', 'sv-SE']
|
|
},
|
|
|
|
// Norwegian
|
|
{
|
|
ocrCode: 'nor',
|
|
displayName: 'Norwegian',
|
|
browserCodes: ['no', 'nb', 'nn', 'no-NO', 'nb-NO', 'nn-NO']
|
|
},
|
|
|
|
// Danish
|
|
{
|
|
ocrCode: 'dan',
|
|
displayName: 'Danish',
|
|
browserCodes: ['da', 'da-DK']
|
|
},
|
|
|
|
// Icelandic
|
|
{
|
|
ocrCode: 'isl',
|
|
displayName: 'Icelandic',
|
|
browserCodes: ['is', 'is-IS']
|
|
},
|
|
|
|
// Greek
|
|
{
|
|
ocrCode: 'ell',
|
|
displayName: 'Greek',
|
|
browserCodes: ['el', 'el-GR']
|
|
},
|
|
|
|
// Turkish
|
|
{
|
|
ocrCode: 'tur',
|
|
displayName: 'Turkish',
|
|
browserCodes: ['tr', 'tr-TR']
|
|
},
|
|
|
|
// Hebrew
|
|
{
|
|
ocrCode: 'heb',
|
|
displayName: 'Hebrew',
|
|
browserCodes: ['he', 'he-IL']
|
|
},
|
|
|
|
// Hindi
|
|
{
|
|
ocrCode: 'hin',
|
|
displayName: 'Hindi',
|
|
browserCodes: ['hi', 'hi-IN']
|
|
},
|
|
|
|
// Thai
|
|
{
|
|
ocrCode: 'tha',
|
|
displayName: 'Thai',
|
|
browserCodes: ['th', 'th-TH']
|
|
},
|
|
|
|
// Vietnamese
|
|
{
|
|
ocrCode: 'vie',
|
|
displayName: 'Vietnamese',
|
|
browserCodes: ['vi', 'vi-VN']
|
|
},
|
|
|
|
// Indonesian
|
|
{
|
|
ocrCode: 'ind',
|
|
displayName: 'Indonesian',
|
|
browserCodes: ['id', 'id-ID']
|
|
},
|
|
|
|
// Malay
|
|
{
|
|
ocrCode: 'msa',
|
|
displayName: 'Malay',
|
|
browserCodes: ['ms', 'ms-MY']
|
|
},
|
|
|
|
// Filipino
|
|
{
|
|
ocrCode: 'fil',
|
|
displayName: 'Filipino',
|
|
browserCodes: ['fil']
|
|
},
|
|
|
|
// Tagalog
|
|
{
|
|
ocrCode: 'tgl',
|
|
displayName: 'Tagalog',
|
|
browserCodes: ['tl']
|
|
},
|
|
|
|
// Ukrainian
|
|
{
|
|
ocrCode: 'ukr',
|
|
displayName: 'Ukrainian',
|
|
browserCodes: ['uk', 'uk-UA']
|
|
},
|
|
|
|
// Belarusian
|
|
{
|
|
ocrCode: 'bel',
|
|
displayName: 'Belarusian',
|
|
browserCodes: ['be', 'be-BY']
|
|
},
|
|
|
|
// Kazakh
|
|
{
|
|
ocrCode: 'kaz',
|
|
displayName: 'Kazakh',
|
|
browserCodes: ['kk', 'kk-KZ']
|
|
},
|
|
|
|
// Uzbek
|
|
{
|
|
ocrCode: 'uzb',
|
|
displayName: 'Uzbek',
|
|
browserCodes: ['uz', 'uz-UZ']
|
|
},
|
|
|
|
// Georgian
|
|
{
|
|
ocrCode: 'kat',
|
|
displayName: 'Georgian',
|
|
browserCodes: ['ka', 'ka-GE']
|
|
},
|
|
|
|
// Armenian
|
|
{
|
|
ocrCode: 'hye',
|
|
displayName: 'Armenian',
|
|
browserCodes: ['hy', 'hy-AM']
|
|
},
|
|
|
|
// Azerbaijani
|
|
{
|
|
ocrCode: 'aze',
|
|
displayName: 'Azerbaijani',
|
|
browserCodes: ['az', 'az-AZ']
|
|
},
|
|
|
|
// Persian/Farsi
|
|
{
|
|
ocrCode: 'fas',
|
|
displayName: 'Persian',
|
|
browserCodes: ['fa', 'fa-IR']
|
|
},
|
|
|
|
// Urdu
|
|
{
|
|
ocrCode: 'urd',
|
|
displayName: 'Urdu',
|
|
browserCodes: ['ur', 'ur-PK']
|
|
},
|
|
|
|
// Bengali
|
|
{
|
|
ocrCode: 'ben',
|
|
displayName: 'Bengali',
|
|
browserCodes: ['bn', 'bn-BD', 'bn-IN']
|
|
},
|
|
|
|
// Tamil
|
|
{
|
|
ocrCode: 'tam',
|
|
displayName: 'Tamil',
|
|
browserCodes: ['ta', 'ta-IN', 'ta-LK']
|
|
},
|
|
|
|
// Telugu
|
|
{
|
|
ocrCode: 'tel',
|
|
displayName: 'Telugu',
|
|
browserCodes: ['te', 'te-IN']
|
|
},
|
|
|
|
// Kannada
|
|
{
|
|
ocrCode: 'kan',
|
|
displayName: 'Kannada',
|
|
browserCodes: ['kn', 'kn-IN']
|
|
},
|
|
|
|
// Malayalam
|
|
{
|
|
ocrCode: 'mal',
|
|
displayName: 'Malayalam',
|
|
browserCodes: ['ml', 'ml-IN']
|
|
},
|
|
|
|
// Gujarati
|
|
{
|
|
ocrCode: 'guj',
|
|
displayName: 'Gujarati',
|
|
browserCodes: ['gu', 'gu-IN']
|
|
},
|
|
|
|
// Marathi
|
|
{
|
|
ocrCode: 'mar',
|
|
displayName: 'Marathi',
|
|
browserCodes: ['mr', 'mr-IN']
|
|
},
|
|
|
|
// Punjabi
|
|
{
|
|
ocrCode: 'pan',
|
|
displayName: 'Panjabi, Punjabi',
|
|
browserCodes: ['pa', 'pa-IN']
|
|
},
|
|
|
|
// Nepali
|
|
{
|
|
ocrCode: 'nep',
|
|
displayName: 'Nepali',
|
|
browserCodes: ['ne', 'ne-NP']
|
|
},
|
|
|
|
// Sinhala
|
|
{
|
|
ocrCode: 'sin',
|
|
displayName: 'Sinhala, Sinhalese',
|
|
browserCodes: ['si', 'si-LK']
|
|
},
|
|
|
|
// Burmese
|
|
{
|
|
ocrCode: 'mya',
|
|
displayName: 'Burmese',
|
|
browserCodes: ['my', 'my-MM']
|
|
},
|
|
|
|
// Khmer
|
|
{
|
|
ocrCode: 'khm',
|
|
displayName: 'Central Khmer',
|
|
browserCodes: ['km', 'km-KH']
|
|
},
|
|
|
|
// Lao
|
|
{
|
|
ocrCode: 'lao',
|
|
displayName: 'Lao',
|
|
browserCodes: ['lo', 'lo-LA']
|
|
},
|
|
|
|
// Mongolian
|
|
{
|
|
ocrCode: 'mon',
|
|
displayName: 'Mongolian',
|
|
browserCodes: ['mn', 'mn-MN']
|
|
},
|
|
|
|
// Welsh
|
|
{
|
|
ocrCode: 'cym',
|
|
displayName: 'Welsh',
|
|
browserCodes: ['cy', 'cy-GB']
|
|
},
|
|
|
|
// Irish
|
|
{
|
|
ocrCode: 'gle',
|
|
displayName: 'Irish',
|
|
browserCodes: ['ga', 'ga-IE']
|
|
},
|
|
|
|
// Scottish Gaelic
|
|
{
|
|
ocrCode: 'gla',
|
|
displayName: 'Scottish Gaelic',
|
|
browserCodes: ['gd', 'gd-GB']
|
|
},
|
|
|
|
// Basque
|
|
{
|
|
ocrCode: 'eus',
|
|
displayName: 'Basque',
|
|
browserCodes: ['eu', 'eu-ES']
|
|
},
|
|
|
|
// Catalan
|
|
{
|
|
ocrCode: 'cat',
|
|
displayName: 'Catalan',
|
|
browserCodes: ['ca', 'ca-ES']
|
|
},
|
|
|
|
// Galician
|
|
{
|
|
ocrCode: 'glg',
|
|
displayName: 'Galician',
|
|
browserCodes: ['gl', 'gl-ES']
|
|
},
|
|
|
|
// Macedonian
|
|
{
|
|
ocrCode: 'mkd',
|
|
displayName: 'Macedonian',
|
|
browserCodes: ['mk', 'mk-MK']
|
|
},
|
|
|
|
// Albanian
|
|
{
|
|
ocrCode: 'sqi',
|
|
displayName: 'Albanian',
|
|
browserCodes: ['sq', 'sq-AL']
|
|
},
|
|
|
|
// Maltese
|
|
{
|
|
ocrCode: 'mlt',
|
|
displayName: 'Maltese',
|
|
browserCodes: ['mt', 'mt-MT']
|
|
},
|
|
|
|
// Afrikaans
|
|
{
|
|
ocrCode: 'afr',
|
|
displayName: 'Afrikaans',
|
|
browserCodes: ['af', 'af-ZA']
|
|
},
|
|
|
|
// Swahili
|
|
{
|
|
ocrCode: 'swa',
|
|
displayName: 'Swahili',
|
|
browserCodes: ['sw', 'sw-KE', 'sw-TZ']
|
|
},
|
|
|
|
// Amharic
|
|
{
|
|
ocrCode: 'amh',
|
|
displayName: 'Amharic',
|
|
browserCodes: ['am']
|
|
},
|
|
|
|
// Assamese
|
|
{
|
|
ocrCode: 'asm',
|
|
displayName: 'Assamese',
|
|
browserCodes: ['as']
|
|
},
|
|
|
|
// Azerbaijani (Cyrillic)
|
|
{
|
|
ocrCode: 'aze_cyrl',
|
|
displayName: 'Azerbaijani (Cyrillic)',
|
|
browserCodes: []
|
|
},
|
|
|
|
// Bosnian
|
|
{
|
|
ocrCode: 'bos',
|
|
displayName: 'Bosnian',
|
|
browserCodes: ['bs']
|
|
},
|
|
|
|
// Breton
|
|
{
|
|
ocrCode: 'bre',
|
|
displayName: 'Breton',
|
|
browserCodes: ['br']
|
|
},
|
|
|
|
// Bambara
|
|
{
|
|
ocrCode: 'bam',
|
|
displayName: 'Bambara',
|
|
browserCodes: ['bm']
|
|
},
|
|
|
|
// Bashkir
|
|
{
|
|
ocrCode: 'bak',
|
|
displayName: 'Bashkir',
|
|
browserCodes: ['ba']
|
|
},
|
|
|
|
// Cornish
|
|
{
|
|
ocrCode: 'cor',
|
|
displayName: 'Cornish',
|
|
browserCodes: ['kw']
|
|
},
|
|
|
|
// Corsican
|
|
{
|
|
ocrCode: 'cos',
|
|
displayName: 'Corsican',
|
|
browserCodes: ['co']
|
|
},
|
|
|
|
// Ewe
|
|
{
|
|
ocrCode: 'ewe',
|
|
displayName: 'Ewe',
|
|
browserCodes: ['ee']
|
|
},
|
|
|
|
// Faroese
|
|
{
|
|
ocrCode: 'fao',
|
|
displayName: 'Faroese',
|
|
browserCodes: ['fo']
|
|
},
|
|
|
|
// Fijian
|
|
{
|
|
ocrCode: 'fij',
|
|
displayName: 'Fijian',
|
|
browserCodes: ['fj']
|
|
},
|
|
|
|
// Haitian Creole
|
|
{
|
|
ocrCode: 'hat',
|
|
displayName: 'Haitian, Haitian Creole',
|
|
browserCodes: ['ht']
|
|
},
|
|
|
|
// Javanese
|
|
{
|
|
ocrCode: 'jav',
|
|
displayName: 'Javanese',
|
|
browserCodes: ['jv']
|
|
},
|
|
|
|
// Kirghiz
|
|
{
|
|
ocrCode: 'kir',
|
|
displayName: 'Kirghiz, Kyrgyz',
|
|
browserCodes: ['ky']
|
|
},
|
|
|
|
// Quechua
|
|
{
|
|
ocrCode: 'que',
|
|
displayName: 'Quechua',
|
|
browserCodes: ['qu']
|
|
},
|
|
|
|
// Sindhi
|
|
{
|
|
ocrCode: 'snd',
|
|
displayName: 'Sindhi',
|
|
browserCodes: ['sd']
|
|
},
|
|
|
|
// Yiddish
|
|
{
|
|
ocrCode: 'yid',
|
|
displayName: 'Yiddish',
|
|
browserCodes: ['yi']
|
|
},
|
|
|
|
// Yoruba
|
|
{
|
|
ocrCode: 'yor',
|
|
displayName: 'Yoruba',
|
|
browserCodes: ['yo']
|
|
},
|
|
|
|
// Additional OCR languages without browser mappings or with very specific/rare codes
|
|
{
|
|
ocrCode: 'ceb',
|
|
displayName: 'Cebuano',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'chi_sim_vert',
|
|
displayName: 'Chinese (Simplified, Vertical)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'chi_tra_vert',
|
|
displayName: 'Chinese (Traditional, Vertical)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'chr',
|
|
displayName: 'Cherokee',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'dan_frak',
|
|
displayName: 'Danish (Fraktur)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'deu_frak',
|
|
displayName: 'German (Fraktur)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'div',
|
|
displayName: 'Divehi',
|
|
browserCodes: ['dv']
|
|
},
|
|
{
|
|
ocrCode: 'dzo',
|
|
displayName: 'Dzongkha',
|
|
browserCodes: ['dz']
|
|
},
|
|
{
|
|
ocrCode: 'enm',
|
|
displayName: 'English, Middle (1100-1500)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'epo',
|
|
displayName: 'Esperanto',
|
|
browserCodes: ['eo']
|
|
},
|
|
{
|
|
ocrCode: 'equ',
|
|
displayName: 'Math / equation detection module',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'frk',
|
|
displayName: 'Frankish',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'frm',
|
|
displayName: 'French, Middle (ca.1400-1600)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'fry',
|
|
displayName: 'Western Frisian',
|
|
browserCodes: ['fy']
|
|
},
|
|
{
|
|
ocrCode: 'grc',
|
|
displayName: 'Ancient Greek',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'iku',
|
|
displayName: 'Inuktitut',
|
|
browserCodes: ['iu']
|
|
},
|
|
{
|
|
ocrCode: 'ita_old',
|
|
displayName: 'Italian (Old)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'jpn_vert',
|
|
displayName: 'Japanese (Vertical)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'kat_old',
|
|
displayName: 'Georgian (Old)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'kmr',
|
|
displayName: 'Northern Kurdish',
|
|
browserCodes: ['ku']
|
|
},
|
|
{
|
|
ocrCode: 'kor_vert',
|
|
displayName: 'Korean (Vertical)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'lat',
|
|
displayName: 'Latin',
|
|
browserCodes: ['la']
|
|
},
|
|
{
|
|
ocrCode: 'ltz',
|
|
displayName: 'Luxembourgish',
|
|
browserCodes: ['lb']
|
|
},
|
|
{
|
|
ocrCode: 'mri',
|
|
displayName: 'Maori',
|
|
browserCodes: ['mi']
|
|
},
|
|
{
|
|
ocrCode: 'oci',
|
|
displayName: 'Occitan (post 1500)',
|
|
browserCodes: ['oc']
|
|
},
|
|
{
|
|
ocrCode: 'ori',
|
|
displayName: 'Oriya',
|
|
browserCodes: ['or']
|
|
},
|
|
{
|
|
ocrCode: 'osd',
|
|
displayName: 'Orientation and script detection module',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'pus',
|
|
displayName: 'Pushto, Pashto',
|
|
browserCodes: ['ps']
|
|
},
|
|
{
|
|
ocrCode: 'san',
|
|
displayName: 'Sanskrit',
|
|
browserCodes: ['sa']
|
|
},
|
|
{
|
|
ocrCode: 'slk_frak',
|
|
displayName: 'Slovak (Fraktur)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'spa_old',
|
|
displayName: 'Spanish (Old)',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'sun',
|
|
displayName: 'Sundanese',
|
|
browserCodes: ['su']
|
|
},
|
|
{
|
|
ocrCode: 'syr',
|
|
displayName: 'Syriac',
|
|
browserCodes: []
|
|
},
|
|
{
|
|
ocrCode: 'tat',
|
|
displayName: 'Tatar',
|
|
browserCodes: ['tt']
|
|
},
|
|
{
|
|
ocrCode: 'tgk',
|
|
displayName: 'Tajik',
|
|
browserCodes: ['tg']
|
|
},
|
|
{
|
|
ocrCode: 'tir',
|
|
displayName: 'Tigrinya',
|
|
browserCodes: ['ti']
|
|
},
|
|
{
|
|
ocrCode: 'ton',
|
|
displayName: 'Tonga (Tonga Islands)',
|
|
browserCodes: ['to']
|
|
},
|
|
{
|
|
ocrCode: 'uig',
|
|
displayName: 'Uighur, Uyghur',
|
|
browserCodes: ['ug']
|
|
},
|
|
{
|
|
ocrCode: 'uzb_cyrl',
|
|
displayName: 'Uzbek (Cyrillic)',
|
|
browserCodes: []
|
|
}
|
|
];
|
|
|
|
// Build lookup maps for efficient access
|
|
const browserToOcrMap = new Map<string, string>();
|
|
const ocrToDisplayMap = new Map<string, string>();
|
|
const displayToOcrMap = new Map<string, string>();
|
|
const ocrToBrowserMap = new Map<string, string[]>();
|
|
|
|
// Populate lookup maps
|
|
languageDefinitions.forEach(lang => {
|
|
// OCR code to display name
|
|
ocrToDisplayMap.set(lang.ocrCode, lang.displayName);
|
|
|
|
// Display name to OCR code
|
|
displayToOcrMap.set(lang.displayName.toLowerCase(), lang.ocrCode);
|
|
|
|
// OCR code to browser codes
|
|
ocrToBrowserMap.set(lang.ocrCode, lang.browserCodes);
|
|
|
|
// Browser codes to OCR code
|
|
lang.browserCodes.forEach(browserCode => {
|
|
browserToOcrMap.set(browserCode.toLowerCase(), lang.ocrCode);
|
|
});
|
|
});
|
|
|
|
/**
|
|
* Maps a browser language code to an OCR language code
|
|
* Handles exact matches and similar language fallbacks
|
|
*
|
|
* @param browserLanguage - The browser language code (e.g., 'en-GB', 'fr-FR')
|
|
* @returns OCR language code if found, null if no match
|
|
*
|
|
* @example
|
|
* mapBrowserLanguageToOcr('de-DE') // Returns 'deu'
|
|
* mapBrowserLanguageToOcr('en-GB') // Returns 'eng'
|
|
* mapBrowserLanguageToOcr('zh-CN') // Returns 'chi_sim'
|
|
*/
|
|
export function mapBrowserLanguageToOcr(browserLanguage: string): string | null {
|
|
if (!browserLanguage) return null;
|
|
|
|
// Normalize the input
|
|
const normalizedInput = browserLanguage.toLowerCase().replace('_', '-');
|
|
|
|
// Try exact match first
|
|
const exactMatch = browserToOcrMap.get(normalizedInput);
|
|
if (exactMatch) return exactMatch;
|
|
|
|
// Try with different casing variations
|
|
const variations = [
|
|
browserLanguage.toLowerCase(),
|
|
browserLanguage.toUpperCase().toLowerCase(),
|
|
normalizedInput,
|
|
];
|
|
|
|
for (const variant of variations) {
|
|
const match = browserToOcrMap.get(variant);
|
|
if (match) return match;
|
|
}
|
|
|
|
// Try base language code (e.g., 'en' from 'en-GB')
|
|
const baseLanguage = normalizedInput.split('-')[0];
|
|
const baseMatch = browserToOcrMap.get(baseLanguage);
|
|
if (baseMatch) return baseMatch;
|
|
|
|
// No match found
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
* Gets the display name for an OCR language code
|
|
*
|
|
* @param ocrCode - The OCR language code (e.g., 'eng', 'deu')
|
|
* @returns Display name if found, the original code if not found
|
|
*
|
|
* @example
|
|
* getOcrDisplayName('deu') // Returns 'German'
|
|
* getOcrDisplayName('eng') // Returns 'English'
|
|
* getOcrDisplayName('chi_sim') // Returns 'Chinese (Simplified)'
|
|
*/
|
|
export function getOcrDisplayName(ocrCode: string): string {
|
|
return ocrToDisplayMap.get(ocrCode) || ocrCode;
|
|
}
|
|
|
|
/**
|
|
* Gets the OCR code from a display name
|
|
*
|
|
* @param displayName - The display name (e.g., 'English', 'German')
|
|
* @returns OCR code if found, null if no match
|
|
*
|
|
* @example
|
|
* getOcrCodeFromDisplayName('German') // Returns 'deu'
|
|
* getOcrCodeFromDisplayName('English') // Returns 'eng'
|
|
* getOcrCodeFromDisplayName('chinese (simplified)') // Returns 'chi_sim' (case insensitive)
|
|
*/
|
|
export function getOcrCodeFromDisplayName(displayName: string): string | null {
|
|
return displayToOcrMap.get(displayName.toLowerCase()) || null;
|
|
}
|
|
|
|
/**
|
|
* Gets the browser language codes for an OCR language code
|
|
*
|
|
* @param ocrCode - The OCR language code (e.g., 'eng', 'deu')
|
|
* @returns Array of browser language codes
|
|
*
|
|
* @example
|
|
* getBrowserLanguagesForOcr('deu') // Returns ['de', 'de-DE', 'de-AT', 'de-CH']
|
|
* getBrowserLanguagesForOcr('eng') // Returns ['en', 'en-US', 'en-GB', 'en-AU', ...]
|
|
* getBrowserLanguagesForOcr('nor') // Returns ['no', 'nb', 'nn', 'no-NO', 'nb-NO', 'nn-NO']
|
|
*/
|
|
export function getBrowserLanguagesForOcr(ocrCode: string): string[] {
|
|
return ocrToBrowserMap.get(ocrCode) || [];
|
|
}
|
|
|
|
/**
|
|
* Gets the OCR language code for the current browser language
|
|
*
|
|
* @param currentLanguage - Current i18n language
|
|
* @returns OCR language code array (empty if no match)
|
|
*
|
|
* @example
|
|
* getAutoOcrLanguage('de-DE') // Returns ['deu']
|
|
* getAutoOcrLanguage('en-GB') // Returns ['eng']
|
|
* getAutoOcrLanguage('unknown') // Returns []
|
|
*/
|
|
export function getAutoOcrLanguage(currentLanguage: string): string[] {
|
|
const ocrLanguage = mapBrowserLanguageToOcr(currentLanguage);
|
|
return ocrLanguage ? [ocrLanguage] : [];
|
|
}
|
|
|
|
/**
|
|
* Gets all available language definitions
|
|
*
|
|
* @returns Array of all language definitions
|
|
*
|
|
* @example
|
|
* const allLanguages = getAllLanguageDefinitions();
|
|
* // Returns: [{ ocrCode: 'eng', displayName: 'English', browserCodes: ['en', 'en-US', ...] }, ...]
|
|
*/
|
|
export function getAllLanguageDefinitions(): LanguageDefinition[] {
|
|
return [...languageDefinitions];
|
|
}
|
|
|
|
/**
|
|
* Legacy compatibility - provides the same interface as tempOcrLanguages.ts
|
|
*/
|
|
export const tempOcrLanguages = {
|
|
lang: Object.fromEntries(ocrToDisplayMap)
|
|
} as const;
|