-
Notifications
You must be signed in to change notification settings - Fork 7
/
language.py
103 lines (90 loc) · 4.17 KB
/
language.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""Module to convert between different kinds of language codes.
There are three kinds of language codes relevant for this tool:
- MediaWiki language codes
- HTML language codes (aka BCP47 language tags)
- Babel language codes
Additionally, MediaWiki language codes differ by usage between:
- lexicographical data language codes
- user interface language codes
Templates specify lexicographical data MediaWiki language codes;
translations are given for user interface language codes.
These language codes cannot be converted to one another losslessly,
and this module does not provide functions for all possible
conversions; rather, it only implements conversions to less specific
codes, which may lose some information. (The conversion from
MediaWiki language codes to HTML language codes, previously called
lang_int2html() in this module, is now lang_mw_to_bcp47() and lives in
toolforge_i18n.)"""
from toolforge_i18n import language_code_to_babel
def lang_lex2int(code: str) -> str:
    """Map a lexicographical data MediaWiki language code
    to the user interface language code to use for it.

    Codes without a dedicated override are returned unchanged."""
    interface_overrides = {
        # Manbhumi has no messages of its own; it shares the standard Bengali ones
        'bn-x-Q6747180': 'bn',
    }
    if code in interface_overrides:
        return interface_overrides[code]
    return code
def lang_int2babel(code: str) -> str:
    """Map a MediaWiki user interface language code to a Babel language code.

    The conversion from toolforge_i18n wins whenever it changes the code.
    Otherwise a local table of stand-in languages is consulted,
    and a code found in neither place is cut off before its first hyphen."""
    converted = language_code_to_babel(code)
    if converted != code:
        return converted

    stand_ins = {
        # Neither Latin nor Venetian is in CLDR; Italian is close enough for our purposes
        'la': 'it',
        'vec': 'it',
        # Babel represents pnb (Western Punjabi)
        # as pa_Arab (Punjabi in Arabic script);
        # lah (Lahnda), which replaces pnb, is not in Babel
        'pnb': 'pa_Arab',
        # Serbo-Croatian is missing from CLDR; Croatian is the closest match for our purposes
        'sh-latn': 'hr',
        # Babel has neither hno (Hindko) nor lah (Lahnda);
        # reuse pa_Arab as for pnb (Western Punjabi) above,
        # since pnb is said to be mutually intelligible (https://w.wiki/6Nu7)
        'hno': 'pa_Arab',
        # anp (Angika) is missing from Babel;
        # MediaWiki falls back to hi (Hindi)
        'anp': 'hi',
        # ban (Balinese) is missing from Babel;
        # MediaWiki falls back to id (Indonesian)
        'ban': 'id',
        # io (Ido) is missing from CLDR;
        # MediaWiki falls back to eo (Esperanto)
        'io': 'eo',
        # krc (Karachay-Balkar) is missing from Babel;
        # MediaWiki falls back to ru (Russian)
        'krc': 'ru',
        # mrh (Mara) is missing from CLDR;
        # MediaWiki has no fallback, so reusing plurals etc. from English is assumed to be okay
        'mrh': 'en',
        # roa-tara (tarandíne) is missing from CLDR;
        # MediaWiki falls back to it (Italian)
        'roa-tara': 'it',
        # xmf (Mingrelian) is missing from Babel;
        # MediaWiki falls back to ka (Georgian)
        'xmf': 'ka',
        # skr-arab (Saraiki) is missing from Babel;
        # MediaWiki falls back to ur (Urdu), and its .text_direction matches
        'skr-arab': 'ur',
        # ba (Bashkir) is missing from Babel;
        # tt (Tatar) is closely related and has the same plural forms, and its
        # list formatting seems to suit ba better than ru (Russian), MediaWiki’s ba fallback
        'ba': 'tt',
        # kai (Karai-Karai or Karekare) is missing from Babel;
        # MediaWiki support is still in progress (T345807),
        # so falling back to ha (Hausa) is only a guess,
        # based on Wikipedia listing both languages as West Chadic
        'kai': 'ha',
        # an (Aragonese) is missing from Babel;
        # MediaWiki falls back to es (Spanish), and its .text_direction matches
        'an': 'es',
        # kaa (Karakalpak) is missing from Babel;
        # MediaWiki has it in Latin script,
        # but its closest relatives in Babel are all in Cyrillic script;
        # uz (Uzbek) has the same plural forms, and its list formatting (“X and Y”)
        # is probably intelligible to Karakalpak speakers for geopolitical reasons
        # (kk_Latn would be even better, should it ever be added to Babel)
        'kaa': 'uz',
        # ht (Haitian Creole) is presumably not in Babel, like the other entries here;
        # fr (French) is the MediaWiki fallback
        'ht': 'fr',
    }
    # Default for codes without a stand-in: drop everything from the first hyphen on.
    primary_subtag, _hyphen, _remainder = code.partition('-')
    return stand_ins.get(code, primary_subtag)