-
Notifications
You must be signed in to change notification settings - Fork 1
/
language_tag.ts
380 lines (344 loc) · 13.2 KB
/
language_tag.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
/**
 * Represents tags for identifying languages.  See also RFC 5646.
 *
 * It guarantees all instances are interned and only single instance is made
 * for each language tag, which means, `===` operator can be used for testing
 * structural equality of two {@link LanguageTag} operands.
 *
 * Only the `language[-Script][-REGION]` subset of RFC 5646 is supported:
 * a 2-or-3-letter language, an optional 4-letter script, and an optional
 * 2-or-3-letter (alphabetic) region.
 * @copyright 2021–2023 Hong Minhee
 * @license LGPL-3.0-only
 */
export class LanguageTag {
  /** Required set of permissions for using this class within Deno. */
  static readonly requiredPermissions: Deno.PermissionOptionsObject = {
    net: ["cdn.skypack.dev"],
  };

  /** An ISO 639 code which consists of two or three letters. */
  readonly language: string;

  /** An ISO 15924 code which consists of four letters. */
  readonly script: string | null;

  /** An ISO 3166-1 code which consists of two or three letters. */
  readonly region: string | null;

  /**
   * Validates the subtags and stores them lowercased.  It is private so that
   * every instance is created through {@link LanguageTag.get}, which interns
   * them.
   * @throws {LanguageTagError} Thrown when any parameter is invalid.
   */
  private constructor(
    language: string,
    script?: string | null,
    region?: string | null,
  ) {
    if (!/^[A-Za-z]{2,3}$/.test(language)) {
      throw new LanguageTagError(
        "A language must be 2-or-3-letter ISO 639 code.",
      );
    } else if (script != null && !/^[A-Za-z]{4}$/.test(script)) {
      throw new LanguageTagError("A script must be 4-letter ISO 15924 code.");
    } else if (region != null && !/^[A-Za-z]{2,3}$/.test(region)) {
      throw new LanguageTagError(
        "A region must be 2-or-3-letter ISO 3166-1 code.",
      );
    }

    // All fields are normalized to lowercase; toString() restores the
    // conventional casing (e.g., "zh-Hant-HK").
    this.language = language.toLowerCase();
    this.script = script?.toLowerCase() ?? null;
    this.region = region?.toLowerCase() ?? null;
  }

  /** The intern table, keyed by the canonical string form of each tag. */
  private static readonly interns: Record<string, LanguageTag> = {};

  /**
   * Gets a {@link LanguageTag} instance.
   * @param language An ISO 639 code which consists of two or three letters.
   * @param script An ISO 15924 code which consists of four letters.
   * @param region An ISO 3166-1 code which consists of two or three letters.
   * @returns A corresponding {@link LanguageTag} instance.
   * @throws {LanguageTagError} Thrown when any parameter is invalid.
   */
  static get(
    language: string,
    script?: string | null,
    region?: string | null,
  ): LanguageTag {
    // A candidate is always constructed first so that validation and
    // normalization happen in exactly one place (the constructor).
    const langTag = new this(language, script, region);
    const key = langTag.toString();
    const interned = this.interns[key];
    return interned || (this.interns[key] = langTag);
  }

  /** The supported tag grammar: `language[-Script][-REGION]`. */
  private static readonly PATTERN: RegExp =
    /^([A-Za-z]{2,3})(-([A-Za-z]{4}))?(-([A-Za-z]{2,3}))?$/;

  /**
   * Parses an RFC 5646 language tag string.
   * @param tagString An RFC 5646 language tag.
   * @returns A corresponding {@link LanguageTag} instance.
   * @throws {LanguageTagError} Thrown when the given argument is not a valid
   *                            RFC 5646 language tag.
   */
  static fromString(tagString: string): LanguageTag {
    const fullTag = this.PATTERN.exec(tagString);
    if (fullTag == null) {
      throw new LanguageTagError(
        `Not a correct language tag string: ${JSON.stringify(tagString)}.`,
      );
    }
    // Groups 1, 3, and 5 are the language, script, and region respectively;
    // groups 2 and 4 include the leading hyphen.
    return this.get(fullTag[1], fullTag[3], fullTag[5]);
  }

  /**
   * Checks if the language tag instance shares some parts in common with
   * the operand.  The operand is treated as a pattern, which means if some
   * fields are empty these are treated as wildcard.  For example:
   *
   * - `ko-KR` matches to `ko-KR`
   * - `ko-KR` matches to `ko`
   * - `ko-KR` does not match to `ko-Kore`
   * - `ko-KR` does not match to `ko-KP`
   * - `zh-Hant-HK` matches to `zh-Hant-HK`
   * - `zh-Hant-HK` matches to `zh-HK`
   * - `zh-Hant-HK` matches to `zh-Hant`
   * - `zh-Hant-HK` matches to `zh`
   * - `zh-Hant-HK` does not match to `zh-Hant-TW`
   * - `zh-Hant-HK` does not match to `zh-Hans-HK`
   * - `zh-Hant-HK` does not match to `en`
   * @param language The language pattern.
   * @returns Whether two language tags are compatible.
   * @throws {LanguageTagError} Thrown when the given argument is a string which
   *                            is not a valid RFC 5646 language tag.
   */
  matches(language: LanguageTag | string): boolean {
    const pattern = typeof language == "string"
      ? LanguageTag.fromString(language)
      : language;
    return this === pattern ||
      this.language === pattern.language &&
        (pattern.script == null || this.script === pattern.script) &&
        (pattern.region == null || this.region === pattern.region);
  }

  /**
   * Lists all available reduced (i.e., less specific) language tags for the
   * given language tag.  For example, `en-Latn-US` has `en-Latn`, `en-US`, and
   * `en` as reduced tags.
   * @param containSelf Whether the reduced tags should contain the given
   *                    language tag itself.
   * @returns A list of reduced tags.  The most specific tag is listed first,
   *          and the most general tag is listed last.
   */
  *reduce(containSelf = false): Iterable<LanguageTag> {
    const scripts = this.script == null ? [null] : [this.script, null];
    const regions = this.region == null ? [null] : [this.region, null];
    for (const script of scripts) {
      for (const region of regions) {
        // The only combination equal to this tag is (this.script, this.region).
        const isSelf = this.script === script && this.region === region;
        if (containSelf || !isSelf) {
          yield LanguageTag.get(this.language, script, region);
        }
      }
    }
  }

  static readonly #CLDR_LIKELY_SUBTAGS_URL =
    "https://cdn.skypack.dev/cldr-core/supplemental/likelySubtags.json";

  /**
   * The likely-subtags table, keyed by lowercased tag.  It is loaded lazily on
   * the first call to {@link LanguageTag.toLikelySubtag}; an empty table is
   * stored when loading fails.
   */
  static #cldrLikelySubtags?: Record<string, string>;

  /**
   * Gets the most likely subtag according to Unicode CLDR (Likely Subtags in
   * Supplemental Data), if available.  For example, `ko` is likely to be
   * `ko-Kore-KR`, and `zh-TW` is likely to be `zh-Hant-TW`.
   * @returns The most likely subtag if available, otherwise it returns the
   *          original language tag.
   */
  async toLikelySubtag(): Promise<LanguageTag> {
    if (LanguageTag.#cldrLikelySubtags == null) {
      let data: Record<string, string>;
      try {
        const response = await fetch(LanguageTag.#CLDR_LIKELY_SUBTAGS_URL);
        const json = await response.json();
        data = json.supplemental.likelySubtags;
      } catch {
        // Best effort: without the table every tag maps to itself.
        data = {};
      }
      LanguageTag.#cldrLikelySubtags = Object.fromEntries(
        Object.entries(data).map(([k, v]) => [k.toLowerCase(), v]),
      );
    }
    const tag = LanguageTag.#cldrLikelySubtags[this.toString().toLowerCase()];
    // The table may hold values outside the grammar this class supports
    // (e.g., a numeric region); those are treated as "not available" rather
    // than letting fromString() throw.
    if (tag == null || !LanguageTag.PATTERN.test(tag)) return this;
    return LanguageTag.fromString(tag);
  }

  static readonly #CLDR_LOCALENAMES_URL_BASE =
    "https://cdn.skypack.dev/cldr-localenames-full/main/";

  /**
   * Enumerates candidate CLDR locale-names URLs for the given file, from
   * this tag itself to the reduced forms of its likely subtag.
   *
   * The first candidate is yielded before the likely subtag is resolved, so
   * a consumer that stops after the first hit never triggers that lookup.
   * @param file The CLDR file name, e.g., `languages.json`.
   * @returns Pairs of a candidate language tag and the URL to try for it.
   */
  private async *tryCldrUrls(
    file: string,
  ): AsyncIterable<[LanguageTag, string]> {
    const base = LanguageTag.#CLDR_LOCALENAMES_URL_BASE.replace(/\/$/, "");
    yield [this, `${base}/${this.toString()}/${file}`];
    const likelySubtag = await this.toLikelySubtag();
    // The likely subtag itself is a candidate only when it differs from this
    // tag, which has been yielded already.
    const similarTags = likelySubtag.reduce(likelySubtag !== this);
    for (const l of similarTags) {
      if (l !== this) yield [l, `${base}/${l.toString()}/${file}`];
    }
  }

  /**
   * The cache of fetched CLDR data: language tag → file name → contents of
   * `localeDisplayNames`.  A `null` entry marks a tag for which a fetch has
   * failed, so that it is skipped afterwards.
   */
  static #cldrData: Map<
    LanguageTag,
    Record<string, Record<string, unknown> | undefined> | null
  > = new Map();

  /**
   * Loads the `localeDisplayNames` section of the given CLDR file for this
   * language, falling back to the candidates of
   * {@link LanguageTag.tryCldrUrls} in order.  Results are cached per tag
   * and per file.
   * @param file The CLDR file name, e.g., `languages.json`.
   * @returns The `localeDisplayNames` object of the first available sheet.
   * @throws {LanguageTagError} Thrown when no candidate yields any data.
   */
  private async fetchCldr(file: string): Promise<Record<string, unknown>> {
    const triedUrls: string[] = [];
    let data: Record<string, unknown> | undefined;
    for await (const [l, url] of this.tryCldrUrls(file)) {
      triedUrls.push(url);
      const cldrData = LanguageTag.#cldrData.get(l);
      // A previous fetch for this tag has failed; do not try it again.
      if (cldrData === null) continue;

      const cached = cldrData?.[file];
      if (cached != null) {
        data = cached;
        break;
      }

      let json;
      try {
        const response = await fetch(url);
        json = await response.json();
      } catch {
        LanguageTag.#cldrData.set(l, null);
        continue;
      }

      // The payload is keyed by a locale name, which usually equals the tag;
      // otherwise the first (normally the only) entry is taken.  A payload of
      // an unexpected shape is treated as a miss instead of a crash.
      const main = json?.main?.[l.toString()] ??
        Object.values(json?.main ?? {})[0];
      const names = main?.localeDisplayNames;
      if (names == null) continue;

      data = names;
      if (cldrData == null) {
        // The key must be computed; `{ file: ... }` would store the data
        // under the literal key "file" and never be found again.
        LanguageTag.#cldrData.set(l, { [file]: names });
      } else {
        cldrData[file] = names;
      }
      break;
    }
    if (data == null) {
      throw new LanguageTagError(
        "There is no Unicode CLDR sheet for the language tag: " +
          `${this.toString()}. Tried URLs are:\n ${triedUrls.join("\n ")}`,
      );
    }
    return data;
  }

  /**
   * Looks up the display name for the given language tag in this language
   * from the Unicode CLDR data.
   * @param language The language tag to look up.
   * @returns The display name for the given language tag.  If the given
   *          language tag is not found, `null` is returned.
   * @throws {LanguageTagError} Thrown when the given argument is a string which
   *                            is not a valid RFC 5646 language tag, or when
   *                            there is no CLDR data for this language.
   */
  async getLanguageName(
    language: string | LanguageTag,
  ): Promise<string | null> {
    const target = typeof language == "string"
      ? LanguageTag.fromString(language)
      : language;
    const data = await this.fetchCldr("languages.json");
    const languages = data.languages as Record<string, string> | undefined;
    if (languages == null) return null;

    type Pattern =
      | "localePattern"
      | "localeSeparator"
      | "localeKeyTypePattern";
    // Loaded on demand, since an exact match needs no pattern at all.
    const localeDisplayPattern = async (): Promise<Record<Pattern, string>> => {
      const data = await this.fetchCldr("localeDisplayNames.json");
      return data.localeDisplayPattern as Record<Pattern, string>;
    };
    // Fills the `{0}` and `{1}` placeholders of a CLDR pattern.
    const replace = (pattern: string, a?: string, b?: string): string =>
      pattern.replace(/\{[01]\}/g, (p) => p === "{0}" ? `${a}` : `${b}`);

    let languageName: string | null = null;
    for (const reduced of target.reduce(true)) {
      const name = languages[reduced.toString()];
      if (name == null) continue;
      languageName = name;
      // Instances are interned, hence identity is equality.
      if (reduced === target) break;

      // Only a less specific name is available; the subtags it lacks are
      // collected as variants to be appended in parentheses (or whatever
      // the locale pattern dictates).
      let variants: [] | [string] | [string, string] = [];
      if (target.region != null && reduced.region != target.region) {
        const territory = await this.getTerritoryName(target.region);
        if (territory != null) variants = [territory];
      }
      if (target.script != null && reduced.script != target.script) {
        const script = await this.getScriptName(target.script);
        if (script != null) variants = [script, ...variants];
      }

      const pattern = (await localeDisplayPattern()).localePattern;
      const generalName = languages[reduced.language];
      // When the matched name is merely the general name decorated with its
      // region or script (e.g., "X (Y)"), take it apart so that all variants
      // end up in a single pair of decorations.
      if (reduced.region != null) {
        const regionName = await this.getTerritoryName(reduced.region);
        if (
          regionName != null &&
          languageName === replace(pattern, generalName, regionName)
        ) {
          languageName = generalName;
          variants = variants[0] != null
            ? [variants[0], regionName]
            : [regionName];
        }
      } else if (reduced.script != null) {
        const scriptName = await this.getScriptName(reduced.script);
        if (
          scriptName != null &&
          languageName === replace(pattern, generalName, scriptName)
        ) {
          languageName = generalName;
          variants = variants[0] != null
            ? [scriptName, variants[0]]
            : [scriptName];
        }
      }

      if (variants.length === 2) {
        const separator = (await localeDisplayPattern()).localeSeparator;
        variants = [replace(separator, ...variants)];
      }
      if (variants.length === 1) {
        languageName = replace(pattern, languageName, variants[0]);
      }
      break;
    }
    return languageName;
  }

  /**
   * Looks up the display name of the given `territory` in this language
   * from the Unicode CLDR data.
   * @param territory The territory to look up.  The territory must be a valid
   *                  ISO 3166-1 alpha-2 code.
   * @returns The display name of the territory.  If the territory is not found,
   *          `null` is returned.
   * @throws {LanguageTagError} Thrown when there is no CLDR data for this
   *                            language.
   */
  async getTerritoryName(territory: string): Promise<string | null> {
    // Territories are looked up by their uppercased code.
    const code = territory.toUpperCase();
    const data = await this.fetchCldr("territories.json");
    const territories = data.territories as Record<string, string> | undefined;
    return territories?.[code] ?? null;
  }

  /**
   * Looks up the display name of the given `script` in this language
   * from the Unicode CLDR data.
   * @param script The script to look up.  The script must be a valid
   *               ISO 15924 code.
   * @returns The display name of the script, or `null` if it is unavailable
   *          from the Unicode CLDR data.
   * @throws {LanguageTagError} Thrown when there is no CLDR data for this
   *                            language.
   */
  async getScriptName(script: string): Promise<string | null> {
    // Scripts are looked up by their title-cased code (e.g., "Latn").
    // charAt() keeps an empty string from raising a TypeError.
    const code = script.charAt(0).toUpperCase() + script.slice(1).toLowerCase();
    const data = await this.fetchCldr("scripts.json");
    const scripts = data?.scripts as Record<string, string> | undefined;
    return scripts?.[code] ?? null;
  }

  /**
   * Turns a language tag object into an RFC 5646 string.
   * @returns A string formatted as RFC 5646.
   */
  toString(): string {
    const subtags = [this.language];
    if (this.script != null) {
      // Fields are stored lowercased, so only the initial needs a change.
      subtags.push(this.script.charAt(0).toUpperCase() + this.script.slice(1));
    }
    if (this.region != null) subtags.push(this.region.toUpperCase());
    return subtags.join("-");
  }

  /** Customizes how instances are printed by Deno's inspection facility. */
  [Symbol.for("Deno.customInspect")](): string {
    return `LanguageTag(${this.toString()})`;
  }
}
/**
 * The error raised when a language tag, or one of its subtags, turns out to
 * be invalid.
 */
export class LanguageTagError extends Error {
  /** Replaces the inherited error name with the name of this class. */
  override name = "LanguageTagError";

  /**
   * @param message An optional human-readable description of the problem.
   */
  constructor(message?: string) {
    super(message);
  }
}
export default LanguageTag;