Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check-language-consistency #4251

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;

Expand Down Expand Up @@ -560,7 +562,7 @@
}

if (!collectedBad.isEmpty()) {
warnln(

Check warning on line 565 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:565) Warning: Locales have 220 unexpected characters in main and/or aux: [҂״܀-܍०-९৲-৺੦-੯૰౦-౯໐-໙၀-၉၏႐-႙႞႟፠-፼᎐-᎙᠐-᠙⳥-⳪꒐-꓆𞅏{a\:}{ch’}{e\:}{i\:}{k’}{o\:}{ts’}{tł’}{t’}{à\:}{á\:}{è\:}{é\:}{ì\:}{í\:}{ò\:}{ó\:}{ଅ\:}{ଆ\:}{ଏ\:}]
"Locales have "
+ collectedBad.size()
+ " unexpected characters in main and/or aux:\t"
Expand Down Expand Up @@ -915,4 +917,112 @@
}
}
}

// Concatenates message fragments with an empty separator (so callers must supply their own
// spacing/punctuation); null fragments are rendered as the literal text "null".
final Joiner JOIN = Joiner.on("").useForNull("null");

/**
* The first primary script in scripts must be the likely script for the language with no region.
* <pre>
* &lt;languageData>
* &lt;likelySubtag from="sr" to="sr_Cyrl_RS"/>
* &lt;likelySubtag from="sr_ME" to="sr_Latn_ME"/>
* </pre>
* So because of the above, we should see Cyrl as the first in the scripts list in the following (which we do).
* <pre>
* &lt;language type="sr" scripts="Cyrl Latn" territories="BA ME RS XK"/>
* <pre>
*/
public void testBasicLanguageDataConsistency() {
Map<String, String> likelyData = SUPPLEMENTAL_DATA_INFO.getLikelySubtags();
Set<String> langOnlyLikelyFrom = new LinkedHashSet<>();

for (Entry<String, String> likelyEntry : likelyData.entrySet()) {
CLDRLocale from = CLDRLocale.getInstance(likelyEntry.getKey());
CLDRLocale to = CLDRLocale.getInstance(likelyEntry.getValue());
String fromLang = from.getLanguage();
if (fromLang.equals("und")) {
continue;
}
if (!from.getScript().isEmpty()) {
continue;
}
boolean noFromRegion = from.getRegion().isEmpty();
if (noFromRegion) {
langOnlyLikelyFrom.add(fromLang);
}
String toScript = to.getScript();

final Map<Type, BasicLanguageData> basicLanguageDataMap =
SUPPLEMENTAL_DATA_INFO.getBasicLanguageDataMap(fromLang);
if (basicLanguageDataMap == null) {
continue;
}
for (Entry<Type, BasicLanguageData> entry : basicLanguageDataMap.entrySet()) {
if (entry.getKey() == Type.secondary) { // skip secondaries
continue;
}
BasicLanguageData data = entry.getValue();
Set<String> scripts = data.getScripts();
// NOTE: this should be an immutable linked hash set to preserve order

String fromAndTo =
JOIN.join(
from.getDisplayName(),
" (",
from,
") ⇒ ",
to.getDisplayName(),
" (",
to,
")");

if (noFromRegion) {
// if there is no fromRegion, then it must match the *first* script.
String first = scripts.isEmpty() ? "missing" : scripts.iterator().next();
assertEquals(

Check failure on line 982 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:982) Error: : aii (aii) ⇒ aii (Syriac,Iraq) (aii_Syrc_IQ): first primary languageData script = likely : expected "Syrc", got "Cyrl"

Check failure on line 982 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:982) Error: : Azerbaijani (az) ⇒ Azerbaijani (Latin,Azerbaijan) (az_Latn_AZ): first primary languageData script = likely : expected "Latn", got "Arab"

Check failure on line 982 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:982) Error: : Bosnian (bs) ⇒ Bosnian (Latin,Bosnia & Herzegovina) (bs_Latn_BA): first primary languageData script = likely : expected "Latn", got "Cyrl"

Check failure on line 982 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:982) Error: : Chakma (ccp) ⇒ Chakma (Chakma,Bangladesh) (ccp_Cakm_BD): first primary languageData script = likely : expected "Cakm", got "Beng"

Check failure on line 982 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:982) Error: : cjs (cjs) ⇒ cjs (Latin,Russia) (cjs_Latn_RU): first primary languageData script = likely : expected "Latn", got "Cyrl"

Check failure on line 982 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:982) Error: : ctd (ctd) ⇒ ctd (Pau Cin Hau,Myanmar (Burma)) (ctd_Pauc_MM): first primary languageData script = likely : expected "Pauc", got "Latn"

Check failure on line 982 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:982) Error: : Hausa (ha) ⇒ Hausa (Latin,Nigeria) (ha_Latn_NG): first primary languageData script = likely : expected "Latn", got "Arab"

Check failure on line 982 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:982) Error: : Hmong Njua (hnj) ⇒ Hmong Njua (Nyiakeng Puachue Hmong,United States) (hnj_Hmnp_US): first primary languageData script = likely : expected "Hmnp", got "Laoo"

Check failure on line 982 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:982) Error: : Kazakh (kk) ⇒ Kazakh (Cyrillic,Kazakhstan) (kk_Cyrl_KZ): first primary languageData script = likely : expected "Cyrl", got "Arab"

Check failure on line 982 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:982) Error: : Kurdish (ku) ⇒ Kurdish (Latin,Türkiye) (ku_Latn_TR): first primary languageData script = likely : expected "Latn", got "Arab"
fromAndTo + ": first primary languageData script = likely ",
toScript,
first);
} else {
// otherwise, the likely script must be somewhere in the list,
// but doesn't need to be first
assertTrue(
JOIN.join(
fromAndTo,
": primary languageData scripts ",
scripts,
" must contain ",
toScript),
scripts.contains(toScript));
}
}
}

Set<String> basicDataLanguages = SUPPLEMENTAL_DATA_INFO.getBasicLanguageDataLanguages();
if (basicDataLanguages.contains("und")) {
errln(
"NOTE: should not have 'und' in basic data, eg no:\n\t<language type='und' territories='AQ CP HM' alt='secondary'/>");
basicDataLanguages =
Sets.difference(
SUPPLEMENTAL_DATA_INFO.getBasicLanguageDataLanguages(), Set.of("und"));
}

Set<String> inBasicLanguageDataButNotLikely =
Sets.difference(basicDataLanguages, langOnlyLikelyFrom);
if (!inBasicLanguageDataButNotLikely.isEmpty()) {
errln(
JOIN.join(
"Basic data languages missing some from likely",
inBasicLanguageDataButNotLikely));
}

Set<String> inLikelyButNotBasicLanguageData =
Sets.difference(langOnlyLikelyFrom, basicDataLanguages);
if (!inLikelyButNotBasicLanguageData.isEmpty()) {
warnln(

Check warning on line 1022 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:1022) Warning: Basic data languages missing some from likely (not serious issue)[aaa, aab, aac, aad, aae, aaf, aag, aah, aai, aak, aal, aan, aao, aap, aaq, aas, aat, aau, aaw, aax, aaz, aba, abb, abc, abd, abe, abf, abg, abh, abi, abl, abm, abn, abo, abp, abs, abt, abu, abv, abw, abx, aby, abz, aca, acb, acd, acf, acm, acn, acp, acq, acr, acs, act, acu, acv, acw, acx, acy, acz, adb, add, ade, adf, adg, adh, adi, adj, adl, adn, ado, adq, adr, adt, adu, adw, adx, adz, aea, aec, aee, aek, ael, aem, aeq, aer, aeu, aew, aey, aez, afb, afd, afe, afh, afi, afk, afn, afo, afp, afs, afu, afz, aga, agb, agc, agd, age, agf, agg, agh, agi, agj, agk, agl, agm, agn, ago, agr, ags, agt, agu, agv, agw, agx, agy, agz, aha, ahb, ahg, ahh, ahi, ahk, ahl, ahm, ahn, aho, ahp, ahr, ahs, aht, aia, aib, aic, aid, aie, aif, aig, aij, aik, ail, aim, aio, aip, aiq, air, ait, aiw, aix, aiy, aja, ajg, aji, ajn, ajw, ajz, akb, akc, akd, ake, akf, akg, akh, aki, akl, ako, akp, akq, akr, aks, akt, aku, akv, akw, ala, alc, ald, alf, alh, ali, alj, alk, all, alm, alo, alp, alq, alr, alu, alw, alx, aly, alz, ama, amb, amc, ame, amf, amg, ami, amj, amk, amm, amn, amp, amq, amr, ams, amt, amu, amv, amw, amx, amy, amz, ana, anb, anc, and, ane, anf, anh, ani, anj, ank, anl, anm, ano, anq, anr, ans, ant, anu, anv, anw, anx, any, anz, aoa, aob, aoc, aod, aoe, aof, aog, aoi, aoj, aok, aol, aom, aon, aor, aos, aot, aox, apb, ape, apf, apg, aph, api, apj, apk, apl, apm, apn, apo, app, apr, aps, apt, apu, apv, apw, apx, apy, apz, aqc, aqd, aqg, aqk, aqm, aqn, aqr, aqt, aqz, ard, are, arh, ari, arj, ark, arl, arr, aru, arx, asb, asc, ase, asg, ash, asi, asj, ask, asl, asn, aso, asr, ass, asu, asv, asx, asy, asz, ata, atb, atc, atd, ate, atg, ati, atk, atl, atm, atn, ato, atp, atq, atr, ats, att, atu, atv, atw, atx, aty, atz, aua, auc, aud, aug, auh, aui, auj, auk, aul, aum, aun, auo, aup, auq, aur, aut, auu, auw, auy, auz, avb, avd, avi, avl, avm, avn, avo, avs, avt, avu, avv, awb, awc, 
awe, awg, awh, awi, awk, awm, awn, awo, awr, aws, awt, awu, awv, aww, awx, awy, axb, axe, axg, axk, axl, axm, axx, aya, ayb, ayc, ayd, aye, ayg, ayh, ayi, ayk, ayl, ayn, ayo, ayp, ayq, ays, ayt, ayu, ayz, azb, azd, azg, azm, azn, azo, azt, azz, baa, bab, bac, bae, baf, bag, bah, baj, bao, bau, bav, baw, bay, bba, bbb, bbd, bbe, bbf, bbg, bbi, bbk, bbl, bbm, bbn, bbo, bbp, bbq, bbr, bbs, bbt, bbu, bbv, bbw, bbx, bby, bca, bcb, bcd, bce, bcf, bcg, bch, bcj, bck, bcm, bcn, bco, bcp, bcq, bcr, bcs, bct, bcu, bcv, bcw, bcy, bcz, bda, bdb, bdc, bdd, bde, bdf, bdg, bdh, bdi, bdj, bdk, bdl, bdm, bdn, bdo, bdp, bdq, bdr, bds, bdt, bdu, bdv, bdw, bdx, bdy, bdz, bea, beb, bec, bed, bee, bef, beh, bei, bek, beo, bep, beq, bes, bet, beu, bev, bex, bey, bfa, bfb, bfc, bfe, bff, bfg, bfh, bfj, bfl, bfm, bfn, bfo, bfp, bfs, bfu, bfw, bfx, bfz, bga, bgb, bgd, bgf, bgg, bgi, bgj, bgo, bgp, bgq, bgr, bgs, bgt, bgu, bgv, bgw, bgy, bgz, bha, bhc, bhd, bhe, bhf, bhg, bhh, bhj, bhl, bhm, bhn, bhp, bhq, bhr, bhs, bht, bhu, bhv, bhw, bhy, bhz, bia, bib, bid, bie, bif, big, bil, bim, bio, bip, biq, bir, bit, biu, biv, biw, biy, biz, bja, bjb, bjc, bjf, bjg, bjh, bji, bjk, bjl, bjm, bjo, bjp, bjr, bjs, bju, bjv, bjw, bjx, bjy, bjz, bka, bkc, bkd, bkf, bkg, bkh, bki, bkj, bkk, bkl, bkn, bko, bkp, bkq, bkr, bks, bkt, bkv, bkw, bkx, bky, bkz, blb, blc, bld, ble, blf, blh, bli, blj, blk, blm, bln, blp, blq, blr, bls, blv, blw, blx, bly, blz, bma, bmb, bmc, bmd, bme, bmf, bmg, bmh, bmi, bmj, bmk, bml, bmm, bmn, bmo, bmp, bmr, bms, bmu, bmv, bmw, bmx, bmz, bna, bnb, bnc, bnd, bne, bnf, bng, bni, bnj, bnk, bnm, bnn, bno, bnp, bnq, bnr, bns, bnu, bnv, bnw, bnx, bny, bnz, boa, bob, boe, bof, boh, boj, bok, bol, bom, bon, boo, bop, boq, bor, bot, bou, bov, bow, box, boy, boz, bpa, bpc, bpd, bpe, bpg, bph, bpi, bpj, bpk, bpl, bpm, bpo, bpp, bpq, bpr, bps, bpt, bpu, bpv, bpw, bpx, bpz, bqa, bqb, bqc, bqd, bqf, bqg, bqj, bqk, bql, bqm, bqo, bqp, bqq, bqr, bqs, bqt, bqu, bqw, bqx, bqz, brb, brc, brd, brf, 
brg, bri, brj, brk, brl, brm, brn, bro, brp, brq, brr, brs, brt, bru, brv, brw, bry, brz, bsa,
JOIN.join(
"Basic data languages missing some from likely (not serious issue)",
inLikelyButNotBasicLanguageData));
}
}
}
Loading