-
Notifications
You must be signed in to change notification settings - Fork 4
/
language-order.go
153 lines (131 loc) · 3.74 KB
/
language-order.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
package main
import (
"encoding/json"
"fmt"
"math"
"os"
"path"
"path/filepath"
"sort"
"strconv"
"strings"
"github.com/elliotchance/pie/v2"
"github.com/go-shiori/dom"
)
// createLanguageOrder returns every key of languageLocales arranged in a
// priority order.
//
// Languages with a positive population figure (from parseCldrTerritory) are
// ranked first: entries present in the W3Techs map (from
// parseW3CommonLanguages) come before the rest and are ordered by their index
// in that map; the remaining ones are ordered by population, largest first,
// with equal populations broken by descending language name.
//
// Languages without a positive population are then sorted by name and placed
// one by one directly after their parent language (the name with
// rxLocaleCleaner matches removed) when that parent is in the ranked list, or
// appended at the end otherwise.
//
// An error from either parser is returned unchanged.
func createLanguageOrder(languageLocales map[string][]string) ([]string, error) {
	// Population per language, from the CLDR territory data.
	populations, err := parseCldrTerritory()
	if err != nil {
		return nil, err
	}

	// Position of each language in the W3Techs listing.
	commonRanks, err := parseW3CommonLanguages()
	if err != nil {
		return nil, err
	}

	// Split the languages by whether a positive population is known.
	var ranked, orphans []string
	for lang := range languageLocales {
		if populations[lang] > 0 {
			ranked = append(ranked, lang)
			continue
		}
		orphans = append(orphans, lang)
	}

	sort.Slice(ranked, func(a, b int) bool {
		left, right := ranked[a], ranked[b]
		leftRank, leftCommon := commonRanks[left]
		rightRank, rightCommon := commonRanks[right]

		// Exactly one side is a common language: that side goes first.
		if leftCommon != rightCommon {
			return leftCommon
		}
		// Both are common: keep the order of the W3Techs listing.
		if leftCommon {
			return leftRank < rightRank
		}

		// Neither is common: larger population first, name as tie-breaker.
		leftPop, rightPop := populations[left], populations[right]
		if leftPop != rightPop {
			return leftPop > rightPop
		}
		return left > right
	})

	// Re-attach the languages that had no population data.
	sort.Strings(orphans)
	for _, orphan := range orphans {
		parent := rxLocaleCleaner.ReplaceAllString(orphan, "")
		at := pie.FindFirstUsing(ranked, func(candidate string) bool {
			return candidate == parent
		})
		if at < 0 {
			ranked = append(ranked, orphan)
			continue
		}
		// Inserted right behind the parent, so a later sibling lands in
		// front of an earlier one.
		ranked = pie.Insert(ranked, at+1, orphan)
	}

	return ranked, nil
}
// parseCldrTerritory decodes the CLDR territory info JSON found under RAW_DIR
// and returns, per language tag, the sum over all territories of
// round(territory population * language population percent).
//
// Underscores in language tags are replaced with hyphens before summing. The
// percent value is multiplied in as read; it is not divided by 100. Errors
// from strconv.ParseFloat are discarded, so a malformed number is used as
// whatever ParseFloat returned (0 for a syntax error).
//
// An error is returned only when the file cannot be opened or decoded.
func parseCldrTerritory() (map[string]int, error) {
	src, err := os.Open(filepath.Join(RAW_DIR, "cldr-core/supplemental/territoryInfo.json"))
	if err != nil {
		return nil, err
	}
	defer src.Close()

	var parsed CldrTerritoryData
	if err := json.NewDecoder(src).Decode(&parsed); err != nil {
		return nil, err
	}

	totals := map[string]int{}
	for _, territory := range parsed.Supplemental.TerritoryInfo {
		// Parse errors are dropped here and below; see the function comment.
		headcount, _ := strconv.ParseFloat(territory.Population, 64)
		for tag, stats := range territory.LanguagePopulation {
			share, _ := strconv.ParseFloat(stats.PopulationPercent, 64)
			key := strings.ReplaceAll(tag, "_", "-")
			totals[key] += int(math.Round(headcount * share))
		}
	}

	return totals, nil
}
// parseW3CommonLanguages parses the saved W3Techs content-language HTML page
// under RAW_DIR and returns a map from language code to the zero-based
// position of its link among the elements matched by "table.bars th a".
//
// Each code is derived from the link's href: the last path element is
// lower-cased, a leading "cl" is removed, and surrounding hyphens are trimmed.
// If the same code is produced more than once, the last position wins.
//
// As a sanity check on the parsed page, an error is returned unless "en" is
// present and sits at position 0. Errors from opening or parsing the file are
// returned unchanged.
func parseW3CommonLanguages() (map[string]int, error) {
	// Open HTML file
	fPath := filepath.Join(RAW_DIR, "w3techs", "content_language.html")
	f, err := os.Open(fPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Parse HTML
	doc, err := dom.Parse(f)
	if err != nil {
		return nil, err
	}

	// Fetch the common languages
	commonLanguages := map[string]int{}
	for i, a := range dom.QuerySelectorAll(doc, "table.bars th a") {
		href := dom.GetAttribute(a, "href")
		langCode := path.Base(href)
		langCode = strings.ToLower(langCode)
		langCode = strings.TrimPrefix(langCode, "cl")
		langCode = strings.Trim(langCode, "-")
		commonLanguages[langCode] = i
	}

	// English must be present AND first. A plain commonLanguages["en"] lookup
	// yields 0 for a missing key, which would let an empty or English-less
	// listing pass the check, so presence is tested explicitly.
	if enIdx, found := commonLanguages["en"]; !found || enIdx != 0 {
		return nil, fmt.Errorf("english is not the first language")
	}

	return commonLanguages, nil
}