Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions SUPPORTED_LANGUAGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,4 @@ and [documentation](https://docs.rs/whatlang/).
| Tagalog | tgl | `Lang::Tgl` |
| Armenian | hye | `Lang::Hye` |
| Welsh | cym | `Lang::Cym` |
| Kyrgyz | kir | `Lang::Kir` |
1 change: 1 addition & 0 deletions misc/alphabets/cyrillic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ ukr: абвгдежзийклмнопрстуфхцчшщьюяєіїґ
bel: абвгдежзйклмнопрстуфхцчшыьэюяёіў
srp: абвгдежзиклмнопрстуфхцчшђјљњћџ
mkd: абвгдежзиклмнопрстуфхцчшѓѕјљњќџ
kir: абвгдеёжзийклмнңоөпрстуүфхцчшщъыьэюя
51 changes: 51 additions & 0 deletions misc/kyrgyz_corpus/fetch_wiki.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash
# Fetch Kyrgyz Wikipedia articles for trigram generation.
#
# For every title in ARTICLES this script asks the ky.wikipedia.org API for
# the plain-text extract of the page and appends it to OUTPUT_FILE (which is
# truncated first). Fetching is best-effort: a title that cannot be
# downloaded or parsed is reported on stderr and skipped, it does not abort
# the run.
#
# Requires: curl, python3.

ARTICLES=(
    "Кыргызстан"
    "Кыргыз_тили"
    "Бишкек"
    "Ош"
    "Тарых"
    "Кыргыз_элинин_тарыхы"
    "Манас"
    "Ысык-Көл"
    "Ала-Тоо"
    "Кыргыз_маданияты"
    "Түрк_тилдери"
    "Ислам"
    "Кыргыз_Республикасынын_Президенти"
    "Музыка"
    "Адабият"
    "Саясат"
    "Экономика"
    "Билим_берүү"
    "Табият"
    "Спорт"
)

OUTPUT_FILE="kyrgyz_corpus.txt"
# Start from an empty corpus file on every run.
: > "$OUTPUT_FILE"

for article in "${ARTICLES[@]}"; do
    # Hand the title to Python through argv instead of splicing it into the
    # Python source, so quotes or backslashes in a title cannot break (or
    # inject into) the one-liner.
    encoded=$(python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1]))' "$article")
    url="https://ky.wikipedia.org/w/api.php?action=query&titles=$encoded&prop=extracts&explaintext=true&format=json"

    # -S keeps curl quiet on success but prints transport errors; --fail makes
    # HTTP error pages produce no body instead of being parsed as JSON.
    content=$(curl -sS -L --fail -A "Mozilla/5.0 (compatible; WhatlangBot/1.0)" "$url" | python3 -c '
import json
import sys

try:
    data = json.load(sys.stdin)
    pages = data.get("query", {}).get("pages", {})
    for page in pages.values():
        if "extract" in page:
            print(page["extract"])
except Exception as exc:
    # Best-effort: report the problem and let the shell loop move on.
    print(f"warning: could not read API response for {sys.argv[1]}: {exc}", file=sys.stderr)
' "$article")

    if [ -n "$content" ]; then
        # printf instead of echo: article text starting with "-n"/"-e" must
        # not be interpreted as an echo option. Output is the text, a
        # newline, and one blank separator line.
        printf '%s\n\n' "$content" >> "$OUTPUT_FILE"
    else
        echo "warning: no extract fetched for $article" >&2
    fi
done

echo "Corpus size: $(wc -c < "$OUTPUT_FILE") bytes"
77 changes: 77 additions & 0 deletions misc/kyrgyz_corpus/generate_trigrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""
Generate trigrams for Kyrgyz language from corpus.
Output format is compatible with whatlang-rs data.json
"""

import re
from collections import Counter


def extract_trigrams(text: str, top_n: int = 300) -> str:
    """
    Extract the top N most frequent trigrams from text.

    The text is lowercased and every character that is not a letter
    (punctuation, digits, underscores, symbols, whitespace) is treated as a
    word separator. Each word is then padded with one space on both sides,
    similar to how whatlang-rs processes text. For example, word "кыргыз"
    becomes " кыргыз " and produces trigrams:
    " кы", "кыр", "ырг", "ргы", "гыз", "ыз ".

    Args:
        text: Raw corpus text.
        top_n: Maximum number of trigrams to return.

    Returns:
        The trigrams joined with '|', most frequent first (ties keep the
        order of first occurrence). An empty string if the text contains no
        letters.
    """
    text = text.lower()

    # Keep letters only. A regex such as [^\w\s] would let '_' through
    # (it is a "word" character) and \d would miss non-decimal numerics like
    # '²', so classify characters with str.isalpha() instead. This keeps the
    # Kyrgyz-specific Cyrillic letters ң, ө, ү like any other letter.
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in text)

    trigrams = Counter()

    # str.split() with no argument collapses runs of whitespace and never
    # yields empty strings, so every word here has at least one letter.
    for word in letters_only.split():
        padded = f' {word} '
        for i in range(len(padded) - 2):
            trigrams[padded[i:i + 3]] += 1

    # Get top N trigrams and format as pipe-separated string
    top_trigrams = [t for t, _ in trigrams.most_common(top_n)]
    return '|'.join(top_trigrams)


def main():
    """
    Build the Kyrgyz trigram list from the local corpus.

    Reads 'kyrgyz_corpus.txt' from the current working directory, extracts
    the 300 most frequent trigrams, prints a short report (corpus size,
    trigram count, first 20 trigrams, full string), and writes the
    pipe-separated trigram string to 'trigrams.txt'.
    """
    # Read the corpus
    with open('kyrgyz_corpus.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    print(f"Corpus size: {len(text)} characters")

    # Generate trigrams
    trigram_str = extract_trigrams(text, 300)

    # ''.split('|') returns [''], which would be reported as one trigram;
    # an empty result must count as zero.
    trigrams = trigram_str.split('|') if trigram_str else []
    print(f"Generated {len(trigrams)} trigrams")

    # Show first 20 trigrams as sample
    print("\nFirst 20 trigrams:")
    for i, t in enumerate(trigrams[:20]):
        print(f" {i+1}. '{t}'")

    # Save to file
    with open('trigrams.txt', 'w', encoding='utf-8') as f:
        f.write(trigram_str)

    print("\nTrigrams saved to trigrams.txt")

    # Also print the full string for easy copy-paste
    print("\n--- Full trigram string for data.json ---")
    print(trigram_str)


# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
1,107 changes: 1,107 additions & 0 deletions misc/kyrgyz_corpus/kyrgyz_corpus.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions misc/kyrgyz_corpus/trigrams.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ын |ан | жа| ка| ба| кы|да |на |ары|ана|ен |нда|ин |тар|ган| ал|ар |ала| би|ык | ма|кыр| бо|ард|дын|лар| та|ырг|нын|жан|ргы|гыз| тү|уу |алы| ме|анд|рды|кан| кө| ко|аны|ара| жы|ынд|кал|агы|үн |дар|тан| ай|жыл|ат | ке| са|бол|мен|или|нен| ша|ста|ылы|лга| де|ры |ене| то|ун |ыз |лык| ар|кта|кар|ген|рын|ып | же|ери| ти|ыны|тер|бир|ика|үү |гы | ор|асы|шка|ору|ик |ына|аар|акт|баш| бе|аты|ата|га |тил|дан|ман|тын| со|өн |ай | ку|айы|ада|шаа| да|түр| ту|ашк|де |сын|ет |олу|ри | ки| эл|лик|етт|кет|лды|ды |нды|ула|дин|нин|алг| өз|ети|тал|ка | бу|ук | чы| жо|ек |ири|ини|тта|рда|ерд|тик|рди|сы |еке|нде|ылд|даг|ер |тур|бай|алд|ишк|он |еги|еп |ага|ты |тти| ре|кы | ат| те|туу|өнү|улу|рга| ад|луу|дык|ура|атт|кте|шке|лек| че| эк|айл|өрү|ект|рин|нас|уп |бор|инд|лы |орт|ти |шта|уру|ча |лан|бар|икт|тин|гөн|нүн|кыл|кел|ак |ине|лда| өн|кур|көл|кон| ан|үнд|акы|рал|бер|деп| ча|лер|рат|жай|кат| ош|ир |оло|ал |ясы|ызс|мам|мак|кек|айт|зст|ыгы|тоо|уну| ми|түз| су|лат|йма|бул|биш|үгү|ция|амл|лин|айд|ги |гын|гон|мле|эле|еге|жак|кен|үнү|ашт|айм|дам|гиз|нун| ок|ндү|ия |ул |үк |ияс|иял|рес| аз|та | не|йла|ны |ого|өлү|жер|үрк| иш|ип |дук|бал|еле|үрү|ндө|ону|илд|ыла|кты|үзү|йин| ак|бли|тай|йын| эм|ыкт|арт|есп|спу|пуб
1 change: 1 addition & 0 deletions misc/supported_languages.csv
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,4 @@ cat,Catalan,Català,10
tgl,Tagalog,Tagalog,
hye,Armenian,Հայերեն,7
cym,Welsh,Cymraeg,0.5
kir,Kyrgyz,Кыргызча,6
2 changes: 2 additions & 0 deletions src/alphabets/cyrillic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ const UKR: &str = "абвгдежзийклмнопрстуфхцчшщьюяє
const BEL: &str = "абвгдежзйклмнопрстуфхцчшыьэюяёіў";
const SRP: &str = "абвгдежзиклмнопрстуфхцчшђјљњћџ";
const MKD: &str = "абвгдежзиклмнопрстуфхцчшѓѕјљњќџ";
const KIR: &str = "абвгдеёжзийклмнңоөпрстуүфхцчшщъыьэюя";

const CYRILLIC_ALPHABETS: &[(Lang, &str)] = &[
(Lang::Bul, BUL),
Expand All @@ -18,6 +19,7 @@ const CYRILLIC_ALPHABETS: &[(Lang, &str)] = &[
(Lang::Bel, BEL),
(Lang::Srp, SRP),
(Lang::Mkd, MKD),
(Lang::Kir, KIR),
];

/// Inverted map binding a character to a set of languages.
Expand Down
12 changes: 10 additions & 2 deletions src/lang.rs
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,12 @@ pub enum Lang {

/// Cymraeg (Welsh)
Cym = 69,

/// Кыргызча (Kyrgyz)
Kir = 70,
}

const VALUES: [Lang; 70] = [
const VALUES: [Lang; 71] = [
Lang::Epo,
Lang::Eng,
Lang::Rus,
Expand Down Expand Up @@ -299,6 +302,7 @@ const VALUES: [Lang; 70] = [
Lang::Tgl,
Lang::Hye,
Lang::Cym,
Lang::Kir,
];

fn lang_from_code<S: Into<String>>(code: S) -> Option<Lang> {
Expand Down Expand Up @@ -373,6 +377,7 @@ fn lang_from_code<S: Into<String>>(code: S) -> Option<Lang> {
"tgl" => Some(Lang::Tgl),
"hye" => Some(Lang::Hye),
"cym" => Some(Lang::Cym),
"kir" => Some(Lang::Kir),
_ => None,
}
}
Expand Down Expand Up @@ -449,6 +454,7 @@ fn lang_to_code(lang: Lang) -> &'static str {
Lang::Tgl => "tgl",
Lang::Hye => "hye",
Lang::Cym => "cym",
Lang::Kir => "kir",
}
}

Expand Down Expand Up @@ -524,6 +530,7 @@ fn lang_to_name(lang: Lang) -> &'static str {
Lang::Tgl => "Tagalog",
Lang::Hye => "Հայերեն",
Lang::Cym => "Cymraeg",
Lang::Kir => "Кыргызча",
}
}

Expand Down Expand Up @@ -599,6 +606,7 @@ fn lang_to_eng_name(lang: Lang) -> &'static str {
Lang::Tgl => "Tagalog",
Lang::Hye => "Armenian",
Lang::Cym => "Welsh",
Lang::Kir => "Kyrgyz",
}
}

Expand Down Expand Up @@ -708,7 +716,7 @@ mod tests {

#[test]
fn test_all() {
assert_eq!(Lang::all().len(), 70);
assert_eq!(Lang::all().len(), 71);
let all = Lang::all();
assert!(all.contains(&Lang::Ukr));
assert!(all.contains(&Lang::Swe));
Expand Down
3 changes: 2 additions & 1 deletion src/scripts/lang_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,14 @@ const LATIN_LANGS: [Lang; 37] = [
Lang::Lat,
Lang::Cym,
];
const CYRILLIC_LANGS: [Lang; 6] = [
const CYRILLIC_LANGS: [Lang; 7] = [
Lang::Rus,
Lang::Ukr,
Lang::Srp,
Lang::Bel,
Lang::Bul,
Lang::Mkd,
Lang::Kir,
];
const ARABIC_LANGS: [Lang; 3] = [Lang::Ara, Lang::Urd, Lang::Pes];
const DEVANAGARI_LANGS: [Lang; 3] = [Lang::Hin, Lang::Mar, Lang::Nep];
Expand Down
Loading