66"""
77
# Public API of this module, kept in alphabetical order.
__all__ = [
    "bigram_word_freqs",
    "trigram_word_freqs",
    "unigram_word_freqs",
    "word_freqs",
]
1414
1515from collections import defaultdict
1616from typing import List , Tuple
1717
1818from pythainlp .corpus import get_corpus , get_corpus_path
1919
# File name passed to get_corpus() for the unigram frequency list.
_UNIGRAM_FILENAME = "tnc_freq.txt"
# Corpus names passed to get_corpus_path() for the n-gram frequency lists.
_BIGRAM_CORPUS_NAME = "tnc_bigram_word_freqs"
_TRIGRAM_CORPUS_NAME = "tnc_trigram_word_freqs"
2323
2424
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai National Corpus (TNC)

    Each corpus line is split on a tab character; the first field is used
    as the word and the second field as its frequency. Lines with fewer
    than two fields are skipped.

    Credit: Korakot Chaovavanich https://www.facebook.com/groups/thainlp/posts/434330506948445

    :return: list of (word, frequency) pairs
    :rtype: List[Tuple[str, int]]
    :raises ValueError: if the frequency field of a line is not an integer
    """
    freqs: list[tuple[str, int]] = []
    # Iterate the corpus directly; no intermediate list is needed.
    for line in get_corpus(_UNIGRAM_FILENAME):
        fields = line.split("\t")
        if len(fields) >= 2:
            freqs.append((fields[0], int(fields[1])))

    return freqs
4141
4242
def unigram_word_freqs() -> dict[str, int]:
    """
    Get unigram word frequency from Thai National Corpus (TNC)

    Each corpus line is stripped and split into fields; the first field is
    the word and the last field is its frequency. Lines with fewer than two
    fields are skipped.

    :return: mapping from word to frequency. The mapping is a
        ``defaultdict(int)``, so looking up a word that is not present
        yields 0.
    :rtype: dict[str, int]
    :raises ValueError: if the frequency field of a line is not an integer
    """
    freqs: dict[str, int] = defaultdict(int)
    for line in get_corpus(_UNIGRAM_FILENAME):
        # NOTE(review): this reads the same file that word_freqs() splits
        # on a tab, so the separator is written as an explicit "\t" here
        # rather than an invisible literal whitespace character -- confirm
        # against the corpus file.
        fields = line.strip().split("\t")
        if len(fields) >= 2:
            freqs[fields[0]] = int(fields[-1])

    return freqs
5555
5656
def bigram_word_freqs() -> dict[Tuple[str, str], int]:
    """
    Get bigram word frequency from Thai National Corpus (TNC)

    Each line of the corpus file is stripped and split into fields; the
    first two fields form the bigram and the last field is its frequency.
    Lines with fewer than three fields (for example blank lines) are
    skipped.

    :return: mapping from (word1, word2) to frequency. The mapping is a
        ``defaultdict(int)``, so looking up a bigram that is not present
        yields 0. An empty mapping is returned when the corpus path
        cannot be resolved.
    :rtype: dict[Tuple[str, str], int]
    :raises ValueError: if the frequency field of a line is not an integer
    """
    freqs: dict[tuple[str, str], int] = defaultdict(int)
    path = get_corpus_path(_BIGRAM_CORPUS_NAME)
    if not path:
        return freqs

    # "utf-8-sig" drops a leading byte-order mark if the file has one.
    with open(str(path), "r", encoding="utf-8-sig") as fh:
        for line in fh:
            # NOTE(review): separator assumed to be a tab, matching the
            # unigram file handled by word_freqs() -- confirm against the
            # corpus file.
            fields = line.strip().split("\t")
            if len(fields) < 3:
                # Not enough fields for two words plus a count.
                continue
            freqs[(fields[0], fields[1])] = int(fields[-1])

    return freqs
6973
7074
def trigram_word_freqs() -> dict[Tuple[str, str, str], int]:
    """
    Get trigram word frequency from Thai National Corpus (TNC)

    Each line of the corpus file is stripped and split into fields; the
    first three fields form the trigram and the last field is its
    frequency. Lines with fewer than four fields (for example blank lines)
    are skipped.

    :return: mapping from (word1, word2, word3) to frequency. The mapping
        is a ``defaultdict(int)``, so looking up a trigram that is not
        present yields 0. An empty mapping is returned when the corpus
        path cannot be resolved.
    :rtype: dict[Tuple[str, str, str], int]
    :raises ValueError: if the frequency field of a line is not an integer
    """
    freqs: dict[tuple[str, str, str], int] = defaultdict(int)
    path = get_corpus_path(_TRIGRAM_CORPUS_NAME)
    if not path:
        return freqs

    # "utf-8-sig" drops a leading byte-order mark if the file has one.
    with open(str(path), "r", encoding="utf-8-sig") as fh:
        for line in fh:
            # NOTE(review): separator assumed to be a tab, matching the
            # unigram file handled by word_freqs() -- confirm against the
            # corpus file.
            fields = line.strip().split("\t")
            if len(fields) < 4:
                # Not enough fields for three words plus a count.
                continue
            freqs[(fields[0], fields[1], fields[2])] = int(fields[-1])

    return freqs
0 commit comments