@@ -258,39 +258,54 @@ def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
258258 repetition. This function preprocesses Thai text by replacing
259259 Maiyamok with a word being repeated.
260260
261- :param Union[str, List[str]] sent: input sentence (list or str )
261+ :param Union[str, List[str]] sent: sentence (list or string )
262262 :return: list of words
263263 :rtype: List[str]
264264
265265 :Example:
266266 ::
267-
268267 from pythainlp.util import expand_maiyamok
269268
270- expand_maiyamok("เด็กๆกิน ")
271- # output: ['เด็ก ', 'เด็ก ', 'กิน ']
269+ expand_maiyamok("คนๆนก ")
270+ # output: ['คน ', 'คน ', 'นก ']
272271 """
273272 if isinstance (sent , str ):
274273 sent = word_tokenize (sent )
275- _list_word : list [str ] = []
276- i = 0
277- for j , text in enumerate (sent ):
278- if text .isspace () and "ๆ" in sent [j + 1 ]:
279- continue
280- if " ๆ" in text :
281- text = text .replace (" ๆ" , "ๆ" )
282- if "ๆ" == text :
283- text = _list_word [i - 1 ]
284- elif "ๆ" in text :
285- count = text .count ("ๆ" )
286- text = _list_word [i - 1 ]
287- for _ in range (count ):
288- _list_word .append (text )
289- i += 1
274+
275+ # Split off Maiyamok characters that are attached to other text, e.g. "นกๆๆ", "นกๆ ๆ", "นกๆคน"
276+ temp_toks : list [str ] = []
277+ for _ , token in enumerate (sent ):
278+ toks = re .split (r"(ๆ)" , token )
279+ toks = [tok for tok in toks if tok ] # remove empty string ("")
280+ temp_toks .extend (toks )
281+ sent = temp_toks
282+
283+ output_toks : list [str ] = []
284+
285+ yamok = "ๆ"
286+ yamok_count = 0
287+ len_sent = len (sent )
288+ for i in range (len_sent - 1 , - 1 , - 1 ): # do it backward
289+ if yamok_count == 0 or (i + 1 >= len_sent ):
290+ if sent [i ] == yamok :
291+ yamok_count = yamok_count + 1
292+ else :
293+ output_toks .append (sent [i ])
290294 continue
291- _list_word .append (text )
292- i += 1
293- return _list_word
295+
296+ if sent [i ] == yamok :
297+ yamok_count = yamok_count + 1
298+ else :
299+ if sent [i ].isspace ():
300+ if yamok_count > 0 : # remove space before yamok
301+ continue
302+ else : # with preprocessing above, this should not happen
303+ output_toks .append (sent [i ])
304+ else :
305+ output_toks .extend ([sent [i ]] * (yamok_count + 1 ))
306+ yamok_count = 0
307+
308+ return output_toks [::- 1 ]
294309
295310
296311def maiyamok (sent : Union [str , List [str ]]) -> List [str ]:
@@ -303,7 +318,7 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
303318 repetition. This function preprocesses Thai text by replacing
304319 Maiyamok with a word being repeated.
305320
306- :param Union[str, List[str]] sent: input sentence (list or str )
321+ :param Union[str, List[str]] sent: sentence (list or string )
307322 :return: list of words
308323 :rtype: List[str]
309324
@@ -312,8 +327,8 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
312327
313328 from pythainlp.util import expand_maiyamok
314329
315- expand_maiyamok("เด็กๆกิน ")
316- # output: ['เด็ก ', 'เด็ก ', 'กิน ']
330+ expand_maiyamok("คนๆนก ")
331+ # output: ['คน ', 'คน ', 'นก ']
317332 """
318333 warn_deprecation (
319334 "pythainlp.util.maiyamok" , "pythainlp.util.expand_maiyamok"
0 commit comments