-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit.py
More file actions
37 lines (25 loc) · 1.09 KB
/
split.py
File metadata and controls
37 lines (25 loc) · 1.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import re
# prompt: Write Python code that tokenizes every string in a list of strings:
# each Chinese character is one token, each English word is one token, and
# every character that is neither Chinese nor English is its own single token.
# Sample inputs mixing CJK characters, English words, digits, and punctuation.
strings = ["你好nihao世界?>:{{:.sh;ijie Hello world!", "这是123个测试例子."]
def tokenize_strings(str_list):
    """Tokenize each string in *str_list*.

    Each CJK character (U+4E00..U+9FFF) becomes one token, each run of
    ASCII letters (an English word) becomes one token, and every other
    character (digits, punctuation, whitespace, ...) is its own
    single-character token.

    Args:
        str_list: iterable of strings to tokenize.

    Returns:
        A list with one token list (list of str) per input string.
    """
    # Alternatives are tried left to right: CJK char | letter run | any other char.
    # Compiled once and hoisted out of the loop.
    pattern = re.compile(r'[\u4e00-\u9fff]|[a-zA-Z]+|[^a-zA-Z\u4e00-\u9fff]')
    # Comprehension replaces the manual append loop (one findall per string).
    return [pattern.findall(s) for s in str_list]
# Show each sample string's token list followed by its token count.
tokenized = tokenize_strings(strings)
for t in tokenized:
    print(t, len(t), sep="\n")
def get_tokenize_length(str_list):
    """Return the total token count across all strings in *str_list*.

    Applies the same tokenization rule as the tokenizer above: one token
    per CJK character, per run of ASCII letters, and per any other
    single character.

    Args:
        str_list: iterable of strings.

    Returns:
        int: total number of tokens over all input strings.
    """
    # Same pattern: CJK char | letter run | any other single char.
    pattern = re.compile(r'[\u4e00-\u9fff]|[a-zA-Z]+|[^a-zA-Z\u4e00-\u9fff]')
    # sum() over a generator replaces the manual accumulator loop.
    return sum(len(pattern.findall(s)) for s in str_list)
# Total token count across all sample strings.
length = get_tokenize_length(strings)
print(length)