-
Notifications
You must be signed in to change notification settings - Fork 1
/
step1.py
98 lines (91 loc) · 2.77 KB
/
step1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# # coding:utf-8
#
# import jieba
# import re
# import random
# import pickle
#
#
# def cut_word(text):
# restr = '[0-9\s+\.\!\/_,$%^*();?:\-<>《》【】+\"\']+|[+——!,;。?:、~@#¥%……&*()]+'
# resu = text.replace('|', '').replace(' ', '').replace('ldquo', '').replace('rdquo',
# '').replace(
# 'lsquo', '').replace('rsquo', '').replace('“', '').replace('”', '').replace('〔', '').replace('〕', '')
# resu = re.split(r'\s+', resu)
# dr = re.compile(r'<[^>]+>', re.S)
# dd = dr.sub('', ''.join(resu))
# line = re.sub(restr, '', dd)
# seg_list = jieba.lcut(line)
# return seg_list
#
#
# pos = [i for i in open('DATA/pos.txt', 'r', encoding='utf8')]
# neg = [i for i in open('DATA/neg.txt', 'r', encoding='utf8')]
# xs = list()
# ys = list()
# for i in range(len(pos)):
# xs.append(" ".join(cut_word(pos[i])))
# ys.append("1")
# if i % 100 == 0:
# print(i / 100)
# for i in range(len(neg)):
# xs.append(' '.join(cut_word(neg[i])))
# ys.append("0")
# if i % 100 == 0:
# print(i / 100)
# train = []
# label = []
#
# id = list(range(len(ys)))
# random.shuffle(id)
# f = open("DATA/train.txt", "w", encoding="utf-8")
# g = open("DATA/label.txt", "w", encoding="utf-8")
# for i in id:
# f.write(xs[i] + "\n")
# g.write(ys[i] + "\n")
# f.close()
# g.close()
#
# pass
# coding:utf-8
import jieba
import re
import random
import pickle
# Literal fragments deleted from the text before any regex runs. They are
# removed in this exact order because deleting one fragment can splice
# neighbouring characters into another (e.g. "ld|quo" -> "ldquo").
# NOTE(review): 'ldquo'/'rdquo'/'lsquo'/'rsquo' are presumably leftovers of
# HTML entities such as "&ldquo;" -- confirm against the corpus.
_STRIP_TOKENS = ('|', ' ', 'ldquo', 'rdquo', 'lsquo', 'rsquo', '“', '”', '〔', '〕')

_WHITESPACE_RE = re.compile(r'\s+')

# Anything that looks like a markup tag: "<", one or more non-">", ">".
_TAG_RE = re.compile(r'<[^>]+>', re.S)

# Runs of digits, whitespace and ASCII/CJK punctuation or symbols.
# Written as a raw string: the pattern contains regex escapes (\s, \., \-)
# that are not valid *string* escapes and would otherwise raise a
# SyntaxWarning on current Python versions.
_NOISE_RE = re.compile(
    r'[0-9\s+\.\!\/_,$%^*();?:\-<>《》【】+\"\']+|[+——!,;。?:、~@#¥%……&*()]+'
)


def cut_word(text):
    """Clean one line of raw text and segment it with jieba.

    Cleaning steps, in order:
      1. delete every fragment listed in ``_STRIP_TOKENS``;
      2. delete all remaining whitespace;
      3. delete tag-like ``<...>`` spans;
      4. delete digits and punctuation/symbol runs.

    Args:
        text: The raw input string (one sample).

    Returns:
        Whatever ``jieba.lcut`` returns for the cleaned string (a list of
        word tokens).
    """
    for token in _STRIP_TOKENS:
        text = text.replace(token, '')
    text = _WHITESPACE_RE.sub('', text)
    text = _TAG_RE.sub('', text)
    text = _NOISE_RE.sub('', text)
    return jieba.lcut(text)
# Build the training files: segment every positive/negative sample, label it,
# shuffle, and write the text and the labels to two line-aligned files.

# Read both corpora up front, one sample per line; the context managers make
# sure the input handles are closed as soon as the lines are in memory.
with open('DATA/pos.txt', 'r', encoding='utf8') as pos_file:
    pos = list(pos_file)
with open('DATA/neg.txt', 'r', encoding='utf8') as neg_file:
    neg = list(neg_file)

xs = []  # space-joined tokens of each sample
ys = []  # matching label: "1" for pos.txt lines, "0" for neg.txt lines

for i, sentence in enumerate(pos):
    xs.append(" ".join(cut_word(sentence)))
    ys.append("1")
    if i % 100 == 0:
        # Progress indicator: one line per 100 processed samples.
        print(i / 100)

for i, sentence in enumerate(neg):
    xs.append(' '.join(cut_word(sentence)))
    ys.append("0")
    if i % 100 == 0:
        print(i / 100)

# Shuffle indices rather than the data so that xs and ys stay aligned.
order = list(range(len(ys)))
random.shuffle(order)

# Line k of train.txt and line k of label.txt describe the same sample.
with open("DATA/train.txt", "w", encoding="utf-8") as f, \
        open("DATA/label.txt", "w", encoding="utf-8") as g:
    for i in order:
        f.write(xs[i] + "\n")
        g.write(ys[i] + "\n")