forked from czz989898/machine-learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
beyesi_get_batch.py
65 lines (57 loc) · 2.4 KB
/
beyesi_get_batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk
#nltk.download()  # one-time setup: uncomment (or run nltk.download('stopwords')) to fetch the stop-word corpus
# Punctuation marks that are split off from a directly preceding letter/digit
# so that they end up as separate space-delimited tokens.
_DETACHED_PUNCTUATION = frozenset('!@#$,.?:')


def _detach_punctuation(sentence):
    """Return *sentence* with a space inserted between a letter/digit and a
    punctuation mark from ``_DETACHED_PUNCTUATION`` that directly follows it."""
    pieces = []
    last_index = len(sentence) - 1
    for index, char in enumerate(sentence):
        pieces.append(char)
        if (
            index < last_index
            and (char.isalpha() or char.isdigit())
            and sentence[index + 1] in _DETACHED_PUNCTUATION
        ):
            pieces.append(' ')
    return ''.join(pieces)


def get_feature(
    train_path=r'E:\home work\semester3\machine learning\assignment\data\train_tweets.txt',
    feature_path=r'E:\home work\semester3\machine learning\assignment\data\feature_word_data.txt',
    stop_words=None,
):
    """Pre-process the raw tweet file into a cleaned "label<TAB>words" file.

    Each input line is split on tabs; the first field is copied through
    unchanged and the second field is cleaned as follows:

    1. surrounding whitespace is stripped;
    2. a space is inserted before ``! @ # $ , . ? :`` when the mark directly
       follows a letter or digit, so the mark becomes its own token;
    3. all commas and periods are removed;
    4. the text is split on single spaces, each token is lower-cased, and
       empty tokens and stop words are dropped.

    One output line is written per non-blank input line, in the form
    ``<first field>\\t<token> <token> ... \\n`` (every kept token is followed
    by a single space).  A progress counter is printed every 100 input lines.

    Args:
        train_path: Path of the tab-separated input file (read as UTF-8).
        feature_path: Path of the output file (overwritten, UTF-8).
        stop_words: Iterable of lower-case words to drop.  When ``None`` the
            NLTK English stop-word list is used.

    Raises:
        IndexError: If a non-blank input line contains no tab separator.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once: the original rebuilt the NLTK list for every
    # single token, which dominated the run time.
    stop_words = frozenset(stop_words)

    with open(train_path, encoding='UTF-8') as train_data:
        with open(feature_path, 'w', encoding='UTF-8') as feature_word_data:
            for count, line in enumerate(train_data, start=1):
                if count % 100 == 0:
                    print(count)
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) carry no record.
                    continue

                info = line.split('\t')
                sentence = _detach_punctuation(info[1].strip())
                # str.replace returns a new string; the result must be kept.
                sentence = sentence.replace(',', '').replace('.', '')

                kept = []
                for raw_token in sentence.split(' '):
                    token = raw_token.lower()
                    if token and token not in stop_words:
                        kept.append(token)

                feature_word_data.write(
                    info[0] + '\t' + ''.join(token + ' ' for token in kept) + '\n'
                )
def get_batch_train(
    num,
    config_path=r'E:\home work\semester3\machine learning\assignment\data\config.txt',
    feature_path=r'E:\home work\semester3\machine learning\assignment\data\feature_word_data.txt',
):
    """Load the next batch of ``num`` pre-processed lines and vectorize them.

    The zero-based start offset is read from the config file (the last
    non-blank line wins; an empty file means offset 0).  Lines
    ``start .. start + num - 1`` of the feature file are split on tabs: the
    second field is collected as the document text, the first field as its
    label.  After vectorizing, ``start + num`` is written back to the config
    file so the following call continues right after this batch.

    The original loop collected ``num + 2`` lines while still advancing the
    stored offset by ``num``, so the last two lines of every batch were
    returned again by the next call; the batch is now exactly ``num`` lines.

    Note: a fresh ``CountVectorizer`` is fitted on every call, so the columns
    of the returned array come from this batch's vocabulary only.

    Args:
        num: Number of lines to put in the batch.
        config_path: Path of the file holding the current start offset.
        feature_path: Path of the "label<TAB>words" file to read from.

    Returns:
        A tuple ``(a, y_data, flag)`` where ``a`` is the dense array from
        ``CountVectorizer().fit_transform(doc).toarray()``, ``y_data`` is the
        list of first-field strings for the batch, and ``flag`` is ``True``
        when at least one more line exists after this batch.

    Raises:
        ValueError: If a non-blank config line is not an integer.
            NOTE(review): ``fit_transform`` is also expected to raise
            ValueError when the batch is empty -- confirm against the
            installed scikit-learn version.
        IndexError: If a selected line contains no tab separator.
    """
    start = 0
    with open(config_path, encoding='UTF-8') as config:
        for line in config:
            if line.strip():
                start = int(line)
    end = start + num

    doc = []
    y_data = []
    flag = False
    with open(feature_path, encoding='UTF-8') as doc_source:
        for count, line in enumerate(doc_source):
            if count < start:
                continue
            if count >= end:
                # Batch is full and there is still data left to read.
                flag = True
                break
            if count % 200 == 0:
                print(count)
            info = line.split('\t')
            doc.append(info[1])
            y_data.append(info[0])

    vectorizer = CountVectorizer()
    a = vectorizer.fit_transform(doc).toarray()

    # Persist the offset only after vectorizing succeeded.
    with open(config_path, 'w', encoding='UTF-8') as config:
        config.write(str(end))
    return a, y_data, flag
#get_batch_train()