-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathReadBulletScreen.py
83 lines (68 loc) · 2.73 KB
/
ReadBulletScreen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# -*- coding: utf-8 -*-
import re
import jieba
import jieba.posseg as pseg
import uniout
import copy
try:
import cPickle as pickle
except ImportError:
import pickle
class BulletScreen(object):
    """Parse a bullet-screen comment file into tokenized, time-sorted records.

    Each input line of the form ``<d p="...">text</d>`` is segmented with
    jieba's part-of-speech tagger, filtered against ``stop_words`` and a fixed
    set of part-of-speech flags, and kept only if more than three tokens
    survive.
    """

    # One comment element per line: group 1 is the raw ``p`` attribute,
    # group 2 is the comment text. Compiled once instead of once per line.
    _COMMENT_PATTERN = re.compile("^<d p=\"(.+)\">(.+)</d>")

    # Tokens tagged by ``pseg.cut`` with any of these flags are discarded.
    _IGNORED_FLAGS = frozenset([
        "m", "w", "g", "c", "o", "p", "z", "q", "un", "e", "r", "x", "d",
        "t", "h", "k", "y", "u", "s", "uj", "ul", "eng",
    ])

    def __init__(self):
        """Create the parser with a small built-in stop-word set."""
        # The non-ASCII entries are explicit unicode literals: on Python 2 a
        # plain "..." literal is a byte string, which never compares equal to
        # a unicode token, so those stop words would silently never match.
        # NOTE(review): assumes pseg.cut yields unicode tokens on Python 2 --
        # confirm against the installed jieba version.
        self.stop_words = {
            " ", "the", "of", "is", "and", "to", "in", "that", "we", "for",
            "an", "are", "by", "be", "as", "on", "with", "can", "if", "from",
            "which", "you", "it", "this", "then", "at", "have", "all", "not",
            "one", "has", "or", u"什么", u"姐姐", u"一个",
        }

    def load_stop_words(self, file="data/metadata/stopWords.txt"):
        """Add the stop words listed in a UTF-8 text file to ``stop_words``.

        Args:
            file: Path of a text file with one stop word per line. Each line
                is stripped of surrounding whitespace before being added.

        Raises:
            IOError/OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # Binary mode + explicit decode behaves the same on Python 2 and 3,
        # and the ``with`` block guarantees the handle is closed.
        with open(file, "rb") as f:
            content = f.read().decode('utf-8')
        self.stop_words.update(w.strip() for w in content.split('\n'))

    def read(self, file_name, timelength):
        """Tokenize the comments in ``file_name`` and persist the result.

        Args:
            file_name: Path of the comment file to parse.
            timelength: Value passed through unchanged to ``store`` and to
                the return tuple (presumably the video length -- confirm
                with callers).

        Returns:
            A tuple ``(lines, timelength, vocabulary)`` where ``lines`` is a
            list of dicts with keys ``"time"`` (first comma-separated field
            of the ``p`` attribute, truncated to ``int``), ``"text"`` (list
            of kept tokens) and ``"lineno"`` (1-based line number in the
            file), sorted by ``"time"``; and ``vocabulary`` maps every kept
            token to ``0``.

        Raises:
            ValueError: If the first field of a matched ``p`` attribute is
                not a number.
        """
        jieba.load_userdict("data/metadata/user_dict.txt")
        comments = []
        vocabulary = {}
        with open(file_name, "r") as f:
            for line_no, line in enumerate(f, 1):
                m = self._COMMENT_PATTERN.match(line)
                if not m:
                    continue
                # Parsed before tokenizing so that a malformed timestamp is
                # reported even for comments that would be dropped as short.
                timestamp = int(float(m.group(1).split(',')[0]))
                words = [
                    word for word, flag in pseg.cut(m.group(2))
                    if word not in self.stop_words
                    and flag not in self._IGNORED_FLAGS
                ]
                # Comments with three or fewer kept tokens are ignored.
                if len(words) <= 3:
                    continue
                comments.append(
                    {"time": timestamp, "text": words, "lineno": line_no}
                )
                for word in words:
                    vocabulary.setdefault(word, 0)
        # sorted() is stable, so comments with equal time keep file order.
        lines = sorted(comments, key=lambda entry: entry["time"])
        self.store(lines, timelength)
        return lines, timelength, vocabulary

    def store(self, lines, timelength):
        """Pickle ``lines`` and ``timelength`` to ``data/var/lines``.

        Args:
            lines: Parsed comment records as produced by ``read``.
            timelength: Value stored alongside the records.

        Raises:
            IOError/OSError: If the output file cannot be opened for writing.
        """
        # ``with`` closes the file even if pickling raises.
        with open("data/var/lines", "wb") as fw:
            pickle.dump({"lines": lines, "timelength": timelength}, fw)

    def run(self, file_name, timelength):
        """Load the default stop-word file, then parse ``file_name``.

        Args:
            file_name: Path of the comment file to parse.
            timelength: Passed through to ``read``.

        Returns:
            The ``(lines, timelength, vocabulary)`` tuple from ``read``.
        """
        self.load_stop_words()
        return self.read(file_name, timelength)
if __name__ == "__main__":
    # Sample run: parse data/1.txt with a hard-coded time length and print
    # the (lines, timelength, vocabulary) tuple returned by run().
    filename = "data/1.txt"
    timelength = 2582
    # Parenthesised single-argument print: same output as the Python 2
    # print statement, and also valid syntax on Python 3.
    print(BulletScreen().run(filename, timelength))