forked from zxzlogic/ir-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
buildIndex_extract.py
97 lines (83 loc) · 2.36 KB
/
buildIndex_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import sys
import time
from bs4 import BeautifulSoup
import nltk
import leveldb
# Hard-coded corpus locations used by the rest of this script.
# Paths are assembled with os.path.join: root_dir already ends with '/', so
# concatenating '/tokenize/' onto it produced doubled separators
# ('.../result//tokenize/').
root_dir = '/home/zxz/Documents/ir/Robust2004/result/'
# The empty last component keeps a trailing separator on the directory names,
# so plain string concatenation onto data_dir / index_dir still works.
data_dir = os.path.join(root_dir, 'tokenize', '')    # walked for input files
index_dir = os.path.join(root_dir, 'index', '')      # receives the output files
stopword_path = '/home/zxz/Documents/ir/Robust2004/stop-word-list'
extract_path = os.path.join(index_dir, 'doc.extract')  # per-document postings
vocab_path = os.path.join(index_dir, 'vocab')          # term -> term id table
docname_path = os.path.join(index_dir, 'docname')      # docno -> doc id table
# Shared Porter stemmer instance; the stop-word loading step below runs every
# stop word through stemmer.stem().
stemmer = nltk.stem.porter.PorterStemmer()
### Collect the path of every file found anywhere under data_dir.
list_dir = os.walk(data_dir)
list_file = [
    os.path.join(parent, name)
    for parent, _subdirs, names in list_dir
    for name in names
]
sys.stdout.write('There are #%d files to be processed.\n' % len(list_file))
### Load the stop-word list: one entry per line, whitespace-stripped, then stemmed.
# The `with` block closes the file as soon as it has been read and also when
# reading fails; previously the handle stayed open during stemming and was
# leaked if an error occurred before the explicit close().
# Blank lines are kept as empty-string entries, exactly as before.
with open(stopword_path, 'r') as fo:
    stop_word_list = [line.strip() for line in fo]
# NOTE(review): the stop words are stemmed here but corpus tokens are compared
# to them unmodified later on -- presumably the tokenized corpus is already
# stemmed; confirm against the tokenizer stage.
stop_word_list = [stemmer.stem(word) for word in stop_word_list]
### Build the positional extract for every document of every corpus file.
# Each corpus file is parsed with BeautifulSoup ('lxml'); for every <doc>
# element one line per distinct term is written to extract_path:
#     doc_id <TAB> term_id <TAB> number of occurrences <TAB> pos,pos,...
# Term ids and doc ids are assigned on first sight, counting up from 0.
count = 0                 # number of corpus files visited so far
start_time = time.time()
vocab_ids = {}            # term string  -> term id
vocab_strs = {}           # term id      -> term string
doc_ids = {}              # docno string -> doc id
doc_names = {}            # doc id       -> docno string
vocab_count = 0
doc_count = 0

# Membership is tested once per token, so use a hash set instead of scanning
# the stop-word list each time.
stop_words = frozenset(stop_word_list)

# `with` guarantees the output file is flushed and closed even if a corpus
# file fails to parse half-way through.
with open(extract_path, 'wt') as fout:
    for file_name in list_file:
        count += 1
        # Single-argument print(...) prints identically on Python 2 and 3.
        print(file_name)
        # The soup is fully built inside the constructor, so the input file
        # can be closed before the documents are processed.
        with open(file_name, 'r') as fo:
            soup = BeautifulSoup(fo, 'lxml')
        for doc_i in soup.find_all('doc'):
            doc_name = doc_i.docno.text.strip()
            doc_id = doc_count
            doc_ids[doc_name] = doc_id
            doc_names[doc_id] = doc_name
            doc_count += 1

            doc_index = {}    # term id -> list of positions within this doc
            pos_i = 0         # position offset of the current line's first token
            # The document text is split into lines, each line into
            # tab-separated tokens.
            for sent in doc_i.doctext.text.split('\n'):
                word_list = sent.split('\t')
                for i, term_i in enumerate(word_list):
                    # Skip stop words, and also the empty strings produced by
                    # blank lines / stray tabs, which used to be indexed as a
                    # term of their own.
                    if not term_i or term_i in stop_words:
                        continue
                    if term_i not in vocab_ids:
                        vocab_ids[term_i] = vocab_count
                        vocab_strs[vocab_count] = term_i
                        vocab_count += 1
                    term_id = vocab_ids[term_i]
                    doc_index.setdefault(term_id, []).append(i + pos_i)
                # Skipped tokens still occupy a position slot, so the
                # positions of all indexed terms are unchanged.
                pos_i += len(word_list)

            for (k, v) in doc_index.items():
                fout.write('%d\t%d\t%d\t%s\n'
                           % (doc_id, k, len(v), ','.join([str(p) for p in v])))
        end_time = time.time()
        print('time: %f sec, Processed %d docs' % (end_time - start_time, doc_count))
def _write_mapping(path, mapping):
    """Write every (name, id) pair of *mapping* to *path*, one per line.

    Each line has the form ``name<TAB>id``. Pairs are written in the
    dictionary's own iteration order. The file is opened in text-write mode
    (truncating any existing file) and is closed even if a write fails.
    """
    with open(path, 'wt') as out:
        for name, ident in mapping.items():
            out.write('%s\t%d\n' % (name, ident))


### Persist the term -> term id and docno -> doc id tables.
_write_mapping(vocab_path, vocab_ids)
_write_mapping(docname_path, doc_ids)