-
Notifications
You must be signed in to change notification settings - Fork 3
/
query.py
156 lines (134 loc) · 4.76 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
__author__ = 'raccoon'
import sys
import os
import math
from warc.parser import Parser
from indexing.index import Index
def usage():
    """Print the command-line help to stdout and terminate the process.

    Describes the ``-w``, ``-r``, ``-q`` and ``-h`` options, shows one example
    invocation, and then exits with status 0.

    Raises:
        SystemExit: always, with exit code 0.
    """
    print("Usage:")
    print("\tpython3", sys.argv[0], "[-r number] -w warc_file_name -q free_query_text")
    print("\tpython3", sys.argv[0], "-h")
    print("\r\nParameter:")
    # "-w" is the option that names the WARC file; "-r" is the result limit.
    print("\t", "-w\t", "WARC file, can auto detect index file is exist or not.")
    print("\t", "-r\t", "control that how many document may display. default is 10")
    print("\t", "-q\t", "free text query term")
    print("\t", "-h\t", "show this helper")
    print("\r\nExample:")
    print("python3", sys.argv[0], "-w 00.warc", "-q hong kong")
    # sys.exit is always available, unlike the ``exit`` helper that the site
    # module only injects for interactive use.
    sys.exit(0)
def _option_value(flag):
    """Return the command-line token that follows *flag* in ``sys.argv``.

    Shows the usage text (which exits) when *flag* is the last token, instead
    of failing with an IndexError.
    """
    position = sys.argv.index(flag) + 1
    if position >= len(sys.argv):
        usage()
    return sys.argv[position]


def _parse_arguments():
    """Read ``-w``, ``-r`` and ``-q`` from ``sys.argv``.

    Returns:
        A ``(file_name, return_count, query_terms)`` tuple. ``return_count``
        defaults to 10; ``query_terms`` is every token after ``-q``.

    Any missing or malformed option shows the usage text, which exits.
    """
    if len(sys.argv) < 3 or "-w" not in sys.argv or "-q" not in sys.argv:
        usage()
    file_name = _option_value("-w")
    return_count = 10
    if "-r" in sys.argv:
        try:
            return_count = int(_option_value("-r"))
        except ValueError:
            # A non-numeric limit is a usage error, not a traceback.
            usage()
    # Everything after -q is taken as query text, so -q must be the last option.
    query_terms = sys.argv[sys.argv.index("-q") + 1:]
    return file_name, return_count, query_terms


def _count_documents(file_name):
    """Return how many records ``Parser`` yields for *file_name* (collection size N)."""
    parser = Parser(file_name)
    total = 0
    # fetch() returns None once there are no more records.
    while parser.fetch() is not None:
        total += 1
    return total


def _load_dictionary(dict_path):
    """Load the ``term -> offset`` map from a file of ``term, offset`` lines."""
    offsets = {}
    with open(dict_path) as handle:
        for line in handle:
            if not line.strip():
                continue
            # Split on the last separator so a term containing ", " still parses.
            key, offset = line.rsplit(', ', 1)
            offsets[key] = int(offset)
    return offsets


def _log_weight(tf):
    """Return the log-scaled term frequency ``1 + log10(tf)``, or 0 when tf is 0."""
    return 1 + math.log10(tf) if tf > 0 else 0


def _score_documents(query_terms, term_index, query_table, doc_ids):
    """Return ``{doc id string: cosine score}`` for every candidate document.

    Both vectors are restricted to the query terms. Document ids are visited
    in ascending numeric order so that equal scores rank deterministically.
    """
    query_len = math.sqrt(sum(query_table[term]["w"] ** 2 for term in query_terms))
    scores = {}
    for doc_id in sorted(doc_ids):
        doc = str(doc_id)
        dot_product = 0
        doc_len_squared = 0
        for term in query_terms:
            postings = term_index[term]
            tf = len(postings[doc]) if doc in postings else 0
            # tf-idf weight: scale by idf, not by log10(df), so rare terms count
            # more and a term with df == 1 does not collapse to weight 0.
            weight = _log_weight(tf) * query_table[term]["idf"]
            dot_product += weight * query_table[term]["w"]
            doc_len_squared += weight * weight
        denominator = math.sqrt(doc_len_squared) * query_len
        # Every weight is 0 when each query term occurs in all documents;
        # score such documents 0 instead of dividing by zero.
        scores[doc] = dot_product / denominator if denominator > 0 else 0.0
    return scores


def query():
    """Run a free-text query against an indexed WARC file and print a ranking.

    Options are read from ``sys.argv`` (``-w`` WARC file, ``-q`` query terms,
    optional ``-r`` result limit, default 10). The index files are expected
    next to the WARC file as ``<file>_index.idx`` and ``<file>_index.dict``.
    Documents containing at least one query term are scored by cosine
    similarity over tf-idf weights, and the best ``-r`` document numbers are
    printed to stdout with their scores.

    Raises:
        SystemExit: with code 1 when either index file is missing (message on
            stderr); via ``usage()`` when the arguments are incomplete.
    """
    file_name, return_count, query_string = _parse_arguments()

    # set idx file and dict file path
    idx_file = file_name + "_index.idx"
    dict_file = file_name + "_index.dict"

    # Both generated files are needed: the dictionary gives the offsets that
    # are used to read postings out of the inverted index.
    if not os.path.isfile(idx_file) or not os.path.isfile(dict_file):
        print("Error: index dictionary file (_index.dict) or inverted index file (_index.idx) not found.",
              file=sys.stderr)
        sys.exit(1)

    total_docs = _count_documents(file_name)
    dicts = _load_dictionary(dict_file)

    term_index = {}   # term -> postings keyed by document id
    query_table = {}  # term -> tf / df / idf / w of the query vector
    docs_set = set()  # numeric ids of documents containing any query term
    for term in query_string:
        postings = {}
        if term in dicts:
            postings = Index.read_index_by_offset(idx_file, dicts[term]).index[term]
        docs_set.update(int(doc) for doc in postings)
        df = len(postings)
        # Unknown terms and an empty collection get idf 0 rather than a math error.
        idf = math.log10(total_docs / df) if df > 0 and total_docs > 0 else 0
        term_index[term] = postings
        # Each query term is given tf = 1, so its weight reduces to idf.
        query_table[term] = {"tf": 1, "df": df, "idf": idf, "w": _log_weight(1) * idf}

    docs_score = _score_documents(query_string, term_index, query_table, docs_set)

    print("Query terms:", query_string)
    print("Top", return_count, "results:")
    print("doc#\tscore")
    ranked = sorted(docs_score, key=docs_score.get, reverse=True)
    # A zero or negative limit prints no rows.
    for doc in ranked[:max(return_count, 0)]:
        print("%d\t%.3f" % (int(doc), docs_score[doc]))
if __name__ == "__main__":
    # A query runs only when -q is supplied and help was not requested;
    # every other invocation prints the usage text.
    wants_query = "-q" in sys.argv and "-h" not in sys.argv
    if wants_query:
        query()
    else:
        usage()