forked from criskolonas/ir-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
query_processor.py
32 lines (27 loc) · 1.29 KB
/
query_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import math
import numpy as np
class QuerryProcessor:
    """Score indexed documents against a query using cosine similarity.

    The inverted index is a mapping ``term -> sequence of postings`` where
    every posting exposes a ``score`` and a ``doc`` attribute.  For each term,
    positions ``0 .. d_n - 2`` are read as document postings and position
    ``d_n - 1`` is read as the query's posting.
    NOTE(review): ``d_n`` presumably means "number of documents including the
    query" -- confirm against the code that builds the index.
    """

    def __init__(self, indexer, d_n):
        """Store the inverted index (``indexer``) and the posting count ``d_n``."""
        self.d_n = d_n
        self.inverted_index = indexer

    def calculate_cosine_similarity(self, vector1, vector2):
        """Return the cosine similarity of two equally ordered numeric vectors.

        Returns ``0`` when either vector has zero magnitude, so that an
        all-zero vector does not cause a division by zero.
        """
        dot_product = sum(p * q for p, q in zip(vector1, vector2))
        magnitude = (
            math.sqrt(sum(val ** 2 for val in vector1))
            * math.sqrt(sum(val ** 2 for val in vector2))
        )
        if not magnitude:
            return 0
        return dot_product / magnitude

    def compare_documents(self):
        """Compute the cosine similarity between the query and every document.

        Returns:
            A list of ``[doc, similarity]`` pairs, one per document, in
            document order (index ``0`` first).  The list is NOT sorted by
            similarity; callers that want the top-k must sort it themselves.
            An empty list is returned when the inverted index has no terms,
            because no document can be identified in that case.
        """
        postings_by_term = list(self.inverted_index.values())
        if not postings_by_term:
            # No terms: there is no posting to read a document id from.
            return []

        query_slot = self.d_n - 1
        query_vector = np.array(
            [postings[query_slot].score for postings in postings_by_term]
        )

        # The document id is read from the last term's postings, exactly as
        # the original loop variable ended up doing.
        id_postings = postings_by_term[-1]

        scored_docs = []
        for i in range(query_slot):
            doc_vector = np.array(
                [postings[i].score for postings in postings_by_term]
            )
            # Appending keeps entry i at position i; indexing with i - 1
            # would wrap to the last slot for i == 0 and rotate the list.
            scored_docs.append([
                id_postings[i].doc,
                self.calculate_cosine_similarity(query_vector, doc_vector),
            ])
        return scored_docs