test_queries_imp1.py
import nltk
# nltk.download('stopwords')  # uncomment on first run to download the stopword corpus
# nltk.word_tokenize also needs the 'punkt' tokenizer models to be downloaded
from nltk.corpus import stopwords
import pickle
import numpy as np
INDEX = "./pickles/"

# Main code
# Load the data previously stored using pickle
norm_fname = 'Normalized_tdf'
with open(INDEX + norm_fname, 'rb') as norm_tfwt:            # normalized log tf weight for each term in each document
    norm = pickle.load(norm_tfwt)
idf_fname = 'term_idf'
with open(INDEX + idf_fname, 'rb') as term_idfs:             # idf weights of all terms in the vocabulary
    idf = pickle.load(term_idfs)
docs_dict_fname = 'docs_dict'
with open(INDEX + docs_dict_fname, 'rb') as doc_dictionary:  # document IDs and titles
    docs_dict = pickle.load(doc_dictionary)
vocabulary_fname = 'vocabulary'
with open(INDEX + vocabulary_fname, 'rb') as vocab:          # terms present in the corpus
    vocabulary = pickle.load(vocab)
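# Expected shapes of the loaded objects (inferred from how they are used below, not
# from the index-building code, so treat this as an assumption):
#   norm       : {term: {doc_id: normalized log-tf weight}}
#   idf        : {term: idf weight}
#   docs_dict  : {doc_id: document title}
#   vocabulary : collection of all terms in the corpus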
N = len(docs_dict) # number of documents
K = 10 # top K documents to be returned
query = ""
input_q = input("Enter query: ")
q_p = []  # list of query terms that are present in the vocabulary
# defining punctuation characters and replacing each one with a space
punctuations = '''!()-[]{};:'",<>./?@#$%^&*_~'''
for char in input_q:
    if char in punctuations:
        char = ' '
    query = query + char
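# For example, a hypothetical input such as "what is tf-idf?" becomes "what is tf idf "
# after '-' and '?' are replaced with spaces.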
q_tokens = nltk.word_tokenize(query)  # Tokenizing query words using nltk
query_tokens = []
# defining stop words and removing them from the query
stop_words = set(stopwords.words('english'))
for token in q_tokens:
    if token not in stop_words:
        query_tokens.append(token)
freq = {} # dictionary with query term as key and logarithmic tf as value
tf_idf = {} # dictionary with query term as key and tf_idf wt as value
prod = {} # dictionary with doc_id as key and product of query wt and doc wt as value
sos = 0 # Variable to store sum of squares to be used in normalization
# Checking whether each term is present in the vocabulary and storing it in a new list
for token in query_tokens:
    if token in vocabulary:
        q_p.append(token)
# Storing the term frequency for query terms present in the vocabulary, and 0 otherwise
# freq is a dict with the term as key and its frequency as value
for token in query_tokens:
    if token in q_p:
        if token in freq:
            freq[token] = freq[token] + 1
        else:
            freq[token] = 1
    else:
        freq[token] = 0
        tf_idf[token] = 0
# Converting the natural term frequency to its logarithmic equivalent --> 1 + log(tf)
# and calculating the tf-idf of the query vector and the sum of squares for cosine normalization
for q in set(q_p):
    freq[q] = 1 + np.log10(freq[q])
    tf_idf[q] = freq[q] * idf[q]
    sos = sos + tf_idf[q] ** 2
sqrt_sos = np.sqrt(sos)  # Euclidean length of the query vector, used to normalize the weights
for q in set(q_p):
    tf_idf[q] = tf_idf[q] / sqrt_sos  # final normalized tf-idf weight of each query term
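# Worked example with hypothetical numbers: for a query term with raw tf = 2 and idf = 1.5,
# the log-scaled tf is 1 + log10(2) ≈ 1.301 and the raw tf-idf weight is 1.301 * 1.5 ≈ 1.952;
# dividing every weight by sqrt(sos) then scales the query vector to unit length.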
# Calculating the relevance of each document to the query using vector space model scoring
for (d_id, d_name) in docs_dict.items():
    prod[d_id] = 0
    for q in set(q_p):  # only vocabulary terms have non-zero weights; each term contributes once
        if q in norm and d_id in norm[q]:
            prod[d_id] = prod[d_id] + tf_idf[q] * norm[q][d_id]
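# Note: prod[d_id] is the dot product of the normalized query weights and the document's
# normalized term weights, i.e. the cosine similarity between the query and document vectors.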
# Improvement: give extra weight to documents whose title contains query terms
for (d_id, d_name) in docs_dict.items():
    d_title = nltk.word_tokenize(d_name.lower())  # lowercase and tokenize the document title
    for q in set(query_tokens):  # count each distinct query term at most once
        if q in d_title:
            prod[d_id] = prod[d_id] + 0.10  # add 0.10 to the score for each query term found in the title
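# Design note: the title bonus is a fixed additive 0.10 per distinct matching query term,
# applied on top of the cosine score; it does not depend on the length of the title.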
rel_docs = sorted(prod.items(), key=lambda x: (x[1], x[0]), reverse=True)
# Retrieving the top K docs out of all the ranked documents
topK = rel_docs[:K]
i = 1
for (key, value) in topK:
    title = docs_dict[key]
    print("Document", i)
    i = i + 1
    print(key, title, value)
    print("")