import sys
import index
import pickle
import time
import math
from index import Posting, tokenize, parseStr
from nltk.stem import PorterStemmer
# The stopword list, pre-processed with the same tokenizer and Porter stemmer applied to queries
stopWords = ['a', 'about', 'abov', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'ani', 'are', 'aren', 't', 'as', 'at', 'be', 'becaus', 'been', 'befor', 'be', 'below', 'between', 'both', 'but', 'by', 'can', 't', 'cannot', 'could', 'couldn', 't', 'did', 'didn', 't', 'do', 'doe', 'doesn', 't', 'do', 'don', 't', 'down', 'dure', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', 't', 'ha', 'hasn', 't', 'have', 'haven', 't', 'have', 'he', 'he', 'd', 'he', 'll', 'he', 's', 'her', 'here', 'here', 's', 'her', 'herself', 'him', 'himself', 'hi', 'how', 'how', 's', 'i', 'i', 'd', 'i', 'll', 'i', 'm', 'i', 've', 'if', 'in', 'into', 'is', 'isn', 't', 'it', 'it', 's', 'it', 'itself', 'let', 's', 'me', 'more', 'most', 'mustn', 't', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'onc', 'onli', 'or', 'other', 'ought', 'our', 'our', 'ourselv', 'out', 'over', 'own', 'same', 'shan', 't', 'she', 'she', 'd', 'she', 'll', 'she', 's', 'should', 'shouldn', 't', 'so', 'some', 'such', 'than', 'that', 'that', 's', 'the', 'their', 'their', 'them', 'themselv', 'then', 'there', 'there', 's', 'these', 'they', 'they', 'd', 'they', 'll', 'they', 're', 'they', 've', 'thi', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'veri', 'wa', 'wasn', 't', 'we', 'we', 'd', 'we', 'll', 'we', 're', 'we', 've', 'were', 'weren', 't', 'what', 'what', 's', 'when', 'when', 's', 'where', 'where', 's', 'which', 'while', 'who', 'who', 's', 'whom', 'whi', 'whi', 's', 'with', 'won', 't', 'would', 'wouldn', 't', 'you', 'you', 'd', 'you', 'll', 'you', 're', 'you', 've', 'your', 'your', 'yourself', 'yourselv']
# Extracts the postings from the on-disk index for each query token
def extractFromIndex(tokens, indOfInd, indexName) -> dict:
    results = {}
    with open(indexName) as file:
        # Look up each query token, seeking straight to its line via the index of indexes
        for x in tokens:
            # Default to [] so a token missing from the index yields an empty postings list
            if x not in results:
                results[x] = []
            if x in indOfInd:
                file.seek(indOfInd[x])
                line = file.readline()
                term, posts = parseStr(line)
                results[x] = posts
            # Older pure-boolean intersection approach, kept for reference:
            # if results == []:
            #     # If results are empty, it's equal to the first set of docs it sees
            #     results = [post.getDoc() for post in index[x]]
            # else:
            #     # Else results is the intersection
            #     results = findIntersection(results, [post.getDoc() for post in index[x]])
    return results
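# Example of the return shape (hypothetical stemmed tokens and postings):
#   extractFromIndex(['machin', 'learn'], indOfInd, 'FinalIndex')
#   -> {'machin': [Posting(...), Posting(...)], 'learn': []}
# A token absent from the index maps to [] instead of being dropped, so the
# caller can tell which query terms matched nothing.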
# Returns a list of the docIDs common to two sorted lists of IDs
def findIntersection(list1, list2) -> list:
    common = []
    ind1 = 0
    ind2 = 0
    len1 = len(list1)
    len2 = len(list2)
    if len1 == 0 or len2 == 0:
        return common
    # Standard two-pointer merge intersection over the sorted lists
    while ind1 < len1 and ind2 < len2:
        if list1[ind1] < list2[ind2]:
            ind1 += 1
        elif list1[ind1] > list2[ind2]:
            ind2 += 1
        else:
            common.append(list1[ind1])
            ind1 += 1
            ind2 += 1
    return common
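# Worked example (hypothetical sorted doc IDs):
#   findIntersection([1, 3, 5, 9], [3, 5, 8]) -> [3, 5]
# The two-pointer walk assumes both lists are sorted ascending and runs in
# O(len1 + len2) time.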
# Returns the relevance of a document given the query weights, the document's
# tf-idf values, the norm (square root of the sum of squares) of the query
# weights, and a promoter value from important-text hits
def relevance(weights, docTfidfs, qSq, pVal) -> float:
    score = 0
    # Tracks the norm of the document's tf-idf vector
    sqrtSquareTfidf = 0
    # prod tracks the dot product between query weights and document tf-idfs
    prod = 0
    for x in range(len(docTfidfs)):
        # Sum the raw tf-idf contribution to the score
        score += docTfidfs[x]
        # Accumulate the sum of squares and the dot product
        sqrtSquareTfidf += docTfidfs[x] * docTfidfs[x]
        prod += weights[x] * docTfidfs[x]
    # Square root the sum of squares to get the document norm
    sqrtSquareTfidf = math.sqrt(sqrtSquareTfidf)
    # Blend the summed tf-idf (boosted when important text was hit) with the
    # cosine similarity between the query and document vectors
    if pVal > 0:
        score = 0.685 * (score + (0.35 * math.log(pVal))) + (1 - 0.685) * (prod / (sqrtSquareTfidf * qSq))
    else:
        score = 0.685 * score + (1 - 0.685) * (prod / (sqrtSquareTfidf * qSq))
    return score
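# In vector-space terms, prod / (sqrtSquareTfidf * qSq) above is the cosine
# similarity between the query and document vectors, so the final score is
#   0.685 * (summed tf-idf, plus 0.35 * ln(pVal) when important text was hit)
#   + 0.315 * cosine similarity.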
# Returns document IDs sorted by descending relevance
def resultsByRelevance(weights, results) -> list:
    scores = {}
    termCnt = 0
    # Compute the norm of the query weight vector once here and pass it down,
    # so it is not recalculated for every document
    sqrtSquareSumWeights = 0
    for x in weights:
        sqrtSquareSumWeights += x * x
    sqrtSquareSumWeights = math.sqrt(sqrtSquareSumWeights)
    # Number of query terms, used to size each document's tf-idf vector
    totTerms = len(weights)
    # Build each document's tf-idf vector; each term's tf-idf goes at that
    # term's index, and terms are guaranteed to arrive in the same order as weights
    for term, posts in results.items():
        for post in posts:
            docId = post.getDoc()
            if docId not in scores:
                # [tf-idf vector padded with zeroes, important-text promoter value]
                scores[docId] = [[0] * totTerms, 0]
            scores[docId][0][termCnt] = post.getTfidf()
            scores[docId][1] += post.getImpTxt() * post.getTfidf()
        termCnt += 1
    return sorted(scores.keys(), key=(lambda x: -relevance(weights, scores[x][0], sqrtSquareSumWeights, scores[x][1])))
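# Example (hypothetical values): for a two-term query with weights [2, 1], a
# document whose postings carry tf-idfs 0.9 and 0.4 gets the vector [0.9, 0.4]
# and is ranked by relevance([2, 1], [0.9, 0.4], sqrt(5), promoter).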
# Given a list of tokens, returns a dict mapping each query term to its weight,
# currently its frequency in the query
def makeQueryWeights(tokens):
    # Count stopwords to decide whether it is safe to drop them later
    stopCount = 0
    weights = {}
    # Count each token: initialize to 1 on first sight, otherwise increment
    for t in tokens:
        if t in stopWords:
            stopCount += 1
        if t not in weights:
            weights[t] = 1
        else:
            weights[t] += 1
    # Only strip stopwords when they make up at most half the query, so that
    # stopword-heavy queries are not emptied out
    if len(tokens) > 0 and (stopCount / len(tokens)) <= 0.5:
        for k in list(weights.keys()):
            if k in stopWords:
                del weights[k]
    return weights
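# Example: makeQueryWeights(['the', 'big', 'big', 'dog']) counts one stopword
# out of four tokens (0.25 <= 0.5), so 'the' is dropped and {'big': 2, 'dog': 1}
# is returned; a query that is mostly stopwords, e.g. ['to', 'be', 'or'], keeps
# all of its terms.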
if __name__ == "__main__":
    # Decide which index of indexes and index to use, based on whether a
    # champion-list suffix argument is given
    indexOfIndexName = "indexOfIndexes"
    indexName = "FinalIndex"
    # Append the optional suffix argument to both file names
    if len(sys.argv) == 2:
        indexOfIndexName = indexOfIndexName + sys.argv[1]
        indexName = indexName + sys.argv[1]
    # Open and load the index of indexes
    with open(indexOfIndexName, 'rb') as file:
        indOfInd = pickle.load(file)
    # Open and load the docID -> URL map
    with open("pickleDocMap", "rb") as file:
        docMap = pickle.load(file)
    # Notify when setup finishes
    print('Done loading')
    print()
    # Keep prompting for queries and showing results until the user quits
    while True:
        query = input("Enter query (or type 'exit' to quit prompt): ").strip().lower()
        # End the loop when the user quits
        if query == 'exit':
            break
        # Record when query processing starts
        start_time = time.time()
        # Tokenize and stem the query
        ps = PorterStemmer()
        tokens = [ps.stem(x) for x in tokenize(query)]
        # Calculate the weight of each token in the query
        weights = makeQueryWeights(tokens)
        # Pull the matching postings from the index
        results = extractFromIndex(weights.keys(), indOfInd, indexName)
        # Show results
        relevantResults = None
        curPage = 1
        if any(x != [] for x in results.values()):
            print()
            # Get sorted results and print the top 10
            relevantResults = resultsByRelevance(list(weights.values()), results)
            print(f"Total documents matching query '{query}': {len(relevantResults)}")
            print("Currently on page 1")
            # Slicing is safe even when fewer than 10 results exist
            sortedResults = relevantResults[:10]
            for x in sortedResults:
                print(f'Docid: {x}\nURL: {docMap[x]}')
        else:
            print(f"No documents found for query '{query}'")
        print("Time passed since queried: %s seconds" % (time.time() - start_time))
        print()
        # Paginate through the results until the user asks for a new search
        if any(x != [] for x in results.values()):
            while True:
                action = input("To get next and previous pages enter 'next' or 'prev', to enter new query enter 'search': ").strip().lower()
                if action == 'search':
                    print()
                    break
                elif action == 'prev' and curPage > 1:
                    curPage -= 1
                    print("Currently on page", curPage)
                    sortedResults = relevantResults[(curPage - 1) * 10:curPage * 10]
                    for x in sortedResults:
                        print(f'Docid: {x}\nURL: {docMap[x]}')
                    print()
                elif action == 'next' and curPage * 10 < len(relevantResults):
                    curPage += 1
                    print("Currently on page", curPage)
                    sortedResults = relevantResults[(curPage - 1) * 10:curPage * 10]
                    for x in sortedResults:
                        print(f'Docid: {x}\nURL: {docMap[x]}')
                    print()
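# Usage sketch (assumes the index files produced by the companion index module
# exist in the working directory):
#   python search.py          # loads indexOfIndexes and FinalIndex
#   python search.py Champ    # 'Champ' is a hypothetical suffix selecting an
#                             # alternate (e.g. champion-list) index pair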