import os
import json
import time
from difflib import SequenceMatcher
from urllib.parse import urlparse

import nltk
import psutil
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker

import merge
import search

# Fetch the NLTK resources needed for tokenization and stopwords
nltk.download('punkt')
nltk.download('stopwords')
develop = True  # development flag (currently unused in this module)

# Global state shared across the indexing functions
unique_words = set()          # vocabulary collected so far
doc_id = 1                    # next document ID to assign
block_id = 1                  # next partial-index block number
url_mapping = dict()          # doc_id -> URL
url_length_mapping = dict()   # doc_id -> token count of the document
# Function to get a list of files in a folder with a given file extension
def get_files_in_folder(folder_path, file_extension='json'):
    files = []
    for root, _, filenames in os.walk(folder_path):
        for filename in filenames:
            if filename.endswith(f".{file_extension}"):
                files.append(os.path.join(root, filename))
    return files
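# Example (hypothetical corpus layout): get_files_in_folder('DEV') would return
# paths such as ['DEV/site_a/0001.json', 'DEV/site_b/0042.json'].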
# Function to parse a document and extract relevant information
def parse_document(file):
    data = json.load(file)
    content = data['content']
    # Skip documents that are not plain UTF-8 or ASCII text
    if data['encoding'] != 'utf-8' and data['encoding'] != 'ascii':
        return [], '', 0
    soup = BeautifulSoup(content, "lxml")
    # Collect text from tags that should carry extra weight
    important_text = []
    important_tags = ['b', 'strong', 'h1', 'h2', 'h3', 'title']
    for tag in important_tags:
        elements = soup.find_all(tag)
        for element in elements:
            important_text.append(element.get_text().lower())
    words = soup.get_text().lower()
    # Append the important text a second time so those terms gain frequency
    for imp_text in important_text:
        words += " " + imp_text
    words_list = word_tokenize(words)
    # Stopword filtering is prepared here but not applied; every token is kept
    stopword_set = set(stopwords.words('english'))
    filtered_words_list = [w for w in words_list]
    url = data['url']
    parsed_url = urlparse(url)
    url_without_fragment = parsed_url._replace(fragment='').geturl()
    word_count = len(filtered_words_list)
    return filtered_words_list, url_without_fragment, word_count
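# Illustrative return shape (hypothetical values):
#   (['machine', 'learning', ...], 'https://example.edu/page', 1523)
# with the '#fragment' part already stripped from the URL.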
# Function to write an index block to a file
def write_block(indices):
    global block_id
    output_file = f'index-blocks/inverted_index-{block_id}.txt'
    # Sort terms alphabetically so the blocks can be merged sequentially
    sorted_indices = sorted(indices.items(), key=lambda x: x[0])
    with open(output_file, 'w', encoding='utf-8') as output:
        for pair in sorted_indices:
            try:
                output.write(f'{pair}\n')
            except Exception as e:
                print(f'An error occurred while writing: {e}')
    # The with-statement closes the file; no explicit close() is needed
    block_id += 1
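# Each block line is the repr of a (token, postings) tuple, e.g.
#   ('learn', [3, 3, 17, 42])
# merge.binary_merge is assumed to parse this exact line format back.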
# Function to check if a new URL is too similar to recently seen URLs
def is_similar_url(new_url, url_list, similarity_threshold=0.94):
    for url in url_list:
        # Flag the new URL if it is nearly identical to any URL in the list
        if SequenceMatcher(None, new_url, url).ratio() > similarity_threshold:
            return True
    return False
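# Worked example:
#   SequenceMatcher(None, 'http://a.com/page1', 'http://a.com/page2').ratio()
# is 2*17/36 ≈ 0.944 (17 matching characters out of 36 total), so consecutive
# pagination URLs sit just above the 0.94 threshold and get skipped.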
# Main function to build the inverted index from a folder of JSON documents
def build_index(folder_path):
    global url_mapping
    global url_length_mapping
    global doc_id
    global block_id
    inverted_index = dict()
    batch_limit = 2500  # documents per on-disk block; adjust as needed
    current_batch = 1
    url_list = []  # sliding window of the 10 most recently accepted URLs
    for file_path in get_files_in_folder(folder_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            tokens, url, word_count = parse_document(file)
            # Skip near-duplicate URLs (e.g. pagination) using the recent window
            if not is_similar_url(url, url_list):
                url_list.append(url)
                url_list = url_list[-10:]
            else:
                continue
            url_mapping[doc_id] = url
            url_length_mapping[doc_id] = word_count
            if doc_id % 10 == 0:
                print(doc_id, url)
            # Stem each token before adding it to the index
            ps = PorterStemmer()
            for token in tokens:
                token = ps.stem(token.lower())
                if token not in inverted_index:
                    inverted_index[token] = []
                if token not in unique_words:
                    unique_words.add(token)
                inverted_index[token].append(doc_id)
            doc_id += 1
            current_batch += 1
            # Flush the in-memory index to disk once the batch is full
            if current_batch == batch_limit:
                current_batch = 1
                write_block(inverted_index)
                inverted_index.clear()
    # Write any remaining partial index as the final block
    if inverted_index:
        write_block(inverted_index)
        inverted_index.clear()
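# This is essentially SPIMI-style blocked indexing: the in-memory index is
# flushed to disk every batch_limit documents and the partial blocks are
# merged afterwards, which keeps peak memory roughly constant regardless of
# corpus size.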
# Function to write the mapping of document IDs to URLs
def build_mapping():
    global url_mapping
    with open('document_mapping.txt', 'w', encoding='utf-8') as file:
        for doc_id, url in url_mapping.items():
            file.write(f'{doc_id},{url}\n')
# Function to write the mapping of document IDs to document lengths
def build_length_mapping():
    global url_length_mapping
    with open('document_length_mapping.txt', 'w', encoding='utf-8') as file:
        for doc_id, length in url_length_mapping.items():
            file.write(f'{doc_id},{length}\n')
# Function to read the document mapping from document_mapping.txt
def read_mapping():
    global url_mapping
    with open('document_mapping.txt', 'r', encoding='utf-8') as file:
        for line in file:
            # Split on the first comma only, since URLs may contain commas
            parts = line.strip().split(',', 1)
            url_mapping[int(parts[0])] = parts[1]
# Function to read the document length mapping from document_length_mapping.txt
def read_length_mapping():
    global url_length_mapping
    with open('document_length_mapping.txt', 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(',')
            # Store lengths as ints so they can be used directly in scoring
            url_length_mapping[int(parts[0])] = int(parts[1])
# Function to get the memory usage of the current process
def get_memory_usage():
    process = psutil.Process()
    memory_info = process.memory_info()
    return memory_info.rss  # Resident Set Size in bytes
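# Example: a return value of 250_000_000 means the process currently holds
# roughly 250 MB of physical memory (RSS counts resident pages only, not
# memory that is swapped out or merely reserved).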
# Function to replace special characters in a query string with their names
def replaceSpecialCharacters(queryString):
    characterMapping = {
        '.': ' dot ',
        ',': ' comma ',
        '?': ' question mark ',
        '!': ' exclamation mark ',
        ';': ' semicolon ',
        ':': ' colon ',
        '(': ' left parenthesis ',
        ')': ' right parenthesis ',
        '[': ' left square bracket ',
        ']': ' right square bracket ',
        '{': ' left curly brace ',
        '}': ' right curly brace ',
        '-': ' hyphen ',
        '_': ' underscore ',
        '\'': ' apostrophe ',
        '"': ' quotation mark ',
        '/': ' slash ',
        '\\': ' backslash ',
        '|': ' vertical bar ',
        '@': ' at ',
        '#': ' hashtag ',
        '$': ' dollar ',
        '%': ' percent ',
        '^': ' caret ',
        '&': ' ampersand ',
        '*': ' asterisk ',
        '+': ' plus sign ',
        '=': ' equal sign ',
        '<': ' less than ',
        '>': ' greater than ',
        '~': ' tilde ',
        '`': ' backtick '
    }
    # Replace each mapped character; leave everything else untouched
    res = [characterMapping.get(c, c) for c in queryString]
    return ''.join(res)
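# Example: replaceSpecialCharacters('C++') returns 'C plus sign  plus sign ',
# so operator-heavy queries still tokenize into searchable words.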
# Function to print ranked search results
def printResults(resultDict):
    # resultDict is assumed to map rank -> (url, score), already ordered by rank
    for rank in resultDict.keys():
        url, _score = resultDict[rank]
        print(f'Rank {rank}: {url}')
if __name__ == '__main__':
    starttime = time.time()
    # Set the path for storing index blocks
    index_blocks_path = './index-blocks'
    if not os.path.exists(index_blocks_path):
        os.makedirs(index_blocks_path)
    # Folder containing the downloaded JSON corpus
    folder_path = 'DEV'
    # Build and merge the inverted index if the merged output doesn't exist yet
    if not os.path.exists("./merged_output.txt"):
        build_index(folder_path)
        # Get the list of index block files
        index_files = get_files_in_folder("index-blocks", "txt")
        # Perform a binary merge on the index block files
        merge.binary_merge(index_files)
    print("Inverted index found or built")
    if not os.path.exists("./document_mapping.txt"):
        # If the document mapping doesn't exist, write it
        build_mapping()
    else:
        # Otherwise read it
        read_mapping()
    if not os.path.exists("./document_length_mapping.txt"):
        # If the document length mapping doesn't exist, write it
        build_length_mapping()
    else:
        # Otherwise read it
        read_length_mapping()
    print("Read inverted index")
    script_dir = os.path.dirname(os.path.realpath(__file__))
    merged_output_path = os.path.join(script_dir, 'merged_output.txt')
    # Load the inverted index positions from the merged output
    inverted_index = search.read_inverted_index_position(merged_output_path)
    # Report startup time
    endtime = time.time()
    runtime = endtime - starttime
    print(f'Startup time: {runtime}')
    while True:
        user_query = input("Enter your search query: ")
        user_query = replaceSpecialCharacters(user_query)
        user_query_words = word_tokenize(user_query)
        if len(user_query_words) == 0:
            print('Search query must not be empty')
            continue
        starttime = time.time()
        result_documents, avg_score_original = search.search(
            user_query_words, inverted_index, url_mapping,
            url_length_mapping, merged_output_path)
        endtime = time.time()
        runtime = endtime - starttime
        print(f'Search time: {runtime}')
        printResults(result_documents)
        # Offer a spell-corrected query as a suggestion
        corrected_start_time = time.time()
        spell = SpellChecker()
        # Only correct words longer than 3 characters; short words are left as-is
        corrected_query_words = list({spell.correction(word.lower()) if len(word) > 3
                                      else word.lower() for word in user_query_words})
        corrected_query_words = [value for value in corrected_query_words if value is not None]
        if len(corrected_query_words):
            result_documents_corrected, avg_score_corrected = search.search(
                corrected_query_words, inverted_index, url_mapping,
                url_length_mapping, merged_output_path)
            # Show the suggestion if the original query found nothing or the
            # corrected query scores better on average
            if len(result_documents.keys()) == 0 or avg_score_corrected > avg_score_original:
                print(f'Did you mean {corrected_query_words}?')
                printResults(result_documents_corrected)
        print(f'Suggestion time: {time.time() - corrected_start_time}')
        memory_used = get_memory_usage()
        print(f"Memory used: {memory_used} bytes")