-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
170 lines (127 loc) · 5.89 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# app.py
from flask import Flask, render_template, request
import search
from main import *
from openai import OpenAI
import requests
from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize
app = Flask(__name__)

# Directory that receives the partial index blocks, and the corpus folder
# handed to build_index().
index_blocks_path = './index-blocks'
folder_path = 'DEV'

# SECURITY: the API key must never live in source control. OpenAI() reads the
# key from the OPENAI_API_KEY environment variable when no api_key argument is
# given. The key that was previously committed here is exposed and must be
# revoked in the OpenAI dashboard.
client = OpenAI()
def fetch_text_from_url(url, timeout=10):
    """Download a page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The paragraph texts joined by single spaces, or an empty string when
        the page cannot be fetched.
    """
    try:
        # Without a timeout one unresponsive host would block the calling
        # request indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best effort: an unreachable or malformed URL yields no text instead
        # of an unhandled exception.
        return ''
    soup = BeautifulSoup(response.text, 'html.parser')
    # Only <p> elements are collected; the selector may need adjusting for
    # pages that keep their main content in other tags.
    return ' '.join(p.get_text() for p in soup.find_all('p'))
def build_index_if_needed():
    """Create any missing index artifacts, then load the inverted index.

    Returns:
        A ``(merged_output_path, inverted_index)`` tuple: the path of
        ``merged_output.txt`` next to this script, and whatever
        ``search.read_inverted_index_position`` returns for that path.
    """
    # Make sure the folder for the index blocks is present.
    if not os.path.exists(index_blocks_path):
        os.makedirs(index_blocks_path)

    # No merged output yet: build the blocks, then binary-merge them.
    if not os.path.exists("./merged_output.txt"):
        build_index(folder_path)
        block_files = get_files_in_folder("index-blocks", "txt")
        merge.binary_merge(block_files)
    print("Inverted index found or built")

    # Document mapping: read it when the file is there, otherwise write it.
    if os.path.exists("./document_mapping.txt"):
        read_mapping()
    else:
        build_mapping()

    # Document length mapping: same read-or-write choice.
    if os.path.exists("./document_length_mapping.txt"):
        read_length_mapping()
    else:
        build_length_mapping()

    print("Read inverted index")
    here = os.path.dirname(os.path.realpath(__file__))
    merged_path = os.path.join(here, 'merged_output.txt')
    loaded_index = search.read_inverted_index_position(merged_path)
    return merged_path, loaded_index
def summarize_text(texts):
    """Summarize one or more texts with the OpenAI chat completions API.

    Args:
        texts: A single string, or an iterable of strings, to summarize.
            Empty and whitespace-only entries are ignored.

    Returns:
        The message content of the first completion choice, or an empty
        string when there is nothing to summarize or the API returns no
        content.
    """
    if not texts:
        return ""
    if isinstance(texts, str):
        # Iterating a bare string would treat every character as a text.
        texts = [texts]
    body = "\n".join(text for text in texts if text and text.strip())
    if not body:
        return ""
    # The instruction must precede the content. Using it as the join
    # separator dropped it entirely for a single-element list.
    prompt = "summarize\n" + body
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,  # upper bound on the generated summary; adjust as needed
    )
    return response.choices[0].message.content or ""
# Runs at import time: makes sure the index files exist and loads the inverted
# index once; both values are reused by the /search handler.
merged_output_path, inverted_index = build_index_if_needed()
@app.route('/')
def index():
    """Serve the landing page rendered from ``index.html``."""
    landing_page = render_template('index.html')
    return landing_page
def _ranked_urls(scored_documents):
    """Return the URLs of a search result mapping, ordered by its keys."""
    # Each value is unpacked as a (url, second element) pair; only the URL is
    # kept.
    return [url for _key, (url, _unused) in sorted(scored_documents.items())]


def _spell_correct(words):
    """Return the words lower-cased, spell-correcting those longer than 3 characters."""
    checker = SpellChecker()
    corrected = []
    for word in words:
        lowered = word.lower()
        if len(word) > 3:
            # correction() returns None when it has no candidate; fall back
            # to the user's word so the later ' '.join() never sees None.
            lowered = checker.correction(lowered) or lowered
        corrected.append(lowered)
    return corrected


@app.route('/search', methods=['GET'])
def app_search():
    """Handle ``/search``: run the query and a spell-checked variant, then render.

    Query parameters:
        q: The raw search string (defaults to an empty string).
        summarize: ``'true'`` to fetch each result URL and attach a summary
            produced by ``summarize_text``.

    Returns:
        The rendered ``search_results.html`` page.
    """
    show_suggest = False
    query = request.args.get('q', '')
    # clean up the query so it can be used by search()
    user_query = replaceSpecialCharacters(query)
    user_query_words = word_tokenize(user_query)
    # whether we want to search with summary or not
    summarize = request.args.get('summarize', 'false') == 'true'

    # search with the query exactly as the user typed it
    result_documents, avg_score_orginal = search.search(
        user_query_words, inverted_index, url_mapping, url_length_mapping,
        merged_output_path)
    results = _ranked_urls(result_documents)

    # search again with the spell-checked query
    corrected_query = _spell_correct(user_query_words)
    corrected_query_string = ' '.join(corrected_query)
    # De-duplicate while keeping the word order deterministic.
    corrected_query_words = list(dict.fromkeys(corrected_query))
    result_documents_corrected, avg_score_corrected = search.search(
        corrected_query_words, inverted_index, url_mapping,
        url_length_mapping, merged_output_path)
    correctedResults = _ranked_urls(result_documents_corrected)

    if not results:
        # The original query found nothing, so the user probably made a typo:
        # show the spell-checked results instead.
        results = correctedResults
        query = corrected_query_string
    elif avg_score_corrected > avg_score_orginal:
        # The corrected query scores better on average: offer it as a
        # suggestion.
        show_suggest = True
        print(f'Did you mean {corrected_query_string}?')

    if summarize:
        # Fetch and summarize content from each URL
        summaries = []
        for url in results:
            text_content = fetch_text_from_url(url)
            if text_content.strip() != '':
                summary = summarize_text([text_content])
            else:
                summary = ""
            summaries.append(summary)
        print("summaries list : ", summaries)
    else:
        # One empty placeholder per result; a fixed-size placeholder list
        # would make zip() silently drop every result beyond its length.
        summaries = [''] * len(results)

    # A list (not a one-shot zip iterator) so the template can iterate the
    # pairs more than once.
    result_summaries = list(zip(results, summaries))
    return render_template(
        'search_results.html',
        query=query,
        result_summaries=result_summaries,
        show_suggest=show_suggest,
        corrected_query=corrected_query_string,
        suggested_results=correctedResults,
        summarized=summarize,
    )
@app.route('/suggest_search', methods=['GET'])
def suggest_search():
    """Render ``suggest.html`` for a corrected query.

    Reads every ``suggested_results`` value and the ``corrected_query`` value
    from the request's query string and passes them to the template.
    """
    params = request.args
    suggestions = params.getlist('suggested_results')
    corrected = params.get('corrected_query')
    return render_template('suggest.html', query=corrected, suggested_results=suggestions)
# Start Flask's built-in server only when this file is executed directly.
if __name__ == '__main__':
    # NOTE(review): debug=True turns on Flask's debug mode; keep this entry
    # point for local development only.
    # use_reloader=False presumably avoids running the module-level index load
    # a second time in a reloader process; confirm.
    app.run(debug=True, use_reloader=False)