urls.py
import asyncio
import json
from threading import Thread
import time
from typing import Dict, Any
from urllib.parse import urlparse
from flask import Blueprint, request
from server.constant.constants import (MAX_ISOLATED_URL_BATCH_LENGTH,
FROM_ISOLATED_URL,
ADD_ISOLATED_URL_CONTENT,
DELETE_ISOLATED_URL_CONTENT)
from server.app.utils.decorators import token_required
from server.app.utils.sqlite_client import get_db_connection
from server.app.utils.diskcache_lock import diskcache_lock
from server.app.utils.url_helper import is_valid_url, normalize_url
from server.logger.logger_config import my_logger as logger
from server.rag.index.parser.html_parser.web_content_crawler import AsyncCrawlerSiteContent

urls_bp = Blueprint('urls', __name__, url_prefix='/open_kf_api/urls')


def async_isolated_url_content_task(url_dict: Dict[int, str],
task_type: int) -> None:
"""
    Start the crawl-and-embedding task for a batch of isolated URLs in an asyncio event loop.

    task_type:
        ADD_ISOLATED_URL_CONTENT    - crawl the URLs and add their content and embeddings
        DELETE_ISOLATED_URL_CONTENT - delete the URLs' content and embeddings
    """
logger.info(
f"async_isolated_url_content_task begin! url_dict: {url_dict}, task_type: {task_type}"
)
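    # domain_list is left empty because these are standalone URLs rather than a
    # whole-site crawl; FROM_ISOLATED_URL presumably marks the documents' source.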
crawler_content = AsyncCrawlerSiteContent(domain_list=[],
doc_source=FROM_ISOLATED_URL)
# Run the crawler
if task_type == ADD_ISOLATED_URL_CONTENT:
asyncio.run(crawler_content.add_content(url_dict))
elif task_type == DELETE_ISOLATED_URL_CONTENT:
asyncio.run(crawler_content.delete_content(url_dict))
    logger.info("async_isolated_url_content_task end!")


@urls_bp.route('/submit_isolated_url_list', methods=['POST'])
@token_required
def submit_isolated_url_list() -> Dict[str, Any]:
data = request.json
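    # Validate the payload: 'url_list' is required and its size is capped at MAX_ISOLATED_URL_BATCH_LENGTH.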
url_list = data.get('url_list')
if not url_list:
return {
'retcode': -20000,
'message': 'url_list is required',
'data': {}
}
if len(url_list) > MAX_ISOLATED_URL_BATCH_LENGTH:
return {
'retcode': -20001,
'message':
f"The size of 'url_list' is {len(url_list)}, which is greater than {MAX_ISOLATED_URL_BATCH_LENGTH}",
'data': {}
}
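    # Validate and normalize every URL before touching the database.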
normalized_url_list = []
for url in url_list:
if not is_valid_url(url):
logger.error(f"url: '{url}' is not a valid URL!")
return {
'retcode': -20002,
'message': f"url: '{url}' is not a valid URL",
'data': {}
}
normalized_url_list.append(normalize_url(url))
conn = None
try:
conn = get_db_connection()
cur = conn.cursor()
# Find which URLs already exist in the database
placeholders = ', '.join(['?'] * len(normalized_url_list))
cur.execute(
f"SELECT id, url FROM t_isolated_url_tab WHERE url IN ({placeholders})",
normalized_url_list)
existing_urls = {row['url']: row['id'] for row in cur.fetchall()}
        logger.warning(f"Existing URLs found in the database: {existing_urls}")
# Determine new and existing URLs
existing_to_update = []
new_to_insert = []
timestamp = int(time.time())
for url in normalized_url_list:
if url in existing_urls:
existing_to_update.append((timestamp, existing_urls[url]))
else:
new_to_insert.append((url, timestamp, timestamp))
try:
with diskcache_lock.lock():
# Update all existing URLs in one operation
if existing_to_update:
cur.executemany(
"UPDATE t_isolated_url_tab SET doc_status = 1, mtime = ? WHERE id = ?",
existing_to_update)
# Insert all new URLs in one operation
if new_to_insert:
cur.executemany(
"INSERT INTO t_isolated_url_tab (url, content, content_length, content_md5, doc_status, ctime, mtime) VALUES (?, '[]', 0, '', 1, ?, ?)",
new_to_insert)
conn.commit()
except Exception as e:
            logger.error(f"Process diskcache_lock exception: {e}")
return {
'retcode': -30000,
'message': f'An error occurred: {e}',
'data': {}
}
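        # Re-query so newly inserted rows are picked up together with their auto-generated ids.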
cur.execute(
f"SELECT id, url FROM t_isolated_url_tab WHERE url IN ({placeholders})",
normalized_url_list)
url_dict = {row['id']: row['url'] for row in cur.fetchall()}
# Start the asynchronous crawl task
Thread(target=async_isolated_url_content_task,
args=(url_dict, ADD_ISOLATED_URL_CONTENT)).start()
return {
'retcode': 0,
'message': 'URLs processed successfully',
'data': {
'url_id_list': list(url_dict.keys())
}
}
except Exception as e:
logger.error(f"An error occurred: {e}")
return {
'retcode': -30000,
'message': f'An error occurred: {e}',
'data': {}
}
finally:
if conn:
conn.close()


@urls_bp.route('/get_isolated_url_list', methods=['POST'])
@token_required
def get_isolated_url_list():
data = request.json
    url_id_list = data.get('id_list', None)  # 'id_list' is optional; omit it to fetch all isolated URLs
conn = None
try:
conn = get_db_connection()
cur = conn.cursor()
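        # If 'id_list' was provided, fetch only those rows; otherwise return all isolated URLs.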
if url_id_list:
placeholders = ', '.join(['?'] * len(url_id_list))
cur.execute(
f"SELECT id, url, content_length, doc_status, ctime, mtime FROM t_isolated_url_tab WHERE id IN ({placeholders})",
url_id_list)
else:
cur.execute(
"SELECT id, url, content_length, doc_status, ctime, mtime FROM t_isolated_url_tab"
)
rows = cur.fetchall()
response_data = {}
response_data['url_list'] = [dict(row) for row in rows]
return {'retcode': 0, 'message': 'Success', 'data': response_data}
except Exception as e:
logger.error(f"An error occurred while fetching URL list: {e}")
return {
'retcode': -30000,
'message': f'An error occurred: {e}',
'data': {}
}
finally:
if conn:
conn.close()


@urls_bp.route('/delete_isolated_url_list', methods=['POST'])
@token_required
def delete_isolated_url_list():
data = request.json
url_id_list = data.get('id_list')
if not url_id_list:
return {
'retcode': -20000,
'message': 'id_list is required',
'data': {}
}
conn = None
try:
conn = get_db_connection()
cur = conn.cursor()
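        # Build the id -> url mapping that the async deletion task needs.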
placeholders = ', '.join(['?'] * len(url_id_list))
cur.execute(
f"SELECT id, url FROM t_isolated_url_tab WHERE id IN ({placeholders})",
url_id_list)
url_dict = {row['id']: row['url'] for row in cur.fetchall()}
# Use threading to avoid blocking the Flask application
Thread(target=async_isolated_url_content_task,
args=(url_dict, DELETE_ISOLATED_URL_CONTENT)).start()
return {
'retcode': 0,
'message': 'Started deleting the isolated URL list embeddings.',
'data': {}
}
except Exception as e:
        logger.error(f"An error occurred while deleting the URL list: {e}")
return {
'retcode': -30000,
'message': f'An error occurred: {e}',
'data': {}
}
finally:
if conn:
conn.close()


@urls_bp.route('/get_isolated_url_sub_content_list', methods=['POST'])
@token_required
def get_isolated_url_sub_content_list():
data = request.json
url_id = data.get('id')
page = data.get('page')
page_size = data.get('page_size')
# Validate mandatory parameters
if None in (url_id, page, page_size):
return {
'retcode': -20000,
'message': 'Missing mandatory parameters',
'data': {}
}
if not isinstance(page, int) or not isinstance(
page_size, int) or page < 1 or page_size < 1:
return {
'retcode': -20001,
'message': 'Invalid page or page_size parameters',
'data': {}
}
conn = None
try:
conn = get_db_connection()
cur = conn.cursor()
# Retrieve the content from the database
cur.execute('SELECT content FROM t_isolated_url_tab WHERE id = ?',
(url_id, ))
row = cur.fetchone()
if not row:
return {
'retcode': -30000,
'message': 'Content not found',
'data': {}
}
content = row['content']
content_vec = json.loads(content)
# Calculate pagination details
total_count = len(content_vec)
start_index = (page - 1) * page_size
end_index = start_index + page_size
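        # e.g. page=2, page_size=10 -> start_index=10, end_index=20 (items 11-20).
        # Page 1 is allowed even when the content is empty; any later page past the end is rejected.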
if start_index > 0 and start_index >= total_count:
return {
'retcode': -20002,
'message': 'Page number out of range',
'data': {}
}
# Slice the content vector to get the sub-content list for the current page
        sub_content_list = [{
            "index": index + 1,
            "content": part,
            "content_length": len(part)
        } for index, part in enumerate(content_vec[start_index:end_index],
                                       start=start_index)]
return {
"retcode": 0,
"message": "success",
"data": {
"total_count": total_count,
"sub_content_list": sub_content_list
}
}
except Exception as e:
logger.error(f"An error occurred: {e}")
return {'retcode': -30001, 'message': 'Database exception', 'data': {}}
finally:
if conn:
conn.close()
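

# Example usage (illustrative sketch): calling these endpoints with `requests`.
# The host/port and the exact auth header expected by @token_required are
# assumptions (they are not defined in this file); adjust to your deployment.
#
#   import requests
#
#   BASE = "http://localhost:7000/open_kf_api/urls"       # assumed host/port
#   HEADERS = {"Authorization": "Bearer <access_token>"}   # assumed auth header
#
#   # Submit a batch of isolated URLs for crawling and embedding.
#   resp = requests.post(f"{BASE}/submit_isolated_url_list",
#                        headers=HEADERS,
#                        json={"url_list": ["https://example.com/faq"]})
#   url_id_list = resp.json()["data"]["url_id_list"]
#
#   # Page through the crawled sub-content of the first URL.
#   resp = requests.post(f"{BASE}/get_isolated_url_sub_content_list",
#                        headers=HEADERS,
#                        json={"id": url_id_list[0], "page": 1, "page_size": 10})
#   print(resp.json()["data"]["total_count"])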