Skip to content

Commit c7211df

Browse files
committed
replace 'Redis' with 'diskcache'
1 parent a34de92 commit c7211df

12 files changed

+511
-369
lines changed

README.md

+2-10
Original file line numberDiff line numberDiff line change
@@ -84,23 +84,15 @@ Once the virtual environment is activated, you can use `pip` to install the requ
8484
pip install -r requirements.txt
8585
```
8686

87-
#### 4.2 Install Redis
88-
89-
The OpenKF service relies on Redis as its caching service. If Redis is already installed, start Redis and listen on port `6379`. If not installed, refer to the following method for installation.
90-
91-
```shell
92-
docker run --name redis -d -p 6379:6379 redis
93-
```
94-
95-
#### 4.3 Create SQLite Database
87+
#### 4.2 Create SQLite Database
9688

9789
The OpenKF service uses SQLite as its storage DB. Before starting the OpenKF service, you need to execute the following command to initialize the database and add the default configuration for the admin console.
9890

9991
```shell
10092
python3 create_sqlite_db.py
10193
```
10294

103-
#### 4.4 Start the service
95+
#### 4.3 Start the service
10496

10597
Once you have completed the steps above, you can start the OpenKF service by executing the following command.
10698

crawler_module/web_content_crawler.py

+35-42
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@
1212

1313
class AsyncCrawlerSiteContent:
1414

15-
def __init__(self, domain_list, sqlite_db_path, max_requests, max_embedding_input, document_embedder_obj, redis_lock):
15+
def __init__(self, domain_list, sqlite_db_path, max_requests, max_embedding_input, document_embedder_obj, distributed_lock):
1616
logger.info(f"[CRAWL_CONTENT] init, domain_list:{domain_list}")
1717
self.domain_list = domain_list
1818
self.sqlite_db_path = sqlite_db_path
1919
self.semaphore = asyncio.Semaphore(max_requests)
2020
self.max_embedding_input = max_embedding_input
2121
self.document_embedder_obj = document_embedder_obj
22-
self.redis_lock = redis_lock
22+
self.distributed_lock = distributed_lock
2323
self.count = 0
2424
self.batch_size = max_requests * 2
2525

@@ -112,20 +112,19 @@ async def update_doc_status(self, doc_id_list, doc_status):
112112
logger.info(f"[CRAWL_CONTENT] update_doc_status, doc_id_list:{doc_id_list}, doc_status:{doc_status}")
113113
timestamp = int(time.time())
114114
async with aiosqlite.connect(self.sqlite_db_path) as db:
115-
# Enable WAL mode for better concurrency
116115
await db.execute("PRAGMA journal_mode=WAL;")
117116

118-
if await self.redis_lock.aacquire_lock():
119-
try:
117+
try:
118+
with self.distributed_lock.lock():
120119
await db.execute(
121120
"UPDATE t_raw_tab SET doc_status = ?, mtime = ? WHERE id IN ({placeholders})".format(
122121
placeholders=','.join(['?' for _ in doc_id_list])
123122
),
124123
[doc_status, timestamp] + doc_id_list
125124
)
126125
await db.commit()
127-
finally:
128-
await self.redis_lock.arelease_lock()
126+
except Exception as e:
127+
logger.error(f"process distributed_lock exception:{e}")
129128

130129
async def fetch_existing_contents(self, doc_id_list):
131130
"""
@@ -134,7 +133,6 @@ async def fetch_existing_contents(self, doc_id_list):
134133
logger.info(f"[CRAWL_CONTENT] fetch_existing_contents, doc_id_list:{doc_id_list}")
135134
query = "SELECT id, content FROM t_raw_tab WHERE id IN ({})".format(', '.join('?' for _ in doc_id_list))
136135
async with aiosqlite.connect(self.sqlite_db_path) as db:
137-
# Enable WAL mode for better concurrency
138136
await db.execute("PRAGMA journal_mode=WAL;")
139137

140138
cursor = await db.execute(query, doc_id_list)
@@ -192,33 +190,31 @@ async def process_updated_contents(self, updated_contents, url_dict):
192190
content_update_queries.append((content_json, content_length, 3, timestamp, doc_id))
193191

194192
# Lock to ensure database operations are atomic
195-
if await self.redis_lock.aacquire_lock():
196-
try:
197-
async with aiosqlite.connect(self.sqlite_db_path) as db:
198-
# Enable WAL mode for better concurrency
199-
await db.execute("PRAGMA journal_mode=WAL;")
193+
async with aiosqlite.connect(self.sqlite_db_path) as db:
194+
await db.execute("PRAGMA journal_mode=WAL;")
200195

196+
try:
197+
with self.distributed_lock.lock():
201198
# Update content details in t_raw_tab
202199
await db.executemany(
203200
"UPDATE t_raw_tab SET content = ?, content_length = ?, doc_status = ?, mtime = ? WHERE id = ?",
204201
content_update_queries
205202
)
206203
await db.commit()
207-
finally:
208-
await self.redis_lock.arelease_lock()
209-
204+
except Exception as e:
205+
logger.error(f"process distributed_lock exception:{e}")
206+
210207
# Delete old embeddings
211208
doc_id_list = list(updated_contents.keys())
212209
await self.delete_embedding_doc(doc_id_list)
213210

214211
# Prepare data for updating embeddings and database records
215212
data_for_embedding = [(doc_id, url_dict[doc_id], chunk_text_vec) for doc_id, chunk_text_vec in updated_contents.items()]
216-
if await self.redis_lock.aacquire_lock():
217-
try:
213+
try:
214+
with self.distributed_lock.lock():
218215
records_to_add, records_to_update = await self.document_embedder_obj.aadd_content_embedding(data_for_embedding)
219216
# Insert new embedding records and update t_raw_tab doc_status to 4
220217
async with aiosqlite.connect(self.sqlite_db_path) as db:
221-
# Enable WAL mode for better concurrency
222218
await db.execute("PRAGMA journal_mode=WAL;")
223219

224220
if records_to_add:
@@ -229,25 +225,24 @@ async def process_updated_contents(self, updated_contents, url_dict):
229225
if records_to_update:
230226
await db.executemany("UPDATE t_raw_tab SET doc_status = ?, mtime = ? WHERE id = ?", records_to_update)
231227
await db.commit()
232-
finally:
233-
await self.redis_lock.arelease_lock()
228+
except Exception as e:
229+
logger.error(f"process distributed_lock exception:{e}")
234230

235231
async def update_unchanged_contents_status(self, unchanged_doc_ids):
236232
"""
237233
Update the status of unchanged contents in the database to reflect they have been processed.
238234
"""
239235
logger.info(f"[CRAWL_CONTENT] update_unchanged_contents_status, unchanged_doc_ids:{unchanged_doc_ids}")
240236
async with aiosqlite.connect(self.sqlite_db_path) as db:
241-
# Enable WAL mode for better concurrency
242237
await db.execute("PRAGMA journal_mode=WAL;")
243238

244-
if await self.redis_lock.aacquire_lock():
245-
try:
239+
try:
240+
with self.distributed_lock.lock():
246241
async with aiosqlite.connect(self.sqlite_db_path) as db:
247242
await db.execute("UPDATE t_raw_tab SET doc_status = 4 WHERE id IN ({})".format(', '.join('?' for _ in unchanged_doc_ids)), unchanged_doc_ids)
248243
await db.commit()
249-
finally:
250-
await self.redis_lock.arelease_lock()
244+
except Exception as e:
245+
logger.error(f"process distributed_lock exception:{e}")
251246

252247
async def add_content(self, url_dict):
253248
"""Begin processing URLs from url_dict in batches for add."""
@@ -294,26 +289,25 @@ async def delete_embedding_doc(self, doc_id_vec):
294289
doc_id_tuple = tuple(doc_id_vec)
295290
placeholder = ','.join('?' * len(doc_id_vec)) # Create placeholders
296291
async with aiosqlite.connect(self.sqlite_db_path) as db:
297-
# Enable WAL mode for better concurrency
298292
await db.execute("PRAGMA journal_mode=WAL;")
299293

300294
cursor = await db.execute(f"SELECT embedding_id_list FROM t_doc_embedding_map_tab WHERE doc_id IN ({placeholder})", doc_id_tuple)
301295
rows = await cursor.fetchall()
302296
# Parse embedding_id_list and flatten the list
303297
embedding_id_vec = [id for row in rows for id in json.loads(row[0])]
304298

305-
if await self.redis_lock.aacquire_lock():
306-
try:
299+
try:
300+
with self.distributed_lock.lock():
307301
if embedding_id_vec:
308302
logger.info(f"[CRAWL_CONTENT] delete_embedding_doc, document_embedder_obj.delete_content_embedding:{embedding_id_vec}")
309303
self.document_embedder_obj.delete_content_embedding(embedding_id_vec)
310304

311305
# Delete records from t_doc_embedding_map_tab
312306
await db.execute(f"DELETE FROM t_doc_embedding_map_tab WHERE doc_id IN ({placeholder})", doc_id_tuple)
313307
await db.commit()
314-
finally:
315-
await self.redis_lock.arelease_lock()
316-
308+
except Exception as e:
309+
logger.error(f"process distributed_lock exception:{e}")
310+
317311
async def delete_content(self, url_dict, delete_raw_table=True):
318312
"""Begin processing URLs from url_dict in batches for deletion."""
319313
begin_time = int(time.time())
@@ -345,12 +339,13 @@ async def process_delete_batch(self, batch, delete_raw_table):
345339
# Delete records from t_raw_tab after deleting embeddings
346340
async with aiosqlite.connect(self.sqlite_db_path) as db:
347341
await db.execute("PRAGMA journal_mode=WAL;")
348-
if await self.redis_lock.aacquire_lock():
349-
try:
342+
343+
try:
344+
with self.distributed_lock.lock():
350345
await db.execute(f"DELETE FROM t_raw_tab WHERE id IN ({placeholder})", doc_id_tuple)
351346
await db.commit()
352-
finally:
353-
await self.redis_lock.arelease_lock()
347+
except Exception as e:
348+
logger.error(f"process distributed_lock exception:{e}")
354349

355350
async def update_content(self, url_dict):
356351
logger.info(f"[CRAWL_CONTENT] update_content begin, url_dict:{url_dict}")
@@ -361,7 +356,6 @@ async def update_content(self, url_dict):
361356
async def check_and_update_domain_status(self):
362357
logger.info(f"[CRAWL_CONTENT] check_and_update_domain_status")
363358
async with aiosqlite.connect(self.sqlite_db_path) as db:
364-
# Enable WAL mode for better concurrency
365359
await db.execute("PRAGMA journal_mode=WAL;")
366360

367361
timestamp = int(time.time())
@@ -375,13 +369,12 @@ async def check_and_update_domain_status(self):
375369
"SELECT COUNT(*) FROM t_raw_tab WHERE domain = ? AND doc_status < 4", (domain,))
376370
count_row = await cursor.fetchone()
377371
if count_row[0] == 0: # If no records have doc_status < 4
378-
if await self.redis_lock.aacquire_lock():
379-
try:
372+
try:
373+
with self.distributed_lock.lock():
380374
# Step 3: Update domain_status to 4 in t_domain_tab
381375
await db.execute(
382376
"UPDATE t_domain_tab SET domain_status = ?, mtime = ? WHERE domain = ?", (4, timestamp, domain))
383377
await db.commit()
384-
finally:
385-
await self.redis_lock.arelease_lock()
378+
except Exception as e:
379+
logger.error(f"process distributed_lock exception:{e}")
386380
logger.info(f"[CRAWL_CONTENT] check_and_update_domain_status, Domain status updated to 4 for domain:'{domain}'")
387-

0 commit comments

Comments
 (0)