1+ import pandas as pd
12from elasticsearch import Elasticsearch
23from typing import List , Dict , Any , Tuple , Optional
3- from datetime import datetime
4+ from datetime import datetime , timezone
45
56
67class ElasticsearchService :
@@ -11,6 +12,7 @@ def __init__(self, host: str = "elasticsearch", port: int = 9200):
1112 self .index_name = "place_data"
1213 self .log_index_name = "chatbot_log"
1314 self .click_log_index_name = "click_log"
15+ self .search_log_index_name = "search_log"
1416
1517 def is_connected (self ) -> bool :
1618 """연결 상태 확인"""
@@ -19,7 +21,7 @@ def is_connected(self) -> bool:
1921 except :
2022 return False
2123
22- def search_places (self , query : str , max_results : int = 23 ) -> List [Dict [str , Any ]]:
24+ def search_places (self , query : str , max_results : int = 23 , user_id : Optional [ str ] = None ) -> List [Dict [str , Any ]]:
2325 """장소 검색"""
2426 search_body = {
2527 "query" : {
@@ -48,51 +50,19 @@ def search_places(self, query: str, max_results: int = 23) -> List[Dict[str, Any
4850 for hit in hits
4951 ]
5052
51- return places
52-
53- def search_places_chatbot (self , query : str , max_results : int = 100 ):
54- """챗봇 장소 검색"""
55- search_body = {
56- "query" : {
57- "match" : {
58- "name" : {
59- "query" : query ,
60- "fuzziness" : "AUTO"
61- }
62- }
63- },
64- "sort" : [{"_score" : {"order" : "desc" }}],
65- "size" : max_results ,
66- "_source" : ["uuid" , "name" , "category" , "subcategory" , "gu" , "dong" , "ro" , "station" , "location" , "opentime" , "breaktime" , "closedate" , "phone" , "alias" , "address" , "content" ]
67- }
68-
69- response = self .es .search (index = self .index_name , body = search_body )
70-
71- hits = response ['hits' ]['hits' ]
72- places = [
73- {
74- 'uuid' : hit ['_source' ]['uuid' ],
75- 'name' : hit ['_source' ]['name' ],
76- 'category' : hit ['_source' ]['category' ],
77- 'subcategory' : hit ['_source' ]['subcategory' ],
78- 'gu' : hit ['_source' ]['gu' ],
79- 'dong' : hit ['_source' ]['dong' ],
80- 'ro' : hit ['_source' ]['ro' ],
81- 'station' : hit ['_source' ]['station' ],
82- 'location' : hit ['_source' ]['location' ],
83- 'opentime' : hit ['_source' ]['opentime' ],
84- 'breaktime' : hit ['_source' ]['breaktime' ],
85- 'closedate' : hit ['_source' ]['closedate' ],
86- 'phone' : hit ['_source' ]['phone' ],
87- 'alias' : hit ['_source' ]['alias' ],
88- 'address' : hit ['_source' ]['address' ],
89- 'content' : hit ['_source' ]['content' ]
53+ if user_id :
54+ place_ids = [place ['uuid' ] for place in places ]
55+ log_data = {
56+ "userId" : user_id ,
57+ "query" : query ,
58+ "placeIds" : place_ids ,
59+ "timestamp" : datetime .now (timezone .utc )
9060 }
91- for hit in hits
92- ]
61+ self . insert_search_log ( log_data )
62+
9363 return places
9464
95- def search_places_for_llm_tool (self , region : str , categories : List [str ]) -> Tuple [List [str ], int ]:
65+ def search_places_for_llm_tool (self , region : str , categories : List [str ], user_id : Optional [ str ] = None ) -> Tuple [List [str ], int ]:
9666 """
9767 LLM 도구를 위한 장소 검색.
9868 지역과 카테고리 정보를 바탕으로 장소 uuid 목록과 총 개수를 반환합니다.
@@ -308,4 +278,130 @@ def get_click_count_by_place(self, place_id: str) -> int:
308278 return response .get ('count' , 0 )
309279 except Exception as e :
310280 print (f"클릭 수 조회 오류: { e } " )
311- return 0
281+ return 0
282+
283+ def create_search_log_index_if_not_exists (self ):
284+ """검색 로그 인덱스가 없으면 생성"""
285+ try :
286+ if not self .es .indices .exists (index = self .search_log_index_name ):
287+ # 검색 로그 인덱스 매핑 설정
288+ mapping = {
289+ "mappings" : {
290+ "properties" : {
291+ "userId" : {"type" : "keyword" },
292+ "query" : {"type" : "text" },
293+ "placeIds" : {"type" : "keyword" },
294+ "timestamp" : {"type" : "date" }
295+ }
296+ }
297+ }
298+ self .es .indices .create (index = self .search_log_index_name , body = mapping )
299+ print (f"검색 로그 인덱스 '{ self .search_log_index_name } ' 생성 완료" )
300+ except Exception as e :
301+ raise Exception (f"검색 로그 인덱스 생성 오류: { e } " )
302+
303+ def insert_search_log (self , log_data : dict ) -> Tuple [bool , Optional [str ]]:
304+ """검색 로그 데이터를 Elasticsearch에 삽입"""
305+ try :
306+ # 문서 ID 생성 (타임스탬프 + userId 조합)
307+ doc_id = f"{ log_data ['userId' ]} _{ int (datetime .now ().timestamp ())} "
308+
309+ # Elasticsearch에 문서 삽입
310+ response = self .es .index (
311+ index = self .search_log_index_name ,
312+ id = doc_id ,
313+ body = log_data
314+ )
315+
316+ # 삽입 성공 여부 확인
317+ if response .get ('result' ) in ['created' , 'updated' ]:
318+ return True , doc_id
319+ else :
320+ return False , None
321+
322+ except Exception as e :
323+ print (f"검색 로그 삽입 오류: { e } " )
324+ return False , None
325+
326+ def get_all_search_logs_by_user (self , user_id : str ) -> List [Dict ]:
327+ """사용자의 모든 검색 로그를 시간순으로 조회"""
328+ try :
329+ query = {
330+ "query" : {"term" : {"userId" : user_id }},
331+ "sort" : [{"timestamp" : {"order" : "asc" }}],
332+ "size" : 1000
333+ }
334+ response = self .es .search (index = self .search_log_index_name , body = query )
335+ return response ["hits" ]["hits" ]
336+ except Exception as e :
337+ print (f"사용자 검색 로그 조회 오류: { e } " )
338+ return []
339+
340+ def get_all_click_logs_by_user (self , user_id : str ) -> List [Dict ]:
341+ """사용자의 모든 클릭 로그를 시간순으로 조회"""
342+ try :
343+ query = {
344+ "query" : {"term" : {"userId" : user_id }},
345+ "sort" : [{"timestamp" : {"order" : "asc" }}],
346+ "size" : 10000
347+ }
348+ response = self .es .search (index = self .click_log_index_name , body = query )
349+ return response ["hits" ]["hits" ]
350+ except Exception as e :
351+ print (f"사용자 클릭 로그 조회 오류: { e } " )
352+ return []
353+
    def get_search_click_data_for_user(self, user_id: str) -> pd.DataFrame:
        """Build a labelled training DataFrame from a user's search and click logs.

        For each search log, every place id it returned becomes one row
        ``{"userid", "place_id", "yn"}`` where ``yn`` is 1 if the user clicked
        that place in the window between this search and the next search
        (the current time bounds the window of the most recent search), else 0.

        Returns an empty DataFrame when the user has no search logs, or when
        no search carried any place ids.
        """

        def _ensure_utc_aware(dt_str: str) -> datetime:
            """Parse an ISO-8601 timestamp string into a UTC-aware datetime."""
            # 'Z' suffix is rewritten because datetime.fromisoformat (pre-3.11)
            # cannot parse it directly.
            dt = datetime.fromisoformat(dt_str.replace('Z', '+00:00'))
            if dt.tzinfo is None:
                # Naive timestamps are treated as UTC.
                return dt.replace(tzinfo=timezone.utc)
            return dt

        search_logs = self.get_all_search_logs_by_user(user_id)
        click_logs = self.get_all_click_logs_by_user(user_id)

        if not search_logs:
            return pd.DataFrame()

        training_data = []
        # Both log lists are sorted by timestamp ascending (see the fetch
        # helpers), so clicks can be consumed in a single forward pass while
        # walking the searches — each click is examined at most once per window.
        click_iterator = iter(click_logs)
        current_click = next(click_iterator, None)

        for i, search_hit in enumerate(search_logs):
            search_log = search_hit["_source"]
            search_time = _ensure_utc_aware(search_log["timestamp"])

            # The click window for this search ends at the next search's
            # timestamp, or "now" for the most recent search.
            next_search_time = datetime.now(timezone.utc)
            if i + 1 < len(search_logs):
                next_search_log = search_logs[i + 1]["_source"]
                next_search_time = _ensure_utc_aware(next_search_log["timestamp"])

            clicked_in_window = set()
            while current_click:
                click_log = current_click["_source"]
                click_time = _ensure_utc_aware(click_log["timestamp"])

                if search_time <= click_time < next_search_time:
                    # Click falls inside this window: record it and advance.
                    clicked_in_window.add(click_log["placeId"])
                    current_click = next(click_iterator, None)
                elif click_time >= next_search_time:
                    # Click belongs to a later window; keep it for the next
                    # search iteration.
                    break
                else:  # click_time < search_time
                    # Click predates this window (e.g. before the first
                    # search); it can never match a later window, so skip it.
                    current_click = next(click_iterator, None)

            for place_id in search_log.get("placeIds", []):
                training_data.append({
                    "userid": user_id,
                    "place_id": place_id,
                    "yn": 1 if place_id in clicked_in_window else 0
                })

        if not training_data:
            return pd.DataFrame()

        return pd.DataFrame(training_data)
0 commit comments