21 changes: 21 additions & 0 deletions .github/pull_request_template.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
## PR Title

## Work Done (What)

## Reason (Why)

## Changes

## Testing (How)

## Notes
---

### Checklist
- [ ] The code works as intended.
- [ ] The changes are documented.
- [ ] Existing tests pass, and new tests were added where needed.

---

### Issue Number
18 changes: 18 additions & 0 deletions .github/workflows/github-actions-demo.yml
@@ -0,0 +1,18 @@
name: GitHub Actions Demo
run-name: ${{ github.actor }} is testing out GitHub Actions 🚀
on: [push]
jobs:
  Explore-GitHub-Actions:
    runs-on: ubuntu-latest
    steps:
      - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
      - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!"
      - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
      - name: Check out repository code
        uses: actions/checkout@v4
      - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
      - run: echo "🖥️ The workflow is now ready to test your code on the runner."
      - name: List files in the repository
        run: |
          ls ${{ github.workspace }}
      - run: echo "🍏 This job's status is ${{ job.status }}."
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,4 +2,4 @@
__pycache__
data
vectordb
ai-preprocessing
tmux-script/
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@ This is the repository for the AI RAG server.
- Write the test instructions in the GitHub README
1. Open [`https://app.docker.com/auth/desktop/redirect?code=zFNfmkeLy5_7-sb7gVnSh_42TJEd98tuGFwmmGHMH_bD4&state=B_I9FtCHqcoV3XqY-a5lZGAxdhRqKJuMaDNWlTVr86E`](https://app.docker.com/auth/desktop/redirect?code=zFNfmkeLy5_7-sb7gVnSh_42TJEd98tuGFwmmGHMH_bD4&state=B_I9FtCHqcoV3XqY-a5lZGAxdhRqKJuMaDNWlTVr86E) and install Docker Desktop with the default settings
2. Run Docker Desktop
3. Clone the project, set the environment variables in .env.local, then rename the file to .env
3. Clone the project, set the environment variables in .env.local, then rename the file to .env
4. Run
   1. On Windows
      1. Move to the directory containing the docker-compose.yml file
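The remaining run steps are truncated in this diff. A minimal sketch of what the run step presumably looks like with the Docker Compose v2 CLI — the service name `ai-preprocessing` comes from docker-compose.yml in this PR, but the script path and arguments are placeholders, not the project's documented commands:

```
# from the directory containing docker-compose.yml
docker compose up -d --build

# run the preprocessing script inside the ai-preprocessing container
docker compose exec ai-preprocessing \
    python script/create_restaurant_vectordb.py <csv-file> <index-name>
```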
7 changes: 7 additions & 0 deletions build_image/Dockerfile
@@ -0,0 +1,7 @@
# Use python:3.11-slim as the base image
FROM python:3.11-slim

RUN apt-get update && apt-get install -y build-essential
RUN pip install --upgrade pip
RUN pip install -r https://raw.githubusercontent.com/teddylee777/langchain-kr/main/requirements.txt

13 changes: 12 additions & 1 deletion docker-compose.yml
@@ -9,6 +9,17 @@ services:
      context: ./srcs/ai-server
      dockerfile: Dockerfile  # see the Dockerfile example below
    volumes:
      - ./srcs/ai-server/project:/project  # mount the host directory to /project in the container
      - ./srcs/ai-server/project:/project
    ports:
      - "80:8000"  # map host port 80 to container port 8000

  ai-preprocessing:
    init: true
    container_name: ai-preprocessing
    environment:
      OPENAI_API_KEY: ${OPENAI_API_KEY}
    build:
      context: ./srcs/ai-preprocessing
      dockerfile: Dockerfile  # see the Dockerfile example below
    volumes:
      - ./srcs/ai-preprocessing/project:/project
5 changes: 5 additions & 0 deletions srcs/ai-preprocessing/Dockerfile
@@ -0,0 +1,5 @@
FROM jeongtj/langchain-rag

WORKDIR /project

# Keep the container alive so scripts can be run via docker exec
CMD ["tail", "-f", "/dev/null"]
136 changes: 136 additions & 0 deletions srcs/ai-preprocessing/project/script/create_restaurant_vectordb.py
@@ -0,0 +1,136 @@
import os
import sys
from tqdm import tqdm
from pathlib import Path
from dotenv import load_dotenv
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
import pandas as pd

# Load environment variables
load_dotenv()

CHUNK_SIZE = 500  # number of CSV rows to process at a time

def split_csv(input_path: Path, output_prefix: str, chunk_size: int = CHUNK_SIZE):
    """Split a large CSV file into several smaller files"""
    output_files = []
    for i, chunk in enumerate(pd.read_csv(input_path, chunksize=chunk_size, encoding="utf-8")):
        output_file = input_path.parent / f"{output_prefix}_part{i}.csv"
        chunk.to_csv(output_file, index=False)
        output_files.append(output_file)
    return output_files


def prepare_restaurant_documents(docs):
    """Convert CSV rows into documents ready for embedding"""
    restaurant_docs = []
    for doc in docs:
        content = doc.page_content
        rstr_id = None

        for line in content.split("\n"):
            if line.startswith("\ufeffRSTR_ID:") or line.startswith("RSTR_ID:"):
                try:
                    rstr_id = int(line.split(":")[1].strip())
                except ValueError:
                    rstr_id = None  # fall back to None if conversion fails
                break

        content_lines = [
            line
            for line in content.split("\n")
            if not (line.startswith("\ufeffRSTR_ID:") or line.startswith("RSTR_ID:"))
        ]
        filtered_content = "\n".join(content_lines)

        restaurant_docs.append(
            Document(
                page_content=filtered_content.strip(), metadata={"RSTR_ID": rstr_id}
            )
        )
    return restaurant_docs


def create_vectordb(data_path: str | Path, index_name: str, encoding: str = "utf-8") -> None:
    """Create the vector DB and save it to disk"""
    project_root = Path(__file__).parent.parent
    data_path = project_root / data_path

    if not data_path.exists():
        raise FileNotFoundError(f"Data file not found: {data_path}")

    vectordb_path = project_root / "vectordb" / index_name
    vectordb_path.parent.mkdir(exist_ok=True, parents=True)

    # Load the existing vector DB, if there is one
    embeddings = OpenAIEmbeddings()
    vectorstore = None

    if vectordb_path.exists():
        print("Loading the existing vector DB...")
        try:
            vectorstore = FAISS.load_local(str(vectordb_path), embeddings, allow_dangerous_deserialization=True)
        except Exception as e:
            print(f"Failed to load the vector DB: {e}")
            vectorstore = None

    # Split the input CSV file
    print(f"Input file is large, splitting... (default {CHUNK_SIZE} rows per chunk)")
    split_files = split_csv(data_path, index_name)

    # Process the split CSV files one by one
    for file in tqdm(split_files, desc="Embedding"):
        print(f"Processing: {file}")
        loader = CSVLoader(file_path=str(file), encoding=encoding)
        docs = loader.load()

        processed_docs = prepare_restaurant_documents(docs)
        print(f"Documents processed: {len(processed_docs)}")

        # Build a FAISS vector DB from the documents
        new_vectorstore = FAISS.from_documents(
            documents=processed_docs, embedding=embeddings
        )

        # Merge into the existing vector DB
        if vectorstore:
            vectorstore.merge_from(new_vectorstore)
        else:
            vectorstore = new_vectorstore

    # Remove the temporary split files
    for file in split_files:
        os.remove(file)

    # Save the vector DB
    if vectorstore:
        vectorstore.save_local(str(vectordb_path))
        print(f"Vector DB saved: {vectordb_path}")
    else:
        print("⚠️ No data to save to the vector DB!")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("⚠️ Please provide the data file name")
        sys.exit(1)
    if len(sys.argv) < 3:
        print("⚠️ Please provide the name of the vector store to create")
        sys.exit(1)

    data_path = sys.argv[1]
    index_name = sys.argv[2]
    create_vectordb(data_path=data_path, index_name=index_name)
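The BOM handling in `prepare_restaurant_documents` is easy to get wrong: when a UTF-8 CSV file starts with a byte-order mark, `CSVLoader` can leave `\ufeff` glued to the first header name, so a plain `startswith("RSTR_ID:")` misses it. A standalone sketch of the same ID-extraction idea, using a plain dict instead of LangChain's `Document` so it runs without any dependencies (the function name `extract_rstr_id` is illustrative, not part of the PR):

```python
def extract_rstr_id(content: str) -> dict:
    """Pull the RSTR_ID out of a CSV-row text block, tolerating a UTF-8 BOM."""
    rstr_id = None
    kept_lines = []
    for line in content.split("\n"):
        # CSVLoader may keep the BOM (\ufeff) attached to the first header name
        if line.lstrip("\ufeff").startswith("RSTR_ID:"):
            try:
                rstr_id = int(line.split(":")[1].strip())
            except ValueError:
                rstr_id = None  # non-numeric ID: keep None
        else:
            kept_lines.append(line)
    return {
        "page_content": "\n".join(kept_lines).strip(),
        "metadata": {"RSTR_ID": rstr_id},
    }

doc = extract_rstr_id("\ufeffRSTR_ID: 42\nname: Chez Test\ncategory: bistro")
print(doc["metadata"]["RSTR_ID"])  # → 42
print(doc["page_content"])         # → name: Chez Test\ncategory: bistro
```

Stripping the BOM with `lstrip("\ufeff")` before the prefix test covers both the BOM and non-BOM cases in one branch, which is slightly tighter than checking two prefixes as the PR does.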
39 changes: 39 additions & 0 deletions srcs/ai-preprocessing/project/script/faiss_index_check.py
@@ -0,0 +1,39 @@
import sys
from pathlib import Path
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Load environment variables
load_dotenv()

def print_vectordb_info(data_path: str | Path) -> None:
    """Load a vector DB and print the number of vectors in its index"""
    project_root = Path(__file__).parent.parent
    data_path = project_root / data_path

    if not data_path.exists():
        raise FileNotFoundError(f"Vector DB not found: {data_path}")

    # Load the existing vector DB
    embeddings = OpenAIEmbeddings()
    vectorstore = None

    print("Loading the existing vector DB...")
    try:
        vectorstore = FAISS.load_local(str(data_path), embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        print(f"Failed to load the vector DB: {e}")
        vectorstore = None

    # Only dereference the index if loading succeeded
    if vectorstore:
        print(vectorstore.index.ntotal)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("⚠️ Please provide the vector DB path")
        sys.exit(1)
    data_path = sys.argv[1]
    print_vectordb_info(data_path=data_path)
15 changes: 2 additions & 13 deletions srcs/ai-server/Dockerfile
@@ -1,22 +1,11 @@
# Use python:3.11-slim as the base image
FROM python:3.11-slim
# FROM langchain/langchain:latest
FROM jeongtj/langchain-rag

# Create and set the working directory
WORKDIR /project

# Copy requirements.txt and install dependencies
COPY requirements.txt /project/
RUN apt-get update && apt-get install -y build-essential
RUN pip install --upgrade pip
RUN pip install -r https://raw.githubusercontent.com/teddylee777/langchain-kr/main/requirements.txt
COPY entrypoint.sh /home/entrypoint.sh

RUN pip install -r requirements.txt

# Copy the application code
# COPY ./app /app

COPY entrypoint.sh /home/entrypoint.sh
RUN chmod +x /home/entrypoint.sh

# Expose port 8000 of the container
6 changes: 3 additions & 3 deletions srcs/ai-server/project/app/services/base.py
@@ -6,11 +6,11 @@

class BaseService:
    def __init__(
        self, vectordb_name: str, model_name: str = "gpt-4", temperature: float = 0.2
        self, vectordb_name: str, model_name: str = "gpt-4o-mini", temperature: float = 0.2
    ):
        self.vectorstore = load_vectordb(vectordb_name)
        self.retriever = self.vectorstore.as_retriever()
        self.llm = ChatOpenAI(model_name=model_name, temperature=temperature)
        self.retriever = self.vectorstore.as_retriever(search_kwargs={"k": 20})
        self.llm = ChatOpenAI(model_name=model_name, temperature=temperature, max_tokens=8192)

    async def process_query(self, query: str, prompt_template: str) -> Dict[str, Any]:
        raise NotImplementedError