diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..481e28f --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,21 @@ +## PR 제목 + +## 작업 내용 (What) + +## 이유 (Why) + +## 변경 사항 (Changes) + +## 테스트 (How) + +## 참고 사항 +--- + +### 체크리스트 +- [ ] 코드가 의도한 대로 작동합니다. +- [ ] 변경 사항이 문서화되었습니다. +- [ ] 기존 테스트를 통과했으며, 필요한 경우 새로운 테스트를 추가했습니다. + +--- + +### 이슈 번호 \ No newline at end of file diff --git a/.github/workflows/github-actions-demo.yml b/.github/workflows/github-actions-demo.yml new file mode 100644 index 0000000..15a61d6 --- /dev/null +++ b/.github/workflows/github-actions-demo.yml @@ -0,0 +1,18 @@ +name: GitHub Actions Demo +run-name: ${{ github.actor }} is testing out GitHub Actions 🚀 +on: [push] +jobs: + Explore-GitHub-Actions: + runs-on: ubuntu-latest + steps: + - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." + - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" + - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." + - name: Check out repository code + uses: actions/checkout@v4 + - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." + - run: echo "🖥️ The workflow is now ready to test your code on the runner." + - name: List files in the repository + run: | + ls ${{ github.workspace }} + - run: echo "🍏 This job's status is ${{ job.status }}." diff --git a/.gitignore b/.gitignore index 1657c37..e4ac5bb 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ __pycache__ data vectordb -ai-preprocessing \ No newline at end of file +tmux-script/ \ No newline at end of file diff --git a/README.md b/README.md index f31725a..d6fa08b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ AI RAG서버용 리포지토리입니다. - 깃허브 readme에 테스트 방법 작성 1. 
[`https://www.docker.com/products/docker-desktop/`](https://www.docker.com/products/docker-desktop/) 링크에 들어가서 Docker Desktop 기본 설정으로 설치 2. Docker Desktop 실행 - 3. 프로젝트 clone 한 뒤 .env.local에 있는 환경변수 설정 후 .env로 파일이름 변경 + 3. 프로젝트 clone 한 뒤 .env.local에 있는 환경변수 설정 후 .env로 파일이름 변경 4. 실행 1. in Window 1. docker-compose.yml파일이 있는 디렉토리로 이동 diff --git a/build_image/Dockerfile b/build_image/Dockerfile new file mode 100644 index 0000000..6b68fd6 --- /dev/null +++ b/build_image/Dockerfile @@ -0,0 +1,7 @@ +# 베이스 이미지로 python:3.11-slim 사용 +FROM python:3.11-slim + +RUN apt-get update && apt-get install -y build-essential +RUN pip install --upgrade pip +RUN pip install -r https://raw.githubusercontent.com/teddylee777/langchain-kr/main/requirements.txt + diff --git a/docker-compose.yml b/docker-compose.yml index a66b55a..15d3dd8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,6 +9,17 @@ services: context: ./srcs/ai-server dockerfile: Dockerfile # 아래에 Dockerfile 예시 참고 volumes: - - ./srcs/ai-server/project:/project # 현재 디렉토리를 컨테이너 내 /app에 마운트 + - ./srcs/ai-server/project:/project ports: - "80:8000" # 호스트의 8000 포트를 컨테이너의 8000 포트 + + ai-preprocessing: + init: true + container_name: ai-preprocessing + environment: + OPENAI_API_KEY: ${OPENAI_API_KEY} + build: + context: ./srcs/ai-preprocessing + dockerfile: Dockerfile # 아래에 Dockerfile 예시 참고 + volumes: + - ./srcs/ai-preprocessing/project:/project \ No newline at end of file diff --git a/srcs/ai-preprocessing/Dockerfile b/srcs/ai-preprocessing/Dockerfile new file mode 100644 index 0000000..66f773d --- /dev/null +++ b/srcs/ai-preprocessing/Dockerfile @@ -0,0 +1,5 @@ +FROM jeongtj/langchain-rag + +WORKDIR /project + +CMD ["tail", "-f", "/dev/null"] diff --git a/srcs/ai-preprocessing/project/script/create_restaurant_vectordb.py
b/srcs/ai-preprocessing/project/script/create_restaurant_vectordb.py new file mode 100644 index 0000000..6dd5157 --- /dev/null +++ b/srcs/ai-preprocessing/project/script/create_restaurant_vectordb.py @@ -0,0 +1,136 @@ +import os +import sys +from tqdm import tqdm +from pathlib import Path +from dotenv import load_dotenv +from langchain_community.document_loaders.csv_loader import CSVLoader +from langchain_community.vectorstores import FAISS +from langchain_openai import OpenAIEmbeddings +from langchain.schema import Document +import pandas as pd + +# 환경변수 로드 +load_dotenv() + +CHUNK_SIZE = 500 # 한 번에 처리할 CSV 행 수 + +def split_csv(input_path: Path, output_prefix: str, chunk_size: int = CHUNK_SIZE): + """큰 CSV 파일을 여러 개로 분할""" + output_files = [] + for i, chunk in enumerate(pd.read_csv(input_path, chunksize=chunk_size, encoding="utf-8")): + output_file = input_path.parent / f"{output_prefix}_part{i}.csv" + chunk.to_csv(output_file, index=False) + output_files.append(output_file) + return output_files + + +def prepare_restaurant_documents(docs): + """CSV 데이터를 벡터화할 문서 형태로 변환""" + restaurant_docs = [] + for doc in docs: + content = doc.page_content + rstr_id = None + + for line in content.split("\n"): + if line.startswith("\ufeffRSTR_ID:") or line.startswith("RSTR_ID:"): + try: + rstr_id = int(line.split(":")[1].strip()) + except ValueError: + rstr_id = None # 변환 실패 시 None으로 설정 + break + + content_lines = [ + line + for line in content.split("\n") + if not (line.startswith("\ufeffRSTR_ID:") or line.startswith("RSTR_ID:")) + ] + filtered_content = "\n".join(content_lines) + + restaurant_docs.append( + Document( + page_content=filtered_content.strip(), metadata={"RSTR_ID": rstr_id} + ) + ) + return restaurant_docs + + +def create_vectordb(data_path: str | Path, index_name: str, encoding: str = "utf-8") -> None: + """벡터 DB를 생성하고 저장""" + project_root = Path(__file__).parent.parent + data_path = project_root / data_path + + if not data_path.exists(): + raise 
FileNotFoundError(f"데이터 파일을 찾을 수 없습니다: {data_path}") + + vectordb_path = project_root / "vectordb" / index_name + vectordb_path.parent.mkdir(exist_ok=True, parents=True) + + # 기존 벡터DB 로드 (이미 있는 경우) + embeddings = OpenAIEmbeddings() + vectorstore = None + + if vectordb_path.exists(): + print("기존 벡터DB를 로드합니다...") + try: + vectorstore = FAISS.load_local(str(vectordb_path), embeddings, allow_dangerous_deserialization=True) + except Exception as e: + print(f"벡터DB 로드 실패: {e}") + vectorstore = None + + # CSV 파일 분할 + print(f"파일이 너무 커서 분할 진행... (기본 {CHUNK_SIZE}행씩)") + split_files = split_csv(data_path, index_name) + + # 분할된 CSV 파일 하나씩 처리 + for file in tqdm(split_files, desc="Embedding 중"): + print(f"처리 중: {file}") + loader = CSVLoader(file_path=str(file), encoding=encoding) + docs = loader.load() + + processed_docs = prepare_restaurant_documents(docs) + print(f"처리된 문서 수: {len(processed_docs)}") + + # 문서 임베딩 생성 + # embedded_documents = [ + # (doc, embeddings.embed_query(doc.page_content)) for doc in tqdm(processed_docs, desc="Embedding 중") + # ] + + # FAISS 벡터DB 생성 + new_vectorstore = FAISS.from_documents( + documents=processed_docs, embedding=OpenAIEmbeddings() + ) + # new_vectorstore = FAISS.from_embeddings( + # texts=[doc.page_content for doc, _ in embedded_documents], # ✅ 문서 텍스트 추가 + # embeddings=[embed for _, embed in embedded_documents], # ✅ 임베딩 벡터 추가 + # embedding=embeddings # ✅ OpenAIEmbeddings 인스턴스 추가 + # ) + + # 기존 벡터DB와 병합 + if vectorstore: + vectorstore.merge_from(new_vectorstore) + else: + vectorstore = new_vectorstore + + for file in split_files: + # 임시 파일 삭제 + os.remove(file) + + # 벡터DB 저장 + if vectorstore: + vectorstore.save_local(str(vectordb_path)) + print(f"벡터 DB 저장 완료: {vectordb_path}") + else: + print("⚠️ 벡터DB 저장할 데이터가 없습니다!") + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("⚠️ 파일 이름을 입력해주세요") + exit(1) + if len(sys.argv) < 3: + print("⚠️ 저장될 벡터저장소 이름을 입력해주세요") + exit(1) + + data_path = sys.argv[1] + index_name = sys.argv[2] + 
create_vectordb(data_path=data_path, index_name=index_name) diff --git a/srcs/ai-preprocessing/project/script/faiss_index_check.py b/srcs/ai-preprocessing/project/script/faiss_index_check.py new file mode 100644 index 0000000..494f5ed --- /dev/null +++ b/srcs/ai-preprocessing/project/script/faiss_index_check.py @@ -0,0 +1,39 @@ +import os +import sys +from tqdm import tqdm +from pathlib import Path +from dotenv import load_dotenv +from langchain_community.document_loaders.csv_loader import CSVLoader +from langchain_community.vectorstores import FAISS +from langchain_openai import OpenAIEmbeddings +from langchain.schema import Document +import pandas as pd + +# 환경변수 로드 +load_dotenv() + +def print_vectordb_info(data_path: str | Path, encoding: str = "utf-8") -> None: + """벡터 DB를 생성하고 저장""" + project_root = Path(__file__).parent.parent + data_path = project_root / data_path + + if not data_path.exists(): + raise FileNotFoundError(f"데이터 파일을 찾을 수 없습니다: {data_path}") + + # 기존 벡터DB 로드 (이미 있는 경우) + embeddings = OpenAIEmbeddings() + vectorstore = None + + if data_path.exists(): + print("기존 벡터DB를 로드합니다...") + try: + vectorstore = FAISS.load_local(str(data_path), embeddings, allow_dangerous_deserialization=True) + except Exception as e: + print(f"벡터DB 로드 실패: {e}") + vectorstore = None + print(vectorstore.index.ntotal) + + +if __name__ == "__main__": + data_path = sys.argv[1] + print_vectordb_info(data_path=data_path) diff --git a/srcs/ai-server/Dockerfile b/srcs/ai-server/Dockerfile index a0c305f..38e8c9c 100644 --- a/srcs/ai-server/Dockerfile +++ b/srcs/ai-server/Dockerfile @@ -1,22 +1,11 @@ -# 베이스 이미지로 python:3.11-slim 사용 -FROM python:3.11-slim -# FROM langchain/langchain:latest +FROM jeongtj/langchain-rag -# 작업 디렉토리 생성 및 설정 WORKDIR /project -# 종속성 설치를 위해 requirements.txt 복사 및 설치 COPY requirements.txt /project/ -RUN apt-get update && apt-get install -y build-essential -RUN pip install --upgrade pip -RUN pip install -r 
https://raw.githubusercontent.com/teddylee777/langchain-kr/main/requirements.txt +COPY entrypoint.sh /home/entrypoint.sh RUN pip install -r requirements.txt - -# 애플리케이션 코드 복사 -# COPY ./app /app - -COPY entrypoint.sh /home/entrypoint.sh RUN chmod +x /home/entrypoint.sh # 컨테이너의 8000 포트를 외부에 노출 diff --git a/srcs/ai-server/project/app/services/base.py b/srcs/ai-server/project/app/services/base.py index 9ac73e4..f0b7fab 100644 --- a/srcs/ai-server/project/app/services/base.py +++ b/srcs/ai-server/project/app/services/base.py @@ -6,11 +6,11 @@ class BaseService: def __init__( - self, vectordb_name: str, model_name: str = "gpt-4", temperature: float = 0.2 + self, vectordb_name: str, model_name: str = "gpt-4o-mini", temperature: float = 0.2 ): self.vectorstore = load_vectordb(vectordb_name) - self.retriever = self.vectorstore.as_retriever() - self.llm = ChatOpenAI(model_name=model_name, temperature=temperature) + self.retriever = self.vectorstore.as_retriever(search_kwargs={"k": 20}) + self.llm = ChatOpenAI(model_name=model_name, temperature=temperature, max_tokens=8192) async def process_query(self, query: str, prompt_template: str) -> Dict[str, Any]: raise NotImplementedError