diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..433cc7d --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +# Ignore all CSV files +*.csv + +# Recommended ignores for Python projects +__pycache__/ +*.py[cod] +*.pyo +*.pyd +*.so +*.egg +*.egg-info/ +dist/ +build/ +*.log + +# Ignore Jupyter Notebook checkpoints +.ipynb_checkpoints/ + +# Ignore virtual environments +venv/ +env/ +ENV/ +.venv/ +.env/ + +# Ignore OS generated files +.DS_Store +Thumbs.db + +# Ignore IDE/editor folders +.vscode/ +.idea/ +*.swp +*.swo \ No newline at end of file diff --git a/fastapi_reco/app/__pycache__/main.cpython-312.pyc b/fastapi_reco/app/__pycache__/main.cpython-312.pyc index e4ee319..f2cf1fb 100644 Binary files a/fastapi_reco/app/__pycache__/main.cpython-312.pyc and b/fastapi_reco/app/__pycache__/main.cpython-312.pyc differ diff --git a/fastapi_reco/app/main.py b/fastapi_reco/app/main.py index 33ffafe..3c108db 100644 --- a/fastapi_reco/app/main.py +++ b/fastapi_reco/app/main.py @@ -1,30 +1,25 @@ from fastapi import FastAPI, Query -from app.recommender_content import ContentRecommender -from app.recommender_als import ALSRecommender -from app.logger import save_log +from typing import List +from pydantic import BaseModel -app = FastAPI() # ✅ 반드시 필요! +from recommender.als import ALSRecommender +from recommender.content import ContentRecommender +from recommender.hybrid import hybrid_recommend -content_model = ContentRecommender() -als_model = ALSRecommender() +app = FastAPI() -@app.get("/recommend/hybrid") -def hybrid_recommend(user_id: int = Query(...), post_id: int = Query(...), top_k: int = 5): - als_result = als_model.recommend(user_id, top_k) - content_result = content_model.recommend(post_id, top_k) +als_model = ALSRecommender(csv_path="mock_data/user_post.csv") +content_model = ContentRecommender(csv_path="mock_data/post_tags.csv") - result = { - "recommendations": { - "for_you": als_result, - "similar_to_this": content_result - } - } +class RecommendationResult(BaseModel): + post_id: int + score: float + reason: str - save_log({ - "user_id": user_id, - "post_id": post_id, - "top_k": top_k, - "result": result - }) - - return result +@app.get("/recommend", response_model=List[RecommendationResult]) +def recommend(user_email: str = Query(...), top_k: int = Query(5)): + try: + results = hybrid_recommend(user_email, als_model, content_model, top_k=top_k) + return results + except Exception as e: + return [] diff --git a/fastapi_reco/app/post_tags.csv b/fastapi_reco/app/post_tags.csv deleted file mode 100644 index c26e61f..0000000 --- a/fastapi_reco/app/post_tags.csv +++ /dev/null @@ -1,12 +0,0 @@ -post_id,title,content,tags -101,감성 인테리어,아늑한 공간을 만들었어요,"감성,사진,포근" -102,미니멀 침실,물건을 최소화한 정리 정돈,"미니멀,심플,화이트" -103,힐링 소파,편안한 소파와 조명으로 하루 마무리,"힐링,감성,휴식" -104,레트로 감성,옛 감성 가득한 방을 꾸며봤어요,"레트로,빈티지,아날로그" -105,카페 분위기,집에서도 카페 느낌을 내는 방법,"카페,인테리어,조명" -106,레트로 감성,옛 감성 가득한 방을 꾸며봤어요,"레트로,빈티지,아날로그" -107,미니멀 침실,물건을 최소화한 정리 정돈,"미니멀,심플,화이트" -108,힐링 소파,편안한 소파와 조명으로 하루 마무리,"힐링,감성,휴식" -109,감성 인테리어,아늑한 공간을 만들었어요,"감성,사진,포근" -110,미니멀 침실,물건을 최소화한 정리 정돈,"미니멀,심플,화이트" - diff --git a/fastapi_reco/app/recommender/als.py b/fastapi_reco/app/recommender/als.py new file mode 100644 index 0000000..8e40832 --- /dev/null +++ b/fastapi_reco/app/recommender/als.py @@ -0,0 +1,111 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import LabelEncoder, MinMaxScaler +from scipy.sparse import csr_matrix +from implicit.als import AlternatingLeastSquares +from typing import List, Dict, Any + +class ALSRecommender: + def __init__(self, csv_path="app/user_post.csv"): + df = pd.read_csv(csv_path) + df["prefer"] = df["prefer"].fillna(1).clip(lower=1) + + self.user_encoder = LabelEncoder() + self.item_encoder = LabelEncoder() + df["user_id"] = self.user_encoder.fit_transform(df["user_email"]) + df["item_id"] = self.item_encoder.fit_transform(df["post_id"]) + + self.df = df + self.user_ids = df["user_id"].unique() + self.scaler = MinMaxScaler() + + matrix = csr_matrix(( + df["prefer"].astype(np.float32), + (df["item_id"], df["user_id"]) + )) + + self.model = AlternatingLeastSquares( + factors=32, # 잠재 요인 수 증가 + regularization=0.1, + iterations=50, + use_gpu=False + ) + self.model.fit(matrix) + self.user_item_matrix = matrix.T.tocsr() + + def _get_recommendation_reason(self, user_id: int, item_id: int) -> str: + """추천 이유를 생성합니다.""" + user_items = self.user_item_matrix[user_id] + similar_items = self.model.similar_items(item_id, N=3) + + if len(similar_items) > 0: + return f"이 게시물은 당신이 관심을 보인 다른 게시물들과 유사합니다." + return "이 게시물은 당신의 관심사와 잘 맞습니다." + + def _ensure_diversity(self, recommendations: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]: + """추천 결과의 다양성을 보장합니다.""" + if len(recommendations) <= top_k: + return recommendations + + # 점수 기반 정렬 + sorted_recs = sorted(recommendations, key=lambda x: x["score"], reverse=True) + + # 상위 결과는 유지하고 나머지에서 다양하게 선택 + diverse_recs = sorted_recs[:top_k//2] + remaining = sorted_recs[top_k//2:] + + # 나머지에서 랜덤하게 선택 + if remaining: + diverse_recs.extend(np.random.choice(remaining, + size=min(len(remaining), top_k - len(diverse_recs)), + replace=False)) + + return diverse_recs + + def recommend(self, user_id: int, top_k: int = 5) -> List[Dict[str, Any]]: + if user_id not in self.user_ids: + return [] + + user_items = self.user_item_matrix[user_id] + num_seen = user_items.getnnz() + num_total = self.user_item_matrix.shape[1] + num_unseen = num_total - num_seen + + if num_unseen <= 0: + return [] + + safe_k = min(top_k * 2, num_unseen) # 더 많은 후보를 생성 + + try: + item_ids, scores = self.model.recommend( + user_id, + user_items, + N=safe_k, + filter_already_liked_items=True + ) + + # 점수 정규화 + scores = self.scaler.fit_transform(scores.reshape(-1, 1)).flatten() + + results = [] + for item_id, score in zip(item_ids, scores): + if np.isnan(score) or score < 0: + continue + + post_id = self.item_encoder.inverse_transform([item_id])[0] + reason = self._get_recommendation_reason(user_id, item_id) + + results.append({ + "post_id": int(post_id), + "score": round(float(score), 3), + "reason": reason + }) + + # 다양성 보장 + results = self._ensure_diversity(results, top_k) + + return results + + except Exception as e: + print(f"⚠️ ALS 추천 오류: {e}") + return [] diff --git a/fastapi_reco/app/recommender/content.py b/fastapi_reco/app/recommender/content.py new file mode 100644 index 0000000..c75fda4 --- /dev/null +++ b/fastapi_reco/app/recommender/content.py @@ -0,0 +1,64 @@ +import pandas as pd +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity +from typing import List, Dict, Any +from sklearn.preprocessing import MinMaxScaler + +class ContentRecommender: + def __init__(self, csv_path="app/post_tags.csv"): + df = pd.read_csv(csv_path).fillna("") + df["tags"] = df["tags"].apply(lambda x: x.replace(",", " ")) + df["full_text"] = df["title"] + " " + df["content"] + " " + df["tags"] + + self.df = df + self.post_ids = df["post_id"].values + self.vectorizer = TfidfVectorizer( + max_features=5000, + ngram_range=(1, 2), + min_df=2 + ) + self.X = self.vectorizer.fit_transform(df["full_text"]) + self.similarity_matrix = cosine_similarity(self.X) + self.scaler = MinMaxScaler() + + def _get_common_tags(self, post_id: int, similar_post_id: int) -> List[str]: + """두 게시물 간의 공통 태그를 찾습니다.""" + post_tags = set(self.df[self.df["post_id"] == post_id]["tags"].iloc[0].split()) + similar_post_tags = set(self.df[self.df["post_id"] == similar_post_id]["tags"].iloc[0].split()) + return list(post_tags.intersection(similar_post_tags)) + + def _get_recommendation_reason(self, post_id: int, similar_post_id: int) -> str: + """추천 이유를 생성합니다.""" + common_tags = self._get_common_tags(post_id, similar_post_id) + if common_tags: + return f"이 게시물은 다음 태그들을 공유합니다: {', '.join(common_tags[:3])}" + return "이 게시물은 비슷한 주제를 다루고 있습니다." + + def recommend(self, post_id: int, top_k: int = 3) -> List[Dict[str, Any]]: + if post_id not in self.post_ids: + return [] + + idx = list(self.post_ids).index(post_id) + sim_scores = list(enumerate(self.similarity_matrix[idx])) + sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) + sim_scores = [s for s in sim_scores if s[0] != idx][:top_k * 2] # 더 많은 후보 생성 + + # 유사도 점수 정규화 + scores = np.array([score for _, score in sim_scores]) + scores = self.scaler.fit_transform(scores.reshape(-1, 1)).flatten() + + results = [] + for (i, _), score in zip(sim_scores, scores): + similar_post_id = int(self.df.iloc[i]["post_id"]) + reason = self._get_recommendation_reason(post_id, similar_post_id) + + results.append({ + "post_id": similar_post_id, + "title": self.df.iloc[i]["title"], + "similarity": round(float(score), 3), + "reason": reason + }) + + # 상위 결과만 반환 + return results[:top_k] diff --git a/fastapi_reco/app/recommender/hybrid.py b/fastapi_reco/app/recommender/hybrid.py new file mode 100644 index 0000000..070f6b4 --- /dev/null +++ b/fastapi_reco/app/recommender/hybrid.py @@ -0,0 +1,27 @@ +from recommender.als import ALSRecommender +from recommender.content import ContentRecommender + + +def hybrid_recommend(user_email, als: ALSRecommender, content: ContentRecommender, top_k=5): + user_id = als.user_encoder.transform([user_email])[0] + als_recs = als.recommend(user_id, top_k=top_k*2) + + seen_post_ids = set() + final_recs = [] + + for rec in als_recs: + post_id = rec["post_id"] + if post_id in seen_post_ids: + continue + + content_recs = content.recommend(post_id, top_k=1) + if content_recs: + enriched_reason = content_recs[0]["reason"] + rec["reason"] += f" ({enriched_reason})" + + final_recs.append(rec) + seen_post_ids.add(post_id) + if len(final_recs) >= top_k: + break + + return final_recs diff --git a/fastapi_reco/app/recommender_als.py b/fastapi_reco/app/recommender_als.py deleted file mode 100644 index 094b94a..0000000 --- a/fastapi_reco/app/recommender_als.py +++ /dev/null @@ -1,69 +0,0 @@ -import pandas as pd -import numpy as np -from sklearn.preprocessing import LabelEncoder -from scipy.sparse import csr_matrix -from implicit.als import AlternatingLeastSquares - -class ALSRecommender: - def __init__(self, csv_path="app/user_post.csv"): - df = pd.read_csv(csv_path) - df["prefer"] = df["prefer"].fillna(1).clip(lower=1) - - self.user_encoder = LabelEncoder() - self.item_encoder = LabelEncoder() - df["user_id"] = self.user_encoder.fit_transform(df["user_email"]) - df["item_id"] = self.item_encoder.fit_transform(df["post_id"]) - - self.df = df - self.user_ids = df["user_id"].unique() - - matrix = csr_matrix(( - df["prefer"].astype(np.float32), - (df["item_id"], df["user_id"]) - )) - - self.model = AlternatingLeastSquares( - factors=20, - regularization=0.1, - iterations=50, - use_gpu=False - ) - self.model.fit(matrix) - self.user_item_matrix = matrix.T.tocsr() - - def recommend(self, user_id: int, top_k: int = 5): - if user_id not in self.user_ids: - return [] - - user_items = self.user_item_matrix[user_id] - num_seen = user_items.getnnz() - num_total = self.user_item_matrix.shape[1] - num_unseen = num_total - num_seen - - if num_unseen <= 0: - return [] - - safe_k = min(top_k, num_unseen) - - try: - item_ids, scores = self.model.recommend( - user_id, - user_items, - N=safe_k, - filter_already_liked_items=True - ) - except Exception as e: - print(f"⚠️ ALS 추천 오류: {e}") - return [] - - results = [] - for item_id, score in zip(item_ids, scores): - if np.isnan(score) or score < -1e+30: - continue - post_id = self.item_encoder.inverse_transform([item_id])[0] - results.append({ - "post_id": int(post_id), - "score": round(float(score), 3) - }) - - return results diff --git a/fastapi_reco/app/recommender_content.py b/fastapi_reco/app/recommender_content.py deleted file mode 100644 index 773625a..0000000 --- a/fastapi_reco/app/recommender_content.py +++ /dev/null @@ -1,33 +0,0 @@ -import pandas as pd -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.metrics.pairwise import cosine_similarity - -class ContentRecommender: - def __init__(self, csv_path="app/post_tags.csv"): - df = pd.read_csv(csv_path).fillna("") - df["tags"] = df["tags"].apply(lambda x: x.replace(",", " ")) - df["full_text"] = df["title"] + " " + df["content"] + " " + df["tags"] - - self.df = df - self.post_ids = df["post_id"].values - self.vectorizer = TfidfVectorizer() - self.X = self.vectorizer.fit_transform(df["full_text"]) - self.similarity_matrix = cosine_similarity(self.X) - - def recommend(self, post_id: int, top_k: int = 3): - if post_id not in self.post_ids: - return [] - - idx = list(self.post_ids).index(post_id) - sim_scores = list(enumerate(self.similarity_matrix[idx])) - sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) - sim_scores = [s for s in sim_scores if s[0] != idx][:top_k] - - return [ - { - "post_id": int(self.df.iloc[i]["post_id"]), - "title": self.df.iloc[i]["title"], - "similarity": round(score, 3) - } - for i, score in sim_scores - ] diff --git a/fastapi_reco/app/user_post.csv b/fastapi_reco/app/user_post.csv deleted file mode 100644 index 5862eaa..0000000 --- a/fastapi_reco/app/user_post.csv +++ /dev/null @@ -1,14 +0,0 @@ -user_email,post_id,prefer -alice@example.com,101,1 -alice@example.com,102,1 -alice@example.com,103,1 -bob@example.com,101,1 -bob@example.com,103,1 -bob@example.com,104,1 -bob@example.com,108,1 -bob@example.com,109,1 -carol@example.com,104,1 -carol@example.com,106,1 -carol@example.com,108,1 -carol@example.com,109,1 -carol@example.com,110,1 diff --git a/fastapi_reco/logs/recommendation_log.jsonl b/fastapi_reco/logs/recommendation_log.jsonl deleted file mode 100644 index 8f9f65f..0000000 --- a/fastapi_reco/logs/recommendation_log.jsonl +++ /dev/null @@ -1,6 +0,0 @@ -{"user_id": 0, "post_id": 1, "top_k": 5, "result": {"recommendations": {"for_you": [{"post_id": 103, "score": -3.4028234663852886e+38}, {"post_id": 102, "score": -3.4028234663852886e+38}, {"post_id": 101, "score": -3.4028234663852886e+38}, {"post_id": 101, "score": 0.0}, {"post_id": 101, "score": 0.0}], "similar_to_this": []}}, "timestamp": "2025-04-09T14:18:18.124781"} -{"user_id": 0, "post_id": 1, "top_k": 5, "result": {"recommendations": {"for_you": [{"post_id": 101, "score": 0.0}, {"post_id": 101, "score": 0.0}], "similar_to_this": []}}, "timestamp": "2025-04-09T14:20:38.110008"} -{"user_id": 0, "post_id": 0, "top_k": 3, "result": {"recommendations": {"for_you": [], "similar_to_this": []}}, "timestamp": "2025-04-09T14:21:54.074620"} -{"user_id": 0, "post_id": 1, "top_k": 3, "result": {"recommendations": {"for_you": [], "similar_to_this": []}}, "timestamp": "2025-04-09T14:21:58.575812"} -{"user_id": 0, "post_id": 2, "top_k": 3, "result": {"recommendations": {"for_you": [], "similar_to_this": []}}, "timestamp": "2025-04-09T14:22:01.396550"} -{"user_id": 0, "post_id": 2, "top_k": 5, "result": {"recommendations": {"for_you": [{"post_id": 101, "score": 0.0}, {"post_id": 101, "score": 0.0}], "similar_to_this": []}}, "timestamp": "2025-04-09T14:22:03.668853"}