From 8320b2ef8be8d63abbaf17c6cf9e2137ac8f2b43 Mon Sep 17 00:00:00 2001 From: Evaclaire Wamitu Date: Mon, 29 Jul 2024 09:55:06 +0300 Subject: [PATCH] Add hybrid model --- movie_recommendor.ipynb | 733 ++++++++++++++-------------------------- 1 file changed, 259 insertions(+), 474 deletions(-) diff --git a/movie_recommendor.ipynb b/movie_recommendor.ipynb index d36f0af..2a3d64a 100644 --- a/movie_recommendor.ipynb +++ b/movie_recommendor.ipynb @@ -120,10 +120,9 @@ "id": "f28f3f7d-5712-458c-bc61-7730778d795e", "metadata": {}, "source": [ - "1. To build a collaborative filtering model using user ratings to generate top 5 movie recommendations, leveraging algorithms such as Singular Value Decomposition (SVD) and k-Nearest Neighbors (k-NN).\n", - "2. To address the cold start problem for new users by integrating content-based filtering, utilizing features such as movie genres, directors, and cast.\n", - "3. To evaluate the hybrid recommendation system using appropriate metrics like Root Mean Square Error (RMSE), Mean Average Precision (MAP), and Normalized Discounted Cumulative Gain (NDCG) to ensure accuracy and relevance of the recommendations.\n", - "\n" + "1. To build a collaborative filtering model using user ratings to generate top 5 movie recommendations leveraging algorithms such as Singular Value Decomposition (SVD) and k-Nearest Neighbors (k-NN).\n", + "2. To address the cold start problem for new users by developing content-based filtering, utilizing features such as movie genres and titles.\n", + "3. To build a hybrid recommendation system using the collaborative and content based filtering models and evaluate it using metrics such as Root Mean Square Error (RMSE) and Mean Average Precision (MAP) to ensure accuracy and relevance of the recommendations." ] }, { @@ -134,10 +133,7 @@ "## Success Metrics\n", "\n", "1. Root Mean Square Error (RMSE) < 0.9 for rating predictions\n", - "2. Mean Average Precision @5 (MAP@5) > 0.3 for recommended movies\n", - "3. 
Precision@5 of around 0.2 to 0.5\n", - "4. Recall@5 of around 0.2 to 0.5\n", - "5. F1 Score of around 0.3 to 0.7" + "2. Mean Average Precision @k (MAP@k) > 0.3 for recommended movies where k = 5\n" ] }, { @@ -2720,7 +2716,7 @@ "id": "0c954d06-e38c-447b-901f-d4bdd36aad59", "metadata": {}, "source": [ - "The output reveals the results of tuning and evaluating recommendation models using grid search. For the SVD model, the optimal parameters were identified as having 100 factors, 30 epochs, a learning rate of 0.01 and regularization of 0.1 achieving the best RMSE of approximately 0.862. In contrast, the KNN model required extensive computation of similarity matrices for various configurations including Pearson, cosine and MSD (Mean Squared Difference) similarities. The best parameters for the KNN model were found to be 30 neighbors, a minimum of 10 neighbors and using the MSD similarity metric with a non-user-based approach resulting in a higher RMSE of about 0.917. Consequently, the SVD model emerged as the superior choice with the lowest RMSE and we therefore selected it as the best model overall." + "The output reveals the results of tuning and evaluating recommendation models using grid search. For the SVD model, the optimal parameters were identified as having 100 factors, 30 epochs, a learning rate of 0.01 and regularization of 0.1 achieving the best RMSE of approximately 0.862. In contrast, the KNN model required extensive computation of similarity matrices for various configurations including Pearson, cosine and MSD (Mean Squared Difference) similarities. The best parameters for the KNN model were found to be 30 neighbors, a minimum of 10 neighbors and using the cosine similarity metric with a non-user-based approach resulting in a higher RMSE of about 0.917. Consequently, the SVD model emerged as the superior choice with the lowest RMSE and we therefore selected it as the best model overall." 
] }, { @@ -2983,90 +2979,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "57e1252e-44b5-4605-a71a-ff86b4496b3a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "ddc7d36d-6264-4ba1-a986-f6d9b59c2193", + "cell_type": "markdown", + "id": "8f9045e0-ca85-4ebf-a75d-a68c2a73006f", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
titlegenres
100813Black Butler: Book of the Atlanticaction, animation, comedy, fantasy
100814No Game No Life: Zeroanimation, comedy, fantasy
100815Flintdrama
100816Bungo Stray Dogs: Dead Appleaction, animation
100817Andrew Dice Clay: Dice Rulescomedy
\n", - "
" - ], - "text/plain": [ - " title genres\n", - "100813 Black Butler: Book of the Atlantic action, animation, comedy, fantasy\n", - "100814 No Game No Life: Zero animation, comedy, fantasy\n", - "100815 Flint drama\n", - "100816 Bungo Stray Dogs: Dead Apple action, animation\n", - "100817 Andrew Dice Clay: Dice Rules comedy" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "content_df.tail()" + "### Summary" ] }, { @@ -3085,6 +3002,14 @@ "## Content Based" ] }, + { + "cell_type": "markdown", + "id": "5682dd7a-ffa5-4238-828d-1eda7c95e082", + "metadata": {}, + "source": [ + "A content based recommender system is a type of recommendation algorithm that suggests items to users based on the characteristics or features of the items they have previously liked or interacted with. It analyzes the content or attributes of items (such as movies, books, or products) to find similarities and make recommendations. " + ] + }, { "cell_type": "code", "execution_count": 28, @@ -3176,441 +3101,301 @@ }, { "cell_type": "markdown", - "id": "fcd47967-e4a5-4915-a4ad-5ba0ebf8cae4", + "id": "0e1fd17e-6eb7-49f5-9bb5-b6cb231f45b5", "metadata": {}, "source": [ - "## Hybrid System" + "### Summary" ] }, { "cell_type": "markdown", - "id": "c07a19f0-fcde-450c-8fd0-eac2c0eac9f7", - "metadata": {}, - "source": [ - "The `HybridRecommender` class combines two types of recommendation systems namely content-based and collaborative filtering into a hybrid model. This class takes in two datasets (one for each recommendation model) and provides a unified recommendation list based on both models." 
- ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "8867a083-b7bc-4f29-b78a-7a42dd3ca443", + "id": "ba4d1bad-c35d-4932-bdf9-4ded5bd19984", "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "\"['rating', 'user_id'] not in index\"", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[71], line 99\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 95\u001b[0m \u001b[38;5;66;03m# # Load movie data\u001b[39;00m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;66;03m# df = pd.read_csv('movies_data.csv') # Update with the correct path\u001b[39;00m\n\u001b[1;32m 98\u001b[0m recommender \u001b[38;5;241m=\u001b[39m HybridMovieRecommender(df)\n\u001b[0;32m---> 99\u001b[0m \u001b[43mrecommender\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_collaborative_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 100\u001b[0m recommender\u001b[38;5;241m.\u001b[39mtrain_content_based_model()\n\u001b[1;32m 102\u001b[0m \u001b[38;5;66;03m# Get hybrid recommendations for a sample user\u001b[39;00m\n", - "Cell \u001b[0;32mIn[71], line 10\u001b[0m, in \u001b[0;36mHybridMovieRecommender.train_collaborative_model\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtrain_collaborative_model\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 9\u001b[0m reader \u001b[38;5;241m=\u001b[39m Reader(rating_scale\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m5\u001b[39m))\n\u001b[0;32m---> 10\u001b[0m data \u001b[38;5;241m=\u001b[39m 
Dataset\u001b[38;5;241m.\u001b[39mload_from_df(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43muser_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmovieId\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m, reader)\n\u001b[1;32m 11\u001b[0m trainset, _ \u001b[38;5;241m=\u001b[39m train_test_split(data, test_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.2\u001b[39m)\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcollaborative_model \u001b[38;5;241m=\u001b[39m SVD()\n", - "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/frame.py:2908\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2906\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 2907\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 2908\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_listlike_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 2910\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean 
indexers\u001b[39;00m\n\u001b[1;32m 2911\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n", - "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1254\u001b[0m, in \u001b[0;36m_LocIndexer._get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1251\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1252\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m ax\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 1254\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_read_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mraise_missing\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m keyarr, indexer\n", - "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1304\u001b[0m, in \u001b[0;36m_LocIndexer._validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1302\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m raise_missing:\n\u001b[1;32m 1303\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(key) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mset\u001b[39m(ax))\n\u001b[0;32m-> 1304\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1306\u001b[0m \u001b[38;5;66;03m# we skip the warning on Categorical\u001b[39;00m\n\u001b[1;32m 1307\u001b[0m \u001b[38;5;66;03m# as this check is actually done (check for\u001b[39;00m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;66;03m# non-missing values), but a bit later in the\u001b[39;00m\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;66;03m# code, so we want to avoid warning & then\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \u001b[38;5;66;03m# just raising\u001b[39;00m\n\u001b[1;32m 1311\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ax\u001b[38;5;241m.\u001b[39mis_categorical():\n", - "\u001b[0;31mKeyError\u001b[0m: \"['rating', 'user_id'] not in index\"" - ] - } - ], "source": [ - "class HybridMovieRecommender:\n", - " def __init__(self, df):\n", - " self.df = df\n", - " self.collaborative_model = None\n", - " self.tfidf_matrix = None\n", - " self.cosine_sim = None\n", + "The class above implements a content based movie recommender system using genres as the primary feature. It begins by importing necessary libraries and preparing the dataset ensuring only unique movie titles are retained. The `ContentBasedModel` class encapsulates the core functionality. Within its `train_model` method, movie genres are converted into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency) vectorization creating a matrix of genres. The cosine similarity between all pairs of movies is then calculated based on these TF-IDF representations producing a similarity matrix that identifies movies with similar genres. The `get_recommendations` method uses this similarity matrix to provide the top 5 most similar movies to a given title. 
The code demonstrates the class's usage by creating an instance, training the model and getting recommendations for 'Toy Story'. Finally, it saves the trained model, including the TF-IDF matrix, cosine similarity matrix and movie indices, to a pickle file for future use.\n", + "The recommender system outputs the top 5 movies similar to 'Toy Story' based on genre similarity, i.e. 'Antz', 'Toy Story 2', 'The Adventures of Rocky and Bullwinkle', 'The Emperor's New Groove' and 'Monsters, Inc.'. Each recommendation shares key characteristics with 'Toy Story': they are family-friendly animations with a focus on adventure and comedy. These recommendations are evidently relevant and appealing to fans of 'Toy Story'.\n", "\n", - " def train_collaborative_model(self):\n", - " reader = Reader(rating_scale=(1, 5))\n", - " data = Dataset.load_from_df(self.df[['user_id', 'movieId', 'rating']], reader)\n", - " trainset, _ = train_test_split(data, test_size=0.2)\n", - " self.collaborative_model = SVD()\n", - " self.collaborative_model.fit(trainset)\n", "\n", - " def train_content_based_model(self):\n", - " self.df['clean_title'] = self.df['title'].apply(lambda x: re.sub(\"[^a-zA-Z0-9 ]\", \"\", x).lower())\n", - " self.df['features'] = self.df['clean_title'] + ' ' + self.df['genres']\n", - " \n", - " tfidf = TfidfVectorizer(stop_words='english')\n", - " self.tfidf_matrix = tfidf.fit_transform(self.df['features'])\n", - " self.cosine_sim = cosine_similarity(self.tfidf_matrix)\n", "\n", - " def get_collaborative_recommendations(self, user_id, n=5):\n", - " user_movies = self.df[self.df['user_id'] == user_id]['movieId'].unique()\n", - " all_movies = self.df['movieId'].unique()\n", - " movies_to_predict = np.setdiff1d(all_movies, user_movies)\n", - " \n", - " predictions = [\n", - " (movie_id, self.collaborative_model.predict(user_id, movie_id).est)\n", - " for movie_id in movies_to_predict\n", - " ]\n", - " \n", - " recommendations = sorted(predictions, key=lambda x: x[1], 
reverse=True)\n", - " return recommendations[:n]\n", "\n", - " def get_content_based_recommendations(self, title, top_n=10):\n", - " idx = self.df.index[self.df['title'] == title].tolist()\n", - " if not idx:\n", - " return []\n", - "\n", - " idx = idx[0]\n", - " sim_scores = list(enumerate(self.cosine_sim[idx]))\n", - " sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n", - " sim_scores = sim_scores[1:top_n+1]\n", - " movie_indices = [i[0] for i in sim_scores]\n", - " \n", - " return self.df[['movieId', 'title', 'genres']].iloc[movie_indices].reset_index(drop=True)\n", "\n", - " def get_hybrid_recommendations(self, user_id, n=10):\n", - " collaborative_recs = self.get_collaborative_recommendations(user_id, n=n)\n", - " \n", - " hybrid_recs = []\n", - " for movie_id, collab_score in collaborative_recs:\n", - " movie = self.df[self.df['movieId'] == movie_id].iloc[0]\n", - " content_recs = self.get_content_based_recommendations(movie['title'], top_n=5)\n", - " content_score = np.mean([self.cosine_sim[movie_id][rec['movieId']] for _, rec in content_recs.iterrows()])\n", - " hybrid_score = 0.7 * collab_score + 0.3 * content_score\n", - " hybrid_recs.append((movie_id, hybrid_score))\n", - " \n", - " hybrid_recs = sorted(hybrid_recs, key=lambda x: x[1], reverse=True)\n", - " return hybrid_recs[:n]\n", - "\n", - " def evaluate_model(self):\n", - " reader = Reader(rating_scale=(1, 5))\n", - " data = Dataset.load_from_df(self.df[['user_id', 'movieId', 'rating']], reader)\n", - " \n", - " # Collaborative Filtering Evaluation\n", - " cv_results = cross_validate(self.collaborative_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)\n", - " \n", - " print(\"Collaborative Filtering Evaluation:\")\n", - " print(f\"RMSE: {np.mean(cv_results['test_rmse']):.4f} (+/- {np.std(cv_results['test_rmse']):.4f})\")\n", - " print(f\"MAE: {np.mean(cv_results['test_mae']):.4f} (+/- {np.std(cv_results['test_mae']):.4f})\")\n", - " \n", - " # Content-Based Filtering 
Evaluation\n", - " sample_size = min(1000, len(self.df))\n", - " sample = self.df.sample(sample_size)\n", - " \n", - " content_based_rmse = []\n", - " content_based_mae = []\n", - " \n", - " for _, row in sample.iterrows():\n", - " recs = self.get_content_based_recommendations(row['title'], top_n=10)\n", - " if not recs.empty:\n", - " pred_ratings = [self.cosine_sim[row['movieId']][rec['movieId']] for _, rec in recs.iterrows()]\n", - " true_rating = row['rating']\n", - " content_based_rmse.append(mean_squared_error([true_rating], pred_ratings, squared=False))\n", - " content_based_mae.append(mean_absolute_error([true_rating], pred_ratings))\n", - " \n", - " print(\"\\nContent-Based Filtering Evaluation:\")\n", - " print(f\"RMSE: {np.mean(content_based_rmse):.4f} (+/- {np.std(content_based_rmse):.4f})\")\n", - " print(f\"MAE: {np.mean(content_based_mae):.4f} (+/- {np.std(content_based_mae):.4f})\")\n", - "\n", - "# Example usage\n", - "if __name__ == \"__main__\":\n", - " # # Load movie data\n", - " # df = pd.read_csv('movies_data.csv') # Update with the correct path\n", - " \n", - " recommender = HybridMovieRecommender(df)\n", - " recommender.train_collaborative_model()\n", - " recommender.train_content_based_model()\n", - " \n", - " # Get hybrid recommendations for a sample user\n", - " sample_user_id = 1\n", - " recommendations = recommender.get_hybrid_recommendations(sample_user_id, n=5)\n", - " \n", - " print(f\"Hybrid Recommendations for User {sample_user_id}:\")\n", - " for movie_id, score in recommendations:\n", - " movie = df[df['movieId'] == movie_id].iloc[0]\n", - " print(f\"{movie['title']} (Score: {score:.2f})\")\n", - " \n", - " # Evaluate the model\n", - " recommender.evaluate_model()" + "\n" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "ee92de36-5476-4199-a1f0-2ce55ec1e33b", + "cell_type": "markdown", + "id": "fcd47967-e4a5-4915-a4ad-5ba0ebf8cae4", "metadata": {}, - "outputs": [], "source": [ - "import pandas as pd\n", - "\n", - 
"class HybridRecommender:\n", - " def __init__(self, content_df, collab_df):\n", - " self.content_model = ContentBasedModel(content_df)\n", - " self.collab_model = CollabBasedModel(collab_df)\n", - " self.content_model.train_model()\n", - " self.collab_model.train_model()\n", - "\n", - " def get_hybrid_recommendations(self, title, user_ratings, top_n=5, alpha=0.5):\n", - " # Get recommendations from both models\n", - " content_recs = self.content_model.get_recommendations(title, top_n=top_n)\n", - " collab_recs = self.collab_model.get_recommendations(user_ratings, n=top_n)\n", - " \n", - " # Convert collaborative recommendations to DataFrame\n", - " collab_recs_df = pd.DataFrame(collab_recs, columns=['movieId', 'predicted_rating'])\n", - " \n", - " # Merge content-based recommendations with collaborative predictions\n", - " recommendations = content_recs.merge(collab_recs_df, left_on='movieId', right_on='movieId', how='left')\n", - " \n", - " # Normalize and weight recommendations\n", - " recommendations['final_score'] = (alpha * recommendations['predicted_rating']) + ((1 - alpha) * recommendations['cosine_sim'])\n", - " recommendations = recommendations.sort_values(by='final_score', ascending=False)\n", - " \n", - " return recommendations[['title', 'genres']].head(top_n).reset_index(drop=True)\n", - "\n", - "# Example usage\n", - "if __name__ == \"__main__\":\n", - " content_df = pd.read_csv('content_movies.csv') # Replace with actual path\n", - " collab_df = pd.read_csv('collab_ratings.csv') # Replace with actual path\n", - "\n", - " hybrid_recommender = HybridRecommender(content_df, collab_df)\n", - " user_ratings = [(1, 5), (2, 4)] # Example user ratings\n", - " recommendations = hybrid_recommender.get_hybrid_recommendations('Sommersby', user_ratings, top_n=5)\n", - "\n", - " print(\"Hybrid Recommendations:\")\n", - " print(recommendations)\n" + "## Hybrid System" + ] + }, + { + "cell_type": "markdown", + "id": "c07a19f0-fcde-450c-8fd0-eac2c0eac9f7", + 
"metadata": {}, + "source": [ + "The `HybridModel` class combines two types of recommendation systems, namely content-based and collaborative filtering, into a hybrid model. This class takes in two datasets (one for each recommendation model) and provides a unified recommendation list based on both models." ] }, { "cell_type": "code", - "execution_count": 66, - "id": "4d918c38-f2d3-4d13-b0dc-4d6c36a57493", + "execution_count": 49, + "id": "29fa9bcb-c197-453e-b5f5-0bf14cc8943b", "metadata": {}, "outputs": [ { - "ename": "KeyError", - "evalue": "\"['rating', 'user_id'] not in index\"", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[66], line 85\u001b[0m\n\u001b[1;32m 82\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmovies_data/movies.csv\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;66;03m# Update with the correct path\u001b[39;00m\n\u001b[1;32m 84\u001b[0m recommender \u001b[38;5;241m=\u001b[39m HybridMovieRecommender(df)\n\u001b[0;32m---> 85\u001b[0m \u001b[43mrecommender\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_models\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;66;03m# Get hybrid recommendations for a sample user\u001b[39;00m\n\u001b[1;32m 88\u001b[0m sample_user_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", - "Cell \u001b[0;32mIn[66], line 13\u001b[0m, in \u001b[0;36mHybridMovieRecommender.train_models\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtrain_models\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtrain_content_based_model()\n\u001b[0;32m---> 13\u001b[0m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_collaborative_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[66], line 25\u001b[0m, in \u001b[0;36mHybridMovieRecommender.train_collaborative_model\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtrain_collaborative_model\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 24\u001b[0m reader \u001b[38;5;241m=\u001b[39m Reader(rating_scale\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m5\u001b[39m))\n\u001b[0;32m---> 25\u001b[0m data \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mload_from_df(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43muser_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmovieId\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m, reader)\n\u001b[1;32m 26\u001b[0m trainset, _ \u001b[38;5;241m=\u001b[39m train_test_split(data, test_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.2\u001b[39m)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcf_model \u001b[38;5;241m=\u001b[39m SVD()\n", - "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/frame.py:2908\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2906\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 2907\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 2908\u001b[0m indexer 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_listlike_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 2910\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m 2911\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n", - "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1254\u001b[0m, in \u001b[0;36m_LocIndexer._get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1251\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1252\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m ax\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 1254\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_read_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mraise_missing\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m keyarr, indexer\n", - "File 
\u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1304\u001b[0m, in \u001b[0;36m_LocIndexer._validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1302\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m raise_missing:\n\u001b[1;32m 1303\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(key) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mset\u001b[39m(ax))\n\u001b[0;32m-> 1304\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1306\u001b[0m \u001b[38;5;66;03m# we skip the warning on Categorical\u001b[39;00m\n\u001b[1;32m 1307\u001b[0m \u001b[38;5;66;03m# as this check is actually done (check for\u001b[39;00m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;66;03m# non-missing values), but a bit later in the\u001b[39;00m\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;66;03m# code, so we want to avoid warning & then\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \u001b[38;5;66;03m# just raising\u001b[39;00m\n\u001b[1;32m 1311\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ax\u001b[38;5;241m.\u001b[39mis_categorical():\n", - "\u001b[0;31mKeyError\u001b[0m: \"['rating', 'user_id'] not in index\"" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Dick (1999)\n", + "Genre: comedy\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Girl Who Kicked the Hornet's Nest, The (Luftslottet som sprängdes) (2009)\n", + "Genre: action, crime, mystery\n" + ] + }, + { + "name": "stdin", + 
"output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Return to Treasure Island (1988)\n", + "Genre: adventure, animation, comedy\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Center Stage (2000)\n", + "Genre: drama, musical\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): x\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Sleepaway Camp (1983)\n", + "Genre: horror\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Hybrid Recommended movies:\n", + "1. Shawshank Redemption, The (1994) - Hybrid score: 4.20\n", + " Genre: crime, drama\n", + "2. Lawrence of Arabia (1962) - Hybrid score: 4.19\n", + " Genre: adventure, drama, war\n", + "3. Cool Hand Luke (1967) - Hybrid score: 4.14\n", + " Genre: drama\n", + "4. Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) - Hybrid score: 3.63\n", + " Genre: action, adventure\n", + "5. Dr. 
class HybridModel:
    '''
    A hybrid recommender system that combines collaborative filtering and content-based filtering.

    The collaborative model proposes the candidate movies; each candidate is then
    re-scored as a weighted average of its collaborative score and the mean rating
    of its closest content-based neighbour.
    '''

    def __init__(self, collab_df, content_df):
        '''
        Initialize the HybridModel with collaborative and content-based dataframes.

        Args:
            collab_df (pd.DataFrame): Dataframe for collaborative filtering.
            content_df (pd.DataFrame): Dataframe for content-based filtering.
        '''
        self.collab_model = CollabBasedModel(collab_df)
        self.content_model = ContentBasedModel(content_df)

        merged = pd.merge(collab_df, content_df, on='movieId')
        # The per-title mean rating has to be computed BEFORE de-duplicating:
        # afterwards only one (arbitrary) rating row per movie is left, so a
        # "mean" taken from self.df would just be a single user's rating.
        self.mean_rating_by_title = merged.groupby('title')['rating'].mean()
        # One row per movie, used for sampling movies and looking up their details
        self.df = merged.drop_duplicates(subset=['movieId'])

    def train_models(self):
        '''
        Train both collaborative and content-based models.
        '''
        self.collab_model.train_model()
        self.content_model.train_model()

    @staticmethod
    def _prompt_for_rating():
        '''
        Ask the user for a rating until the answer is valid.

        Returns:
            float or None: Rating between 1 and 5, or None if the user entered 'x'.
        '''
        while True:
            answer = input("Rate this movie from 1 to 5 (or 'x' if you haven't watched it): ").strip()
            if answer.lower() == 'x':
                return None
            try:
                rating = float(answer)
            except ValueError:
                rating = None
            # The chained comparison also rejects NaN, which float() accepts
            if rating is not None and 1 <= rating <= 5:
                return rating
            print("Please enter a number between 1 and 5, or 'x' to skip.")

    def get_user_ratings(self, num_movies=5, random_state=None):
        '''
        Get user ratings for a specified number of random movies.

        Args:
            num_movies (int): Number of movies to rate.
            random_state (int, optional): Seed for the movie sample, for reproducible runs.

        Returns:
            list: List of tuples containing movie IDs and ratings.
        '''
        # Draw all movies in one call (without replacement) so that the same
        # movie is never shown twice; clamp to the number of available movies.
        sample_size = max(0, min(num_movies, len(self.df)))
        candidates = self.df.sample(n=sample_size, random_state=random_state)

        user_ratings = []
        for position in range(len(candidates)):
            movie = candidates.iloc[position]

            # Display the movie details to the user
            print(f"\nMovie: {movie['title']} ({movie['release_year']})")
            print(f"Genre: {movie['genres']}")

            # Movies the user has not watched (answer 'x') are skipped
            rating = self._prompt_for_rating()
            if rating is not None:
                user_ratings.append((movie['movieId'], rating))

        return user_ratings

    def _content_score(self, movie_id, fallback):
        '''
        Score a movie by the mean rating of its closest content-based neighbour.

        Args:
            movie_id: ID of the movie proposed by the collaborative model.
            fallback (float): Score to use when no content-based score is available.

        Returns:
            float: Mean rating of the most similar movie, or `fallback`.
        '''
        titles = self.df.loc[self.df['movieId'] == movie_id, 'title']
        if titles.empty:
            # The movie was dropped by the inner merge (no content features)
            return fallback

        # NOTE(review): assumes get_recommendations returns a pandas Series of
        # titles, as implied by the original `.iloc[0]` lookup - confirm.
        content_rec = self.content_model.get_recommendations(titles.iloc[0], k=1)
        if content_rec is None or len(content_rec) == 0:
            return fallback

        score = self.mean_rating_by_title.get(content_rec.iloc[0])
        if score is None or pd.isna(score):
            # The neighbour has no ratings; avoid propagating NaN into the ranking
            return fallback
        return float(score)

    def get_hybrid_recommendations(self, user_ratings, n=5, collab_weight=0.5):
        '''
        Get hybrid recommendations based on user ratings.

        Args:
            user_ratings (list): List of tuples containing movie IDs and ratings.
            n (int): Number of recommendations to return.
            collab_weight (float): Weight for collaborative filtering (0 to 1).

        Returns:
            list: List of tuples containing recommended movie IDs and hybrid scores.

        Raises:
            ValueError: If collab_weight is outside the range 0 to 1.
        '''
        if not 0 <= collab_weight <= 1:
            raise ValueError("collab_weight must be between 0 and 1")

        # The collaborative model proposes the candidates
        collab_recommendations = self.collab_model.get_recommendations(user_ratings, n)
        collab_movie_ids = [rec[0] for rec in collab_recommendations]
        collab_scores = np.array([rec[1] for rec in collab_recommendations], dtype=float)

        # Content-based score per candidate; falls back to the collaborative
        # score, which leaves the hybrid score equal to the collaborative one.
        content_scores = np.array(
            [
                self._content_score(movie_id, fallback=collab_score)
                for movie_id, collab_score in zip(collab_movie_ids, collab_scores)
            ],
            dtype=float,
        )

        # Weighted average of both scores, best first
        hybrid_scores = collab_weight * collab_scores + (1 - collab_weight) * content_scores
        hybrid_recommendations = sorted(
            zip(collab_movie_ids, hybrid_scores), key=lambda rec: rec[1], reverse=True
        )

        return hybrid_recommendations[:n]

    def print_recommendations(self, recommendations):
        '''
        Print the recommended movies with their hybrid scores.

        Args:
            recommendations (list): List of tuples containing movie IDs and hybrid scores.
        '''
        for i, (movie_id, score) in enumerate(recommendations, 1):
            matches = self.df[self.df['movieId'] == movie_id]
            if matches.empty:
                # No details available for this ID; still report its rank and score
                print(f"{i}. Unknown movie (movieId {movie_id}) - Hybrid score: {score:.2f}")
                continue
            movie = matches.iloc[0]
            print(f"{i}. {movie['title']} ({movie['release_year']}) - Hybrid score: {score:.2f}")
            print(f"   Genre: {movie['genres']}")

    def recommend_movies(self, num_ratings=5, num_recommendations=5, collab_weight=0.5):
        '''
        Get user ratings and provide hybrid movie recommendations.

        Args:
            num_ratings (int): Number of movies to rate.
            num_recommendations (int): Number of recommendations to provide.
            collab_weight (float): Weight for collaborative filtering (0 to 1).
        '''
        # Collect the ratings interactively
        user_ratings = self.get_user_ratings(num_ratings)

        # Generate hybrid recommendations based on the user ratings
        recommendations = self.get_hybrid_recommendations(user_ratings, num_recommendations, collab_weight)

        # Print the hybrid recommended movies
        print("\nHybrid Recommended movies:")
        self.print_recommendations(recommendations)


# Instantiate
hybrid_model = HybridModel(collab_df, content_df)
# Train both the collaborative filtering and content-based models
hybrid_model.train_models()

# Recommend movies using the hybrid model specifying the number of user ratings to collect,
# the number of movie recommendations to generate and the weight for collaborative filtering in the hybrid model
hybrid_model.recommend_movies(num_ratings=5, num_recommendations=5, collab_weight=0.5)

# Save the hybrid model
with open('hybrid_model.pkl', 'wb') as f:
    pickle.dump(hybrid_model, f)