From 865098de711cbb07cbd61fed3aa8f30b90a7baad Mon Sep 17 00:00:00 2001 From: Evaclaire Wamitu Date: Sun, 28 Jul 2024 02:15:37 +0300 Subject: [PATCH] Debug modeling code --- movie_recommendor.ipynb | 890 +++++++++++++++++++++++++++++++++------- 1 file changed, 747 insertions(+), 143 deletions(-) diff --git a/movie_recommendor.ipynb b/movie_recommendor.ipynb index e5e19c8..eede6fb 100644 --- a/movie_recommendor.ipynb +++ b/movie_recommendor.ipynb @@ -2106,7 +2106,7 @@ }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -2346,21 +2346,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "RMSE: 1.4178\n", - "Baseline Model RMSE: 1.417800588660784\n" + "RMSE: 1.4145\n", + "Baseline Model RMSE: 1.4145059805741038\n" ] } ], "source": [ - "import logging\n", "from surprise import NormalPredictor\n", "from surprise import Reader, Dataset, SVD, KNNBasic\n", "from surprise.model_selection import cross_validate, GridSearchCV\n", "from surprise.model_selection import train_test_split\n", "from surprise import accuracy\n", - "# Set up logging to suppress lower-level messages\n", - "logging.basicConfig(level=logging.WARNING)\n", - "\n", "\n", "# Prepare the data\n", "reader = Reader(rating_scale=(1, 5))\n", @@ -2383,7 +2379,7 @@ "id": "cea8df25-678d-4664-84bb-38d915eeffcf", "metadata": {}, "source": [ - "We find that the RMSE of the baseline model is aproximately 1.43. The next step is to perform grid search cross validation to find the best parameters for the Singular Value Decomposition (SVD) and K-Nearest Neighbors (KNN) models. The `Surprise` library hosts a `GridSearchCV` feature that performs this task.\n", + "We find that the RMSE of the baseline model is aproximately 1.41. The next step is to perform grid search cross validation to find the best parameters for the Singular Value Decomposition (SVD) and K-Nearest Neighbors (KNN) models. 
The `Surprise` library hosts a `GridSearchCV` feature that performs this task.\n", "Grid searching the SVD model focuses on tuning the following hyperparameters:\n", "\n", "**`n_factors`**: Number of latent factors\n", @@ -2417,7 +2413,7 @@ "text": [ "Tuning SVD...\n", "Best SVD parameters: {'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}\n", - "Best SVD RMSE: 0.8633982401542354\n", + "Best SVD RMSE: 0.8640925302163475\n", "Tuning KNN...\n", "Computing the pearson similarity matrix...\n", "Done computing similarity matrix.\n", @@ -2645,6 +2641,7 @@ "Done computing similarity matrix.\n", "Computing the pearson similarity matrix...\n", "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", "Done computing similarity matrix.\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", @@ -2743,11 +2740,11 @@ "Computing the msd similarity matrix...\n", "Done computing similarity matrix.\n", "Best KNN parameters: {'k': 30, 'min_k': 10, 'sim_options': {'name': 'msd', 'user_based': False}}\n", - "Best KNN RMSE: 0.9176043364136052\n", + "Best KNN RMSE: 0.9169397503126548\n", "\n", "Best model: SVD\n", "\n", - "Best RMSE: 0.8633982401542354\n" + "Best RMSE: 0.8640925302163475\n" ] } ], @@ -2832,7 +2829,7 @@ "id": "0c954d06-e38c-447b-901f-d4bdd36aad59", "metadata": {}, "source": [ - "The output reveals the results of tuning and evaluating recommendation models using grid search. For the SVD model, the optimal parameters were identified as having 100 factors, 30 epochs, a learning rate of 0.01 and regularization of 0.1 achieving the best RMSE of approximately 0.863. In contrast, the KNN model required extensive computation of similarity matrices for various configurations including Pearson, cosine and MSD (Mean Squared Difference) similarities. 
The best parameters for the KNN model were found to be 30 neighbors, a minimum of 5 neighbors and using the MSD similarity metric with a non-user-based approach resulting in a higher RMSE of about 0.918. Consequently, the SVD model emerged as the superior choice with the lowest RMSE and we therefore selected it as the best model overall." + "The output reveals the results of tuning and evaluating recommendation models using grid search. For the SVD model, the optimal parameters were identified as having 100 factors, 30 epochs, a learning rate of 0.01 and regularization of 0.1 achieving the best RMSE of approximately 0.864. In contrast, the KNN model required extensive computation of similarity matrices for various configurations including Pearson, cosine and MSD (Mean Squared Difference) similarities. The best parameters for the KNN model were found to be 30 neighbors, a minimum of 10 neighbors and using the MSD similarity metric with a non-user-based approach resulting in a higher RMSE of about 0.917. Consequently, the SVD model emerged as the superior choice with the lowest RMSE and we therefore selected it as the best model overall." 
] }, { @@ -2853,7 +2850,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "id": "04358685-5ec8-4d2a-99e5-fca4203c4911", "metadata": {}, "outputs": [ @@ -2864,11 +2861,11 @@ "Evaluating RMSE of algorithm SVD on 5 split(s).\n", "\n", " Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std \n", - "RMSE (testset) 0.8578 0.8446 0.8631 0.8555 0.8551 0.8552 0.0060 \n", - "Fit time 10.20 10.28 10.37 10.88 10.87 10.52 0.30 \n", - "Test time 0.16 0.35 0.16 0.15 0.34 0.23 0.09 \n", - "SVD Model Mean RMSE: 0.8552039743462021\n", - "SVD Model Standard Deviation RMSE: 0.006010221806834606\n" + "RMSE (testset) 0.8573 0.8559 0.8537 0.8531 0.8529 0.8546 0.0017 \n", + "Fit time 8.97 10.98 10.51 10.45 9.24 10.03 0.78 \n", + "Test time 0.14 0.26 0.15 0.19 0.14 0.17 0.05 \n", + "SVD Model Mean RMSE: 0.8545790913260245\n", + "SVD Model Standard Deviation RMSE: 0.0017211166150516342\n" ] } ], @@ -2885,12 +2882,12 @@ "id": "cccdc46d-7e10-4265-9deb-7b8a532dd9b5", "metadata": {}, "source": [ - "The cross-validation results for the SVD model indicate strong and consistent performance. The model achieved an average RMSE of approximately 0.856 across five folds with a very low standard deviation of 0.0060 demonstrating stable performance across different data splits. The model's training time averaged 10.52 seconds per fold with minimal variation and the prediction time was consistently around 0.23 seconds. Overall these metrics suggest that the SVD model not only provides reliable predictions with low error but also maintains efficient and consistent training and prediction times. The next step is to build a class that will provide the top 5 recommendations." + "The cross-validation results for the SVD model indicate strong and consistent performance. The model achieved an average RMSE of approximately 0.855 across five folds with a very low standard deviation of 0.0017 demonstrating stable performance across different data splits. 
Overall these metrics suggest that the SVD model not only provides reliable predictions with low error but also maintains efficient and consistent training and prediction times. The next step is to build a class that will provide the top 5 recommendations." ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 35, "id": "133e166f-e332-4457-8d10-6c5003684986", "metadata": {}, "outputs": [ @@ -2899,15 +2896,15 @@ "output_type": "stream", "text": [ "\n", - "Movie: Ex Machina (2015)\n", - "Genre: drama, sci-fi, thriller\n" + "Movie: Lord of the Flies (1963)\n", + "Genre: adventure, drama, thriller\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ - "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): 5\n" + "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): 3\n" ] }, { @@ -2915,15 +2912,15 @@ "output_type": "stream", "text": [ "\n", - "Movie: Closet, The (Placard, Le) (2001)\n", - "Genre: comedy\n" + "Movie: 101 Dalmatians (One Hundred and One Dalmatians) (1961)\n", + "Genre: adventure, animation, children\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ - "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): n\n" + "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): 5\n" ] }, { @@ -2931,15 +2928,15 @@ "output_type": "stream", "text": [ "\n", - "Movie: First Knight (1995)\n", - "Genre: action, drama, romance\n" + "Movie: Armageddon (1998)\n", + "Genre: action, romance, sci-fi, thriller\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ - "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): 4\n" + "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): 5\n" ] }, { @@ -2947,8 +2944,8 @@ "output_type": "stream", "text": [ "\n", - "Movie: Ocean's Thirteen (2007)\n", - "Genre: crime, thriller\n" + "Movie: 2001: A Space Odyssey (1968)\n", + "Genre: adventure, drama, sci-fi\n" ] }, { @@ -2963,15 +2960,15 @@ "output_type": "stream", "text": [ "\n", - "Movie: Sunshine 
(2007)\n", - "Genre: adventure, drama, sci-fi, thriller\n" + "Movie: Life Aquatic with Steve Zissou, The (2004)\n", + "Genre: adventure, comedy, fantasy\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ - "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): 3\n" + "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): n\n" ] }, { @@ -2980,34 +2977,42 @@ "text": [ "\n", "Recommended movies:\n", - "1. Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) - Predicted rating: 4.32\n", - " Genre: comedy, war\n", - "2. Princess Bride, The (1987) - Predicted rating: 4.21\n", - " Genre: action, adventure, comedy, fantasy, romance\n", - "3. Toy Story 3 (2010) - Predicted rating: 4.20\n", - " Genre: adventure, animation, children, comedy, fantasy, imax\n", - "4. Fargo (1996) - Predicted rating: 4.18\n", - " Genre: comedy, crime, drama, thriller\n", - "5. Grand Day Out with Wallace and Gromit, A (1989) - Predicted rating: 4.18\n", - " Genre: adventure, animation, children, comedy, sci-fi\n" + "1. Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) - Predicted rating: 4.35\n", + "Genre: comedy, war\n", + "2. Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001) - Predicted rating: 4.26\n", + "Genre: comedy, romance\n", + "3. Philadelphia Story, The (1940) - Predicted rating: 4.22\n", + "Genre: comedy, drama, romance\n", + "4. Toy Story 3 (2010) - Predicted rating: 4.20\n", + "Genre: adventure, animation, children, comedy, fantasy, imax\n", + "5. 
Princess Bride, The (1987) - Predicted rating: 4.20\n", + "Genre: action, adventure, comedy, fantasy, romance\n" ] } ], "source": [ - "class MovieRecommender:\n", - " def __init__(self, df):\n", + "import pickle \n", + "\n", + "# Create collab df\n", + "collab_df = df[['user_id', 'movieId', 'rating']].copy()\n", + "\n", + "# Save to a CSV file\n", + "collab_df.to_csv('collab_movies.csv', index=False)\n", + "\n", + "class CollabBasedModel:\n", + " def __init__(self, collab_df):\n", " '''\n", " Initializes the MovieRecommender with a DataFrame containing movie data.\n", "\n", " Parameters:\n", " df (pd.DataFrame): DataFrame containing movie information with columns 'user_id', 'movieId', 'rating', 'title', 'release_year', and 'genres'.\n", " '''\n", - " self.df = df\n", + " self.df = collab_df\n", " self.model = None\n", "\n", " def train_model(self):\n", " '''\n", - " Trains the SVD model on the movie ratings data. Splits the data into training and test sets, and fits the model.\n", + " Trains the SVD model on the movie ratings data. 
Splits the data into training and test sets and fits the model.\n", " '''\n", " reader = Reader(rating_scale=(1, 5))\n", " data = Dataset.load_from_df(self.df[['user_id', 'movieId', 'rating']], reader)\n", @@ -3018,13 +3023,8 @@ " def get_user_ratings(self, num_movies=5):\n", " '''\n", " Collects ratings from the user for a specified number of movies.\n", - "\n", - " Parameters:\n", - " num_movies (int): Number of movies to present to the user for rating.\n", - "\n", - " Returns:\n", - " list: A list of tuples containing movie IDs and user ratings.\n", " '''\n", + " \n", " user_ratings = []\n", " for _ in range(num_movies):\n", " movie = self.df.sample(1).iloc[0]\n", @@ -3038,15 +3038,8 @@ " def get_recommendations(self, user_ratings, n=5, genre=None):\n", " '''\n", " Provides movie recommendations based on user ratings and optional genre filtering.\n", - "\n", - " Parameters:\n", - " user_ratings (list): List of tuples with movie IDs and user ratings.\n", - " n (int): Number of recommendations to return.\n", - " genre (str): Optional genre to filter recommendations.\n", - "\n", - " Returns:\n", - " list: A list of tuples with movie IDs and predicted ratings, optionally filtered by genre.\n", " '''\n", + " # Generate a unique user ID for a new user who is providing ratings for the first time\n", " new_user_id = self.df['user_id'].max() + 1\n", " movies_to_predict = self.df[~self.df['movieId'].isin([x[0] for x in user_ratings])]['movieId'].unique()\n", " \n", @@ -3069,33 +3062,112 @@ " def print_recommendations(self, recommendations):\n", " '''\n", " Prints the recommended movies with their predicted ratings.\n", - "\n", - " Parameters:\n", - " recommendations (list): List of tuples with movie IDs and predicted ratings.\n", " '''\n", + " \n", " for i, (movie_id, predicted_rating) in enumerate(recommendations, 1):\n", " movie = self.df[self.df['movieId'] == movie_id].iloc[0]\n", " print(f\"{i}. 
{movie['title']} ({movie['release_year']}) - Predicted rating: {predicted_rating:.2f}\")\n", - " print(f\" Genre: {movie['genres']}\")\n", + " print(f\"Genre: {movie['genres']}\")\n", "\n", - " def recommend_movies(self, num_ratings=5, num_recommendations=10, genre=None):\n", + " def recommend_movies(self, num_ratings=5, num_recommendations=5, genre=None):\n", " '''\n", " Recommends movies based on user input ratings and optionally filters by genre.\n", - "\n", - " Parameters:\n", - " num_ratings (int): Number of movies to rate for generating recommendations.\n", - " num_recommendations (int): Number of recommended movies to return.\n", - " genre (str): Optional genre to filter recommendations.\n", " '''\n", + " \n", " user_ratings = self.get_user_ratings(num_ratings)\n", " recommendations = self.get_recommendations(user_ratings, num_recommendations, genre)\n", " print(\"\\nRecommended movies:\")\n", " self.print_recommendations(recommendations)\n", "\n", "# Instantiate\n", - "recommender = MovieRecommender(df)\n", + "recommender = CollabBasedModel(df)\n", "recommender.train_model()\n", - "recommender.recommend_movies(num_ratings=5, num_recommendations=5, genre='Comedy')" + "\n", + "# Save the trained model using pickle\n", + "with open('collaborative_model.pkl', 'wb') as f:\n", + " pickle.dump(recommender.model, f)\n", + " \n", + "# Get recommendations\n", + "recommender.recommend_movies(num_ratings=5, num_recommendations=5, genre='Comedy')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "ddc7d36d-6264-4ba1-a986-f6d9b59c2193", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlegenres
100813Black Butler: Book of the Atlanticaction, animation, comedy, fantasy
100814No Game No Life: Zeroanimation, comedy, fantasy
100815Flintdrama
100816Bungo Stray Dogs: Dead Appleaction, animation
100817Andrew Dice Clay: Dice Rulescomedy
\n", + "
" + ], + "text/plain": [ + " title genres\n", + "100813 Black Butler: Book of the Atlantic action, animation, comedy, fantasy\n", + "100814 No Game No Life: Zero animation, comedy, fantasy\n", + "100815 Flint drama\n", + "100816 Bungo Stray Dogs: Dead Apple action, animation\n", + "100817 Andrew Dice Clay: Dice Rules comedy" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "content_df.tail()" ] }, { @@ -3116,112 +3188,644 @@ }, { "cell_type": "code", - "execution_count": 37, - "id": "49ba70aa-eaff-467e-9085-df0cfa47312f", + "execution_count": 61, + "id": "865e5ff2-ee9a-4e76-8873-5b71931a5159", + "metadata": {}, + "outputs": [], + "source": [ + "# create content df\n", + "content_df = df[['title', 'genres']].copy()\n", + "\n", + "# Save to a CSV file\n", + "content_df.to_csv('content_movies.csv', index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "a7deb54f-6a28-4dff-aaa7-89981d51ead7", + "metadata": {}, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "nnz of the result is too large", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[63], line 59\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m: \n\u001b[1;32m 56\u001b[0m \n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# Instantiate and train the content-based model\u001b[39;00m\n\u001b[1;32m 58\u001b[0m recommender \u001b[38;5;241m=\u001b[39m ContentBasedModel(content_df)\n\u001b[0;32m---> 59\u001b[0m \u001b[43mrecommender\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 
61\u001b[0m \u001b[38;5;66;03m# Save the trained model using pickle\u001b[39;00m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcontent_model.pkl\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n", + "Cell \u001b[0;32mIn[63], line 30\u001b[0m, in \u001b[0;36mContentBasedModel.train_model\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtfidf_matrix \u001b[38;5;241m=\u001b[39m tfidf\u001b[38;5;241m.\u001b[39mfit_transform(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfeatures\u001b[39m\u001b[38;5;124m'\u001b[39m]) \u001b[38;5;66;03m# Transform the features column\u001b[39;00m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# Compute cosine similarity matrix\u001b[39;00m\n\u001b[0;32m---> 30\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcosine_sim \u001b[38;5;241m=\u001b[39m \u001b[43mcosine_similarity\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtfidf_matrix\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/utils/_param_validation.py:214\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[1;32m 210\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[1;32m 211\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[1;32m 212\u001b[0m )\n\u001b[1;32m 213\u001b[0m ):\n\u001b[0;32m--> 214\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 215\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 216\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[1;32m 219\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[1;32m 220\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[1;32m 221\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 222\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[1;32m 224\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/metrics/pairwise.py:1586\u001b[0m, in \u001b[0;36mcosine_similarity\u001b[0;34m(X, Y, dense_output)\u001b[0m\n\u001b[1;32m 1583\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1584\u001b[0m Y_normalized \u001b[38;5;241m=\u001b[39m normalize(Y, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m-> 
1586\u001b[0m K \u001b[38;5;241m=\u001b[39m \u001b[43msafe_sparse_dot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_normalized\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY_normalized\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mT\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdense_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdense_output\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1588\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m K\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/utils/extmath.py:192\u001b[0m, in \u001b[0;36msafe_sparse_dot\u001b[0;34m(a, b, dense_output)\u001b[0m\n\u001b[1;32m 190\u001b[0m ret \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mdot(a, b)\n\u001b[1;32m 191\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 192\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[43ma\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m@\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mb\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 195\u001b[0m sparse\u001b[38;5;241m.\u001b[39missparse(a)\n\u001b[1;32m 196\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m sparse\u001b[38;5;241m.\u001b[39missparse(b)\n\u001b[1;32m 197\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m dense_output\n\u001b[1;32m 198\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(ret, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoarray\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 199\u001b[0m ):\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ret\u001b[38;5;241m.\u001b[39mtoarray()\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/scipy/sparse/base.py:560\u001b[0m, in \u001b[0;36mspmatrix.__matmul__\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 557\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m isscalarlike(other):\n\u001b[1;32m 558\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mScalar operands are not allowed, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 559\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muse \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m*\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m instead\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 560\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__mul__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/scipy/sparse/base.py:480\u001b[0m, in \u001b[0;36mspmatrix.__mul__\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 478\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m!=\u001b[39m other\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m]:\n\u001b[1;32m 479\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdimension mismatch\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m--> 480\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mul_sparse_matrix\u001b[49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 482\u001b[0m \u001b[38;5;66;03m# If it's a list or whatever, treat it like a matrix\u001b[39;00m\n\u001b[1;32m 483\u001b[0m other_a \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39masanyarray(other)\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/scipy/sparse/compressed.py:505\u001b[0m, in \u001b[0;36m_cs_matrix._mul_sparse_matrix\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 501\u001b[0m idx_dtype \u001b[38;5;241m=\u001b[39m 
get_index_dtype((\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindptr, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindices,\n\u001b[1;32m 502\u001b[0m other\u001b[38;5;241m.\u001b[39mindptr, other\u001b[38;5;241m.\u001b[39mindices))\n\u001b[1;32m 504\u001b[0m fn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(_sparsetools, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mformat \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_matmat_maxnnz\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m--> 505\u001b[0m nnz \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mM\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mN\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 506\u001b[0m \u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43masarray\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindptr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43midx_dtype\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 507\u001b[0m \u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43masarray\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindices\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43midx_dtype\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 508\u001b[0m \u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43masarray\u001b[49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindptr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43midx_dtype\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 509\u001b[0m \u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43masarray\u001b[49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindices\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43midx_dtype\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 511\u001b[0m idx_dtype \u001b[38;5;241m=\u001b[39m get_index_dtype((\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindptr, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindices,\n\u001b[1;32m 512\u001b[0m other\u001b[38;5;241m.\u001b[39mindptr, other\u001b[38;5;241m.\u001b[39mindices),\n\u001b[1;32m 513\u001b[0m maxval\u001b[38;5;241m=\u001b[39mnnz)\n\u001b[1;32m 515\u001b[0m indptr \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mempty(major_axis \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m, dtype\u001b[38;5;241m=\u001b[39midx_dtype)\n", + "\u001b[0;31mRuntimeError\u001b[0m: nnz of the result is too large" + ] + } + ], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import re\n", + "\n", + "class ContentBasedModel:\n", + " def __init__(self, content_df):\n", + " '''\n", + " Initializes the ContentBasedRecommender with a DataFrame containing movie data.\n", + "\n", + " Parameters:\n", + " df (pd.DataFrame): DataFrame containing movie information with columns 'title' and 'genres'.\n", + " '''\n", + " self.df = content_df\n", + " self.tfidf_matrix = None\n", + " self.cosine_sim = None\n", + "\n", + " def train_model(self):\n", + " '''\n", + " Trains a TF-IDF model for content-based filtering and computes the cosine similarity matrix.\n", + " '''\n", + " # Preprocess the data\n", + " 
self.df['clean_title'] = self.df['title'].apply(lambda x: re.sub(\"[^a-zA-Z0-9 ]\", \"\", x).lower())\n", + " self.df['features'] = self.df['clean_title'] + ' ' + self.df['genres']\n", + " \n", + " # Create TF-IDF vectorizer and transform features\n", + " tfidf = TfidfVectorizer(stop_words='english')\n", + " self.tfidf_matrix = tfidf.fit_transform(self.df['features']) # Transform the features column\n", + " \n", + " # Compute cosine similarity matrix\n", + " self.cosine_sim = cosine_similarity(self.tfidf_matrix)\n", + "\n", + " def get_recommendations(self, title, top_n=5):\n", + " '''\n", + " Provides top 5 movie recommendations based on content similarity.\n", + " '''\n", + " \n", + " # Get the index of the movie that matches the title\n", + " idx = self.df.index[self.df['title'] == title].tolist()\n", + " if not idx:\n", + " return pd.DataFrame(columns=['title', 'genres']) # Return an empty DataFrame if title not found\n", + "\n", + " idx = idx[0]\n", + " \n", + " # Get similarity scores for all movies with the given movie\n", + " sim_scores = list(enumerate(self.cosine_sim[idx]))\n", + " sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n", + " sim_scores = sim_scores[1:top_n+1] # Exclude the first movie (which is the movie itself)\n", + " \n", + " # Get movie indices\n", + " movie_indices = [i[0] for i in sim_scores]\n", + " \n", + " return self.df[['title', 'genres']].iloc[movie_indices].reset_index(drop=True)\n", + "\n", + "\n", + "if __name__ == \"__main__\": \n", + " \n", + " # Instantiate and train the content-based model\n", + " recommender = ContentBasedModel(content_df)\n", + " recommender.train_model()\n", + "\n", + " # Save the trained model using pickle\n", + " with open('content_model.pkl', 'wb') as f:\n", + " pickle.dump(recommender.model, f)\n", + " \n", + " # Get recommendations for a sample movie\n", + " sample_movie = 'Sommersby'\n", + " recommendations = recommender.get_recommendations(sample_movie, top_n=5)\n", + " \n", + " # 
class ContentBasedModel:
    """Content-based movie recommender: TF-IDF over title + genres, reduced with truncated SVD.

    Attributes:
        df: One row per movie (duplicates dropped, index reset), so row position
            ``i`` of ``df`` corresponds to row ``i`` of the matrices below.
        tfidf_matrix: Sparse TF-IDF matrix, populated by ``train_model``.
        tfidf_feature_names: Vocabulary terms matching the TF-IDF columns.
        svd_matrix: Dense reduced representation used for similarity lookups.
    """

    def __init__(self, content_df):
        """Store a de-duplicated, position-indexed copy of ``content_df``.

        Args:
            content_df (pd.DataFrame): Needs ``title`` and ``genres`` columns.
                ``movieId`` is used as the de-duplication key when present,
                otherwise ``title`` is.
        """
        # A ratings-level frame repeats each movie once per rating. Left as-is,
        # every nearest neighbour of a movie is just another copy of that movie.
        dedupe_key = 'movieId' if 'movieId' in content_df.columns else 'title'
        # drop_duplicates returns a new frame, so the caller's frame is never
        # mutated; reset_index makes index labels equal to row positions.
        self.df = content_df.drop_duplicates(subset=dedupe_key).reset_index(drop=True)
        self.tfidf_matrix = None
        self.tfidf_feature_names = None
        self.svd_matrix = None

    def train_model(self, n_components=200, max_features=5000, random_state=42):
        """Build the TF-IDF matrix and its truncated-SVD projection.

        Args:
            n_components (int): Requested SVD dimensionality; clamped below the
                vocabulary size so small catalogues do not make the fit fail.
            max_features (int): Vocabulary cap passed to ``TfidfVectorizer``.
            random_state (int): Seed for ``TruncatedSVD`` so reruns are repeatable.
        """
        clean_title = (
            self.df['title'].astype(str)
            .str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)
            .str.lower()
        )
        features = clean_title + ' ' + self.df['genres'].fillna('').astype(str)
        # assign() builds a new frame instead of writing columns in place.
        self.df = self.df.assign(clean_title=clean_title, features=features)

        tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
        self.tfidf_matrix = tfidf.fit_transform(self.df['features'])
        self.tfidf_feature_names = tfidf.get_feature_names_out()
        print(f"TF-IDF matrix shape: {self.tfidf_matrix.shape}")

        # Keep n_components strictly below the number of TF-IDF features.
        n_features = self.tfidf_matrix.shape[1]
        n_components = max(1, min(n_components, n_features - 1))
        svd = TruncatedSVD(n_components=n_components, random_state=random_state)
        self.svd_matrix = svd.fit_transform(self.tfidf_matrix)
        print(f"SVD matrix shape: {self.svd_matrix.shape}")

    def get_recommendations(self, title, top_n=5):
        """Return the ``top_n`` movies most similar to ``title``.

        Args:
            title (str): Exact title to look up; the first match is used.
            top_n (int): Maximum number of rows to return.

        Returns:
            pd.DataFrame: Columns ``title`` and ``genres``, most similar first.
            Empty when the title is unknown.

        Raises:
            RuntimeError: If the title exists but ``train_model`` has not run.
        """
        # Row positions (not index labels) of the matching titles.
        hit_positions = (self.df['title'] == title).to_numpy().nonzero()[0]
        if len(hit_positions) == 0:
            return pd.DataFrame(columns=['title', 'genres'])
        if self.svd_matrix is None:
            raise RuntimeError("train_model() must be called before get_recommendations()")

        query_pos = int(hit_positions[0])
        query_vector = self.svd_matrix[query_pos].reshape(1, -1)
        similarities = pd.Series(cosine_similarity(query_vector, self.svd_matrix).ravel())

        # Remove the query row explicitly: with tied scores it is not guaranteed
        # to sort first, so slicing off element 0 could drop a real neighbour.
        best = similarities.drop(index=query_pos).nlargest(top_n)

        return self.df[['title', 'genres']].iloc[best.index.to_numpy()].reset_index(drop=True)
\n", + " recommender = ContentBasedModel(content_df)\n", + " recommender.train_model()\n", " \n", - " # Create TF-IDF vectorizer and transform features\n", - " tfidf = TfidfVectorizer(stop_words='english')\n", - " tfidf_matrix = tfidf.fit_transform(df['features'])\n", + " with open('content_model.pkl', 'wb') as f:\n", + " pickle.dump(recommender, f)\n", " \n", - " # Compute cosine similarity matrix\n", - " cosine_sim = cosine_similarity(tfidf_matrix)\n", + " sample_movie = 'Sommersby'\n", + " recommendations = recommender.get_recommendations(sample_movie, top_n=5)\n", " \n", - " return tfidf_matrix, cosine_sim\n", + " print(f\"Recommendations for '{sample_movie}':\")\n", + " print(recommendations)" + ] + }, + { + "cell_type": "markdown", + "id": "fcd47967-e4a5-4915-a4ad-5ba0ebf8cae4", + "metadata": {}, + "source": [ + "## Hybrid System" + ] + }, + { + "cell_type": "markdown", + "id": "c07a19f0-fcde-450c-8fd0-eac2c0eac9f7", + "metadata": {}, + "source": [ + "The `HybridRecommender` class combines two types of recommendation systems, namely content-based and collaborative filtering, into a single hybrid model. It takes in two datasets (one for each recommendation model) and provides a unified recommendation list based on both models."
+ ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "8867a083-b7bc-4f29-b78a-7a42dd3ca443", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"['rating', 'user_id'] not in index\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[71], line 99\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 95\u001b[0m \u001b[38;5;66;03m# # Load movie data\u001b[39;00m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;66;03m# df = pd.read_csv('movies_data.csv') # Update with the correct path\u001b[39;00m\n\u001b[1;32m 98\u001b[0m recommender \u001b[38;5;241m=\u001b[39m HybridMovieRecommender(df)\n\u001b[0;32m---> 99\u001b[0m \u001b[43mrecommender\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_collaborative_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 100\u001b[0m recommender\u001b[38;5;241m.\u001b[39mtrain_content_based_model()\n\u001b[1;32m 102\u001b[0m \u001b[38;5;66;03m# Get hybrid recommendations for a sample user\u001b[39;00m\n", + "Cell \u001b[0;32mIn[71], line 10\u001b[0m, in \u001b[0;36mHybridMovieRecommender.train_collaborative_model\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtrain_collaborative_model\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 9\u001b[0m reader \u001b[38;5;241m=\u001b[39m Reader(rating_scale\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m5\u001b[39m))\n\u001b[0;32m---> 10\u001b[0m data \u001b[38;5;241m=\u001b[39m 
Dataset\u001b[38;5;241m.\u001b[39mload_from_df(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43muser_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmovieId\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m, reader)\n\u001b[1;32m 11\u001b[0m trainset, _ \u001b[38;5;241m=\u001b[39m train_test_split(data, test_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.2\u001b[39m)\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcollaborative_model \u001b[38;5;241m=\u001b[39m SVD()\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/frame.py:2908\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2906\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 2907\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 2908\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_listlike_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 2910\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean 
indexers\u001b[39;00m\n\u001b[1;32m 2911\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1254\u001b[0m, in \u001b[0;36m_LocIndexer._get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1251\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1252\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m ax\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 1254\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_read_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mraise_missing\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m keyarr, indexer\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1304\u001b[0m, in \u001b[0;36m_LocIndexer._validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1302\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m raise_missing:\n\u001b[1;32m 1303\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(key) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mset\u001b[39m(ax))\n\u001b[0;32m-> 1304\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
class HybridMovieRecommender:
    """Hybrid recommender: SVD collaborative filtering re-ranked with TF-IDF content similarity.

    ``df`` must be a ratings-level frame (one row per user/movie rating) that
    also carries ``movieId``, ``title`` and ``genres``. The content model is
    built on one row per movie, and all similarity lookups go through a
    ``movieId -> row position`` map, because ``cosine_sim`` is indexed by
    position, not by ``movieId``.
    """

    def __init__(self, df):
        self.df = df
        self.collaborative_model = None
        self.tfidf_matrix = None
        self.cosine_sim = None
        # One row per movie; row i here matches row/column i of cosine_sim.
        self.movies = None
        self._movie_pos = {}
        # Hard-coding 'user_id' raised KeyError on the notebook's frame, so
        # accept either spelling. NOTE(review): confirm the real column name.
        self.user_col = next((c for c in ('user_id', 'userId') if c in df.columns), None)

    def _ratings_frame(self):
        """Return the (user, movieId, rating) columns, or raise a descriptive KeyError."""
        missing = [c for c in ('movieId', 'rating') if c not in self.df.columns]
        if self.user_col is None:
            missing.insert(0, 'user_id/userId')
        if missing:
            raise KeyError(
                f"{missing} not in DataFrame columns; pass the ratings frame "
                "merged with movie metadata, not the movies table alone"
            )
        return self.df[[self.user_col, 'movieId', 'rating']]

    def _top_similarities(self, movie_id, top_n):
        """Similarities between ``movie_id`` and its ``top_n`` nearest content neighbours."""
        pos = self._movie_pos.get(movie_id)
        if pos is None or self.cosine_sim is None:
            return np.array([])
        # Drop the movie's similarity with itself before ranking.
        others = np.delete(np.asarray(self.cosine_sim[pos], dtype=float), pos)
        return np.sort(others)[::-1][:top_n]

    def train_collaborative_model(self):
        """Fit an SVD model on a seeded 80% split of the ratings."""
        ratings = self._ratings_frame()  # validate columns before any heavy work
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(ratings, reader)
        # Seeded so repeated runs of the notebook give the same model.
        trainset, _ = train_test_split(data, test_size=0.2, random_state=42)
        self.collaborative_model = SVD(random_state=42)
        self.collaborative_model.fit(trainset)

    def train_content_based_model(self):
        """Build TF-IDF features and the movie-by-movie cosine similarity matrix.

        Works on one row per ``movieId``; a ratings-level frame would otherwise
        produce a dense (n_ratings x n_ratings) matrix. ``self.df`` is not modified.
        """
        movies = (
            self.df[['movieId', 'title', 'genres']]
            .drop_duplicates(subset='movieId')
            .reset_index(drop=True)
        )
        clean_title = (
            movies['title'].astype(str)
            .str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)
            .str.lower()
        )
        movies = movies.assign(
            clean_title=clean_title,
            features=clean_title + ' ' + movies['genres'].fillna('').astype(str),
        )

        tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = tfidf.fit_transform(movies['features'])
        self.cosine_sim = cosine_similarity(self.tfidf_matrix)
        self.movies = movies
        self._movie_pos = {movie_id: pos for pos, movie_id in enumerate(movies['movieId'])}

    def get_collaborative_recommendations(self, user_id, n=5):
        """Return the ``n`` highest predicted ``(movieId, rating)`` pairs the user has not rated."""
        if self.collaborative_model is None:
            raise RuntimeError("train_collaborative_model() must be called first")
        ratings = self._ratings_frame()
        seen = ratings.loc[ratings[self.user_col] == user_id, 'movieId'].unique()
        candidates = np.setdiff1d(ratings['movieId'].unique(), seen)

        predictions = [
            (movie_id, self.collaborative_model.predict(user_id, movie_id).est)
            for movie_id in candidates
        ]
        return sorted(predictions, key=lambda pair: pair[1], reverse=True)[:n]

    def get_content_based_recommendations(self, title, top_n=10):
        """Return the ``top_n`` movies most similar to ``title``.

        Returns:
            pd.DataFrame: Columns ``movieId``, ``title``, ``genres``; empty (never
            a list) when the title is unknown, so ``.empty`` / ``.iterrows()`` work.
        """
        columns = ['movieId', 'title', 'genres']
        if self.movies is None or self.cosine_sim is None:
            raise RuntimeError("train_content_based_model() must be called first")

        hit_positions = (self.movies['title'] == title).to_numpy().nonzero()[0]
        if len(hit_positions) == 0:
            return self.movies[columns].iloc[0:0]

        pos = int(hit_positions[0])
        order = np.argsort(-np.asarray(self.cosine_sim[pos], dtype=float), kind='stable')
        # Exclude the query explicitly; ties mean it is not guaranteed to rank first.
        order = order[order != pos][:top_n]
        return self.movies[columns].iloc[order].reset_index(drop=True)

    def get_hybrid_recommendations(self, user_id, n=10, collab_weight=0.7):
        """Re-rank the top collaborative picks by a weighted blend with content similarity.

        Args:
            user_id: User to recommend for.
            n (int): Number of ``(movieId, score)`` pairs to return.
            collab_weight (float): Weight of the predicted rating; the content
                score gets ``1 - collab_weight``.

        NOTE(review): the predicted rating is on the rating scale while the
        content score is a cosine similarity in [0, 1]; the blend is kept as in
        the original formula but the two terms are not on a common scale.
        """
        hybrid_recs = []
        for movie_id, collab_score in self.get_collaborative_recommendations(user_id, n=n):
            neighbour_sims = self._top_similarities(movie_id, top_n=5)
            content_score = float(neighbour_sims.mean()) if neighbour_sims.size else 0.0
            hybrid_score = collab_weight * collab_score + (1 - collab_weight) * content_score
            hybrid_recs.append((movie_id, hybrid_score))

        return sorted(hybrid_recs, key=lambda pair: pair[1], reverse=True)[:n]

    def evaluate_model(self):
        """Print cross-validated collaborative metrics and a content-similarity error proxy."""
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(self._ratings_frame(), reader)

        # Collaborative Filtering Evaluation (cross_validate refits the algorithm per fold)
        algo = self.collaborative_model if self.collaborative_model is not None else SVD(random_state=42)
        cv_results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

        print("Collaborative Filtering Evaluation:")
        print(f"RMSE: {np.mean(cv_results['test_rmse']):.4f} (+/- {np.std(cv_results['test_rmse']):.4f})")
        print(f"MAE: {np.mean(cv_results['test_mae']):.4f} (+/- {np.std(cv_results['test_mae']):.4f})")

        # Content-Based Filtering Evaluation
        sample_size = min(1000, len(self.df))
        sample = self.df.sample(sample_size, random_state=42)

        content_based_rmse = []
        content_based_mae = []
        for movie_id, true_rating in sample[['movieId', 'rating']].itertuples(index=False, name=None):
            neighbour_sims = self._top_similarities(movie_id, top_n=10)
            if neighbour_sims.size == 0:
                continue
            # Compare the single true rating against every neighbour similarity;
            # the original passed arrays of different lengths to the metric.
            errors = neighbour_sims - true_rating
            content_based_rmse.append(float(np.sqrt(np.mean(errors ** 2))))
            content_based_mae.append(float(np.mean(np.abs(errors))))

        print("\nContent-Based Filtering Evaluation:")
        if not content_based_rmse:
            print("No sampled movie could be scored.")
            return
        print(f"RMSE: {np.mean(content_based_rmse):.4f} (+/- {np.std(content_based_rmse):.4f})")
        print(f"MAE: {np.mean(content_based_mae):.4f} (+/- {np.std(content_based_mae):.4f})")
class HybridRecommender:
    """Blend content-based and collaborative recommendations into one ranked list.

    Wraps a ``ContentBasedModel`` and a ``CollabBasedModel`` (both defined
    elsewhere in the notebook) and trains each on construction.
    """

    def __init__(self, content_df, collab_df):
        self.content_model = ContentBasedModel(content_df)
        self.collab_model = CollabBasedModel(collab_df)
        self.content_model.train_model()
        self.collab_model.train_model()

    def get_hybrid_recommendations(self, title, user_ratings, top_n=5, alpha=0.5, rating_scale=(1, 5)):
        """Rank the content neighbours of ``title`` by a blend of both models.

        Args:
            title (str): Movie whose content neighbours are the candidates.
            user_ratings: Passed through to ``collab_model.get_recommendations``.
            top_n (int): Number of rows to return.
            alpha (float): Weight of the collaborative score; the content score
                gets ``1 - alpha``.
            rating_scale (tuple): ``(low, high)`` bounds used to normalise
                predicted ratings to [0, 1] so both terms share a scale.

        Returns:
            pd.DataFrame: Columns ``title`` and ``genres``, best first.
        """
        output_columns = ['title', 'genres']
        content_recs = self.content_model.get_recommendations(title, top_n=top_n)
        if content_recs.empty:
            return content_recs.reindex(columns=output_columns).reset_index(drop=True)

        recommendations = content_recs.reset_index(drop=True).copy()
        n_recs = len(recommendations)

        if 'cosine_sim' not in recommendations.columns:
            # The content model returns rows most-similar first but no score
            # column, so fall back to a rank-based proxy in (0, 1].
            recommendations['cosine_sim'] = [(n_recs - rank) / n_recs for rank in range(n_recs)]

        if 'movieId' not in recommendations.columns:
            # Recover ids from the content model's catalogue when it has them.
            catalogue = getattr(self.content_model, 'df', None)
            if catalogue is not None and {'movieId', 'title'} <= set(catalogue.columns):
                id_by_title = catalogue.drop_duplicates(subset='title').set_index('title')['movieId']
                recommendations['movieId'] = recommendations['title'].map(id_by_title)
            else:
                recommendations['movieId'] = float('nan')

        # presumably a sequence of (movieId, predicted_rating) pairs — verify against CollabBasedModel
        collab_recs = self.collab_model.get_recommendations(user_ratings, n=top_n)
        predicted_by_id = {movie_id: rating for movie_id, rating in collab_recs}
        predicted = pd.to_numeric(recommendations['movieId'].map(predicted_by_id), errors='coerce')

        # Normalise ratings onto [0, 1]. Candidates the collaborative model did
        # not score contribute 0 rather than turning the whole score into NaN.
        low, high = rating_scale
        span = (high - low) or 1
        collab_score = ((predicted - low) / span).clip(0, 1).fillna(0.0)
        recommendations['predicted_rating'] = predicted

        recommendations['final_score'] = (alpha * collab_score) + ((1 - alpha) * recommendations['cosine_sim'])
        # mergesort is stable, so tied scores keep the content model's order.
        recommendations = recommendations.sort_values(by='final_score', ascending=False, kind='mergesort')

        return recommendations[output_columns].head(top_n).reset_index(drop=True)


# Example usage
if __name__ == "__main__":
    content_df = pd.read_csv('content_movies.csv')  # Replace with actual path
    collab_df = pd.read_csv('collab_ratings.csv')  # Replace with actual path

    hybrid_recommender = HybridRecommender(content_df, collab_df)
    user_ratings = [(1, 5), (2, 4)]  # Example user ratings
    recommendations = hybrid_recommender.get_hybrid_recommendations('Sommersby', user_ratings, top_n=5)

    print("Hybrid Recommendations:")
    print(recommendations)
pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmovies_data/movies.csv\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;66;03m# Update with the correct path\u001b[39;00m\n\u001b[1;32m 84\u001b[0m recommender \u001b[38;5;241m=\u001b[39m HybridMovieRecommender(df)\n\u001b[0;32m---> 85\u001b[0m \u001b[43mrecommender\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_models\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;66;03m# Get hybrid recommendations for a sample user\u001b[39;00m\n\u001b[1;32m 88\u001b[0m sample_user_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", + "Cell \u001b[0;32mIn[66], line 13\u001b[0m, in \u001b[0;36mHybridMovieRecommender.train_models\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtrain_models\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtrain_content_based_model()\n\u001b[0;32m---> 13\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_collaborative_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[66], line 25\u001b[0m, in \u001b[0;36mHybridMovieRecommender.train_collaborative_model\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtrain_collaborative_model\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 24\u001b[0m reader \u001b[38;5;241m=\u001b[39m Reader(rating_scale\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m5\u001b[39m))\n\u001b[0;32m---> 25\u001b[0m data \u001b[38;5;241m=\u001b[39m 
Dataset\u001b[38;5;241m.\u001b[39mload_from_df(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43muser_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmovieId\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m, reader)\n\u001b[1;32m 26\u001b[0m trainset, _ \u001b[38;5;241m=\u001b[39m train_test_split(data, test_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.2\u001b[39m)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcf_model \u001b[38;5;241m=\u001b[39m SVD()\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/frame.py:2908\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2906\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 2907\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 2908\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_listlike_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 2910\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m 
2911\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1254\u001b[0m, in \u001b[0;36m_LocIndexer._get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1251\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1252\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m ax\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 1254\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_read_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mraise_missing\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m keyarr, indexer\n", + "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1304\u001b[0m, in \u001b[0;36m_LocIndexer._validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1302\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m raise_missing:\n\u001b[1;32m 1303\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(key) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mset\u001b[39m(ax))\n\u001b[0;32m-> 1304\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
class HybridMovieRecommender:
    """Hybrid recommender: SVD collaborative filtering re-ranked with TF-IDF content similarity.

    ``df`` must be a ratings-level frame (one row per user/movie rating) that
    also carries ``movieId``, ``title`` and ``genres``. The content model is
    built on one row per movie, and similarity lookups go through a
    ``movieId -> row position`` map because ``cosine_sim`` is indexed by
    position, not by ``movieId``.
    """

    def __init__(self, df):
        self.df = df
        self.cf_model = None
        self.tfidf_matrix = None
        self.cosine_sim = None
        # One row per movie; row i here matches row/column i of cosine_sim.
        self.movies = None
        self._movie_pos = {}
        # Ratings held out by train_collaborative_model, for leak-free evaluation.
        self.testset = None
        # Hard-coding 'user_id' raised KeyError on the notebook's frame, so
        # accept either spelling. NOTE(review): confirm the real column name.
        self.user_col = next((c for c in ('user_id', 'userId') if c in df.columns), None)

    def _ratings_frame(self):
        """Return the (user, movieId, rating) columns, or raise a descriptive KeyError."""
        missing = [c for c in ('movieId', 'rating') if c not in self.df.columns]
        if self.user_col is None:
            missing.insert(0, 'user_id/userId')
        if missing:
            raise KeyError(
                f"{missing} not in DataFrame columns; pass the ratings frame "
                "merged with movie metadata, not the movies table alone"
            )
        return self.df[[self.user_col, 'movieId', 'rating']]

    def _title_of(self, movie_id):
        """Look up a movie's title, preferring the de-duplicated movies table."""
        pos = self._movie_pos.get(movie_id)
        if pos is not None:
            return self.movies['title'].iloc[pos]
        return self.df.loc[self.df['movieId'] == movie_id, 'title'].iloc[0]

    def _content_score(self, movie_id, top_n=5):
        """Mean similarity between ``movie_id`` and its ``top_n`` nearest content neighbours."""
        pos = self._movie_pos.get(movie_id)
        if pos is None or self.cosine_sim is None:
            return 0.0
        # Drop the movie's similarity with itself before ranking.
        others = np.delete(np.asarray(self.cosine_sim[pos], dtype=float), pos)
        if others.size == 0:
            return 0.0
        return float(np.sort(others)[::-1][:top_n].mean())

    def train_models(self):
        """Train the content model, then the collaborative model."""
        self.train_content_based_model()
        self.train_collaborative_model()

    def train_content_based_model(self):
        """Build TF-IDF features and the movie-by-movie cosine similarity matrix.

        Works on one row per ``movieId``; a ratings-level frame would otherwise
        produce a dense (n_ratings x n_ratings) matrix. ``self.df`` is not modified.
        """
        movies = (
            self.df[['movieId', 'title', 'genres']]
            .drop_duplicates(subset='movieId')
            .reset_index(drop=True)
        )
        clean_title = (
            movies['title'].astype(str)
            .str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)
            .str.lower()
        )
        movies = movies.assign(
            clean_title=clean_title,
            features=clean_title + ' ' + movies['genres'].fillna('').astype(str),
        )

        tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = tfidf.fit_transform(movies['features'])
        self.cosine_sim = cosine_similarity(self.tfidf_matrix)
        self.movies = movies
        self._movie_pos = {movie_id: pos for pos, movie_id in enumerate(movies['movieId'])}

    def train_collaborative_model(self):
        """Fit SVD on a seeded 80% split and keep the other 20% in ``self.testset``."""
        ratings = self._ratings_frame()  # validate columns before any heavy work
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(ratings, reader)
        trainset, self.testset = train_test_split(data, test_size=0.2, random_state=42)
        self.cf_model = SVD(random_state=42)
        self.cf_model.fit(trainset)

    def get_content_based_recommendations(self, title, top_n=10):
        """Return the ``top_n`` movies most similar to ``title``.

        Returns:
            pd.DataFrame: Columns ``movieId``, ``title``, ``genres``; empty (never
            a list) when the title is unknown, so ``.iterrows()`` works on it.
        """
        columns = ['movieId', 'title', 'genres']
        if self.movies is None or self.cosine_sim is None:
            raise RuntimeError("train_content_based_model() must be called first")

        hit_positions = (self.movies['title'] == title).to_numpy().nonzero()[0]
        if len(hit_positions) == 0:
            return self.movies[columns].iloc[0:0]

        pos = int(hit_positions[0])
        order = np.argsort(-np.asarray(self.cosine_sim[pos], dtype=float), kind='stable')
        # Exclude the query explicitly; ties mean it is not guaranteed to rank first.
        order = order[order != pos][:top_n]
        return self.movies[columns].iloc[order]

    def get_collaborative_recommendations(self, user_id, n=5):
        """Return the ``n`` highest predicted ``(movieId, rating)`` pairs the user has not rated."""
        if self.cf_model is None:
            raise RuntimeError("train_collaborative_model() must be called first")
        ratings = self._ratings_frame()
        seen = ratings.loc[ratings[self.user_col] == user_id, 'movieId'].unique()
        candidates = np.setdiff1d(ratings['movieId'].unique(), seen)

        predictions = [
            (movie_id, self.cf_model.predict(user_id, movie_id).est)
            for movie_id in candidates
        ]
        return sorted(predictions, key=lambda pair: pair[1], reverse=True)[:n]

    def get_hybrid_recommendations(self, user_id, n=5, cf_weight=0.7):
        """Re-rank the top collaborative picks by a weighted blend with content similarity.

        Args:
            user_id: User to recommend for.
            n (int): Number of ``(title, score)`` pairs to return.
            cf_weight (float): Weight of the predicted rating; the content
                score gets ``1 - cf_weight``.

        NOTE(review): the predicted rating is on the rating scale while the
        content score is a cosine similarity in [0, 1]; the blend is kept as in
        the original formula but the two terms are not on a common scale.
        """
        hybrid_recs = []
        for movie_id, cf_score in self.get_collaborative_recommendations(user_id, n=n):
            cb_score = self._content_score(movie_id, top_n=5)
            hybrid_recs.append((movie_id, cf_weight * cf_score + (1 - cf_weight) * cb_score))

        hybrid_recs = sorted(hybrid_recs, key=lambda pair: pair[1], reverse=True)
        return [(self._title_of(movie_id), score) for movie_id, score in hybrid_recs[:n]]

    def evaluate_recommendations(self, test_set):
        """Return ``(rmse, mae)`` of the collaborative model on ``test_set``.

        Only the first three fields (user id, item id, true rating) of each
        entry are read, so both 3-field and longer tuples are accepted.
        """
        rows = [tuple(entry)[:3] for entry in test_set]
        true_ratings = [true_r for _, _, true_r in rows]
        predictions = [self.cf_model.predict(uid, iid).est for uid, iid, _ in rows]

        rmse = np.sqrt(mean_squared_error(true_ratings, predictions))
        mae = mean_absolute_error(true_ratings, predictions)

        return rmse, mae


# Example usage
if __name__ == "__main__":
    # `df` must be the ratings frame merged with movie metadata (user id,
    # movieId, rating, title, genres), not the bare movies.csv table.
    # Build the recommender here so the cell does not depend on an object
    # left over in the kernel from an earlier run.
    recommender = HybridMovieRecommender(df)
    recommender.train_models()

    # Get hybrid recommendations for a sample user
    sample_user_id = 1
    recommendations = recommender.get_hybrid_recommendations(sample_user_id, n=5)

    print(f"Hybrid Recommendations for User {sample_user_id}:")
    for title, score in recommendations:
        print(f"{title} (Score: {score:.2f})")

    # Evaluate on the ratings held out during training; a fresh split with a
    # different seed would overlap the training data and flatter the metrics.
    rmse, mae = recommender.evaluate_recommendations(recommender.testset)
    print("\nModel Evaluation:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
"fcd47967-e4a5-4915-a4ad-5ba0ebf8cae4", + "cell_type": "code", + "execution_count": 73, + "id": "670bf220-0e8a-4dd5-b349-8cbf3aa7bcbe", "metadata": {}, + "outputs": [ + { + "ename": "IndexError", + "evalue": "index 0 is out of bounds for axis 0 with size 0", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[73], line 87\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;66;03m# Example usage\u001b[39;00m\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 84\u001b[0m \u001b[38;5;66;03m# # Load your data\u001b[39;00m\n\u001b[1;32m 85\u001b[0m \u001b[38;5;66;03m# df = pd.read_csv('your_movie_data.csv') # Replace with your actual data file\u001b[39;00m\n\u001b[0;32m---> 87\u001b[0m recommender \u001b[38;5;241m=\u001b[39m \u001b[43mHybridRecommender\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 88\u001b[0m recommender\u001b[38;5;241m.\u001b[39mrecommend_movies(user_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, n\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m) \u001b[38;5;66;03m# Replace with an actual user ID from your data\u001b[39;00m\n", + "Cell \u001b[0;32mIn[73], line 17\u001b[0m, in \u001b[0;36mHybridRecommender.__init__\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcosine_sim \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# Map DataFrame columns to expected names\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muser_col \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontains\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43muser\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcase\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitem_col \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf\u001b[38;5;241m.\u001b[39mcolumns[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmovie\u001b[39m\u001b[38;5;124m'\u001b[39m, case\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)][\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrating_col \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf\u001b[38;5;241m.\u001b[39mcolumns[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m'\u001b[39m, case\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)][\u001b[38;5;241m0\u001b[39m]\n", + "File 
import re

import numpy as np
import pandas as pd


class HybridRecommender:
    """Hybrid movie recommender blending collaborative and content-based scores.

    The collaborative part is a Surprise ``SVD`` model fitted on the
    (user, movie, rating) triples of ``df``.  The content-based part is a
    TF-IDF / cosine-similarity model over the cleaned title and genre text of
    every *distinct* movie in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column each whose name contains ``user``, ``movie``,
        ``rating``, ``title`` and ``genre`` (case-insensitive).  The first
        matching column is used for each role.  The frame is never modified.
    random_state : int or None, default 42
        Seed handed to ``SVD`` so that repeated runs give the same model.

    Attributes
    ----------
    collab_model : fitted ``SVD`` model, or ``None`` before training.
    tfidf_matrix : TF-IDF matrix with one row per distinct movie.
    cosine_sim : square similarity matrix; row/column ``i`` refers to row ``i``
        of ``movies``.
    movies : de-duplicated (movie id, title, genre) frame with a 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df`` lacks a column for one of the five roles.
    """

    def __init__(self, df, random_state=42):
        self.df = df
        self.random_state = random_state
        self.collab_model = None
        self.tfidf_matrix = None
        self.cosine_sim = None
        self.movies = None
        # movie id -> row position in ``movies`` (and row/column of ``cosine_sim``)
        self._movie_pos = None

        # Map DataFrame columns to expected names
        self.user_col = self._find_column('user')
        self.item_col = self._find_column('movie')
        self.rating_col = self._find_column('rating')
        self.title_col = self._find_column('title')
        self.genre_col = self._find_column('genre')

    def _find_column(self, keyword):
        """Return the first column whose name contains ``keyword`` (case-insensitive)."""
        for column in self.df.columns:
            if keyword in str(column).lower():
                return column
        # An explicit message instead of a bare "index 0 is out of bounds".
        raise ValueError(
            f"No column containing '{keyword}' found in DataFrame; "
            f"available columns: {list(self.df.columns)}"
        )

    def _build_catalogue(self):
        """Create the one-row-per-movie frame and its id -> position lookup."""
        movies = (
            self.df[[self.item_col, self.title_col, self.genre_col]]
            .dropna(subset=[self.item_col])
            .drop_duplicates(subset=self.item_col)
            .reset_index(drop=True)
        )
        self.movies = movies
        self._movie_pos = {
            movie_id: pos for pos, movie_id in enumerate(movies[self.item_col])
        }
        return movies

    def _position_of(self, movie_id):
        """Translate a movie id into its row position in the similarity matrix."""
        if self.cosine_sim is None or self._movie_pos is None:
            raise RuntimeError(
                "Content-based model is not trained; call train_content_based_model() first."
            )
        try:
            return self._movie_pos[movie_id]
        except KeyError:
            raise KeyError(f"Unknown movie id: {movie_id!r}") from None

    def _similar_positions(self, pos, n):
        """Positions of the ``n`` movies most similar to ``pos``, excluding ``pos`` itself."""
        scores = np.asarray(self.cosine_sim[pos])
        ranked = np.argsort(-scores, kind="stable")
        # Drop the movie itself explicitly rather than assuming it sorts first.
        return ranked[ranked != pos][:n]

    def train_collaborative_model(self):
        """Fit the SVD model on every available rating."""
        # Imported here so that building the recommender and querying
        # already-fitted models does not require Surprise to be importable.
        from surprise import SVD, Dataset, Reader

        ratings = self.df[[self.user_col, self.item_col, self.rating_col]].dropna()
        reader = Reader(
            rating_scale=(ratings[self.rating_col].min(), ratings[self.rating_col].max())
        )
        data = Dataset.load_from_df(ratings, reader)
        self.collab_model = SVD(random_state=self.random_state)
        # No evaluation happens in this class, so holding out part of the data
        # would only throw ratings away: train on all of them.
        self.collab_model.fit(data.build_full_trainset())

    def train_content_based_model(self):
        """Fit TF-IDF on title + genre text and pre-compute movie similarities."""
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity

        # One row per movie: the ratings frame repeats a movie once per rating,
        # which would inflate the similarity matrix to n_ratings x n_ratings and
        # make a movie's closest "neighbours" its own duplicates.
        movies = self._build_catalogue()
        clean_titles = movies[self.title_col].map(
            lambda title: re.sub("[^a-zA-Z0-9 ]", "", str(title).lower())
        )
        features = clean_titles + ' ' + movies[self.genre_col].astype(str)

        tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = tfidf.fit_transform(features)
        self.cosine_sim = cosine_similarity(self.tfidf_matrix)

    def train_models(self):
        """Train both the collaborative and the content-based model."""
        self.train_collaborative_model()
        self.train_content_based_model()

    def get_collaborative_recommendations(self, user_id, n=10):
        """Return the ``n`` unseen movies with the highest predicted rating.

        Returns a list of ``(movie_id, estimated_rating)`` tuples, best first.
        Raises ``RuntimeError`` if the collaborative model is not trained.
        """
        if self.collab_model is None:
            raise RuntimeError(
                "Collaborative model is not trained; call train_collaborative_model() first."
            )
        seen = self.df.loc[self.df[self.user_col] == user_id, self.item_col].unique()
        all_movies = self.df[self.item_col].dropna().unique()
        candidates = np.setdiff1d(all_movies, seen)

        predictions = [
            (movie_id, self.collab_model.predict(user_id, movie_id).est)
            for movie_id in candidates
        ]

        return sorted(predictions, key=lambda pair: pair[1], reverse=True)[:n]

    def get_content_based_recommendations(self, movie_id, n=10):
        """Return a frame (id, title, genre) of the ``n`` movies most similar to ``movie_id``.

        Raises ``RuntimeError`` if the content model is not trained and
        ``KeyError`` if ``movie_id`` is not in the catalogue.
        """
        pos = self._position_of(movie_id)
        neighbours = self._similar_positions(pos, n)
        return self.movies.iloc[neighbours][[self.item_col, self.title_col, self.genre_col]]

    def get_hybrid_recommendations(self, user_id, n=10, collab_weight=0.7, content_weight=0.3):
        """Re-rank the top collaborative candidates with a content-based score.

        For each of the ``n`` best collaborative candidates, the content score
        is the mean similarity between the candidate and its five most similar
        movies; the final score is
        ``collab_weight * predicted_rating + content_weight * content_score``.

        Returns a list of ``(title, hybrid_score)`` tuples, best first.
        """
        # NOTE(review): the collaborative score lives on the rating scale while
        # the content score is a cosine similarity, so with the default weights
        # the collaborative term dominates. Confirm whether that is intended.
        collab_recs = self.get_collaborative_recommendations(user_id, n=n)

        hybrid_recs = []
        for movie_id, collab_score in collab_recs:
            pos = self._position_of(movie_id)
            neighbours = self._similar_positions(pos, 5)
            # A one-movie catalogue has no neighbours; avoid the mean of an empty slice.
            if len(neighbours):
                content_score = float(np.asarray(self.cosine_sim)[pos, neighbours].mean())
            else:
                content_score = 0.0
            hybrid_score = collab_weight * collab_score + content_weight * content_score
            hybrid_recs.append((movie_id, hybrid_score))

        hybrid_recs = sorted(hybrid_recs, key=lambda pair: pair[1], reverse=True)
        titles = self.movies[self.title_col]
        return [
            (titles.iloc[self._movie_pos[movie_id]], score)
            for movie_id, score in hybrid_recs[:n]
        ]

    def recommend_movies(self, user_id, n=10):
        """Print the top ``n`` hybrid recommendations for ``user_id``.

        Models are trained on first use only; later calls reuse them.
        """
        if self.collab_model is None or self.cosine_sim is None:
            self.train_models()
        recommendations = self.get_hybrid_recommendations(user_id, n)
        print(f"\nTop {n} Recommendations for User {user_id}:")
        for i, (title, score) in enumerate(recommendations, 1):
            print(f"{i}. {title} (Score: {score:.2f})")


# Example usage
if __name__ == "__main__":
    # `df` is expected to be the merged ratings + movie-metadata frame (user,
    # movie, rating, title and genre columns). A movies-only frame is rejected
    # by the constructor with a ValueError naming the missing column.
    recommender = HybridRecommender(df)
    recommender.recommend_movies(user_id=1, n=5)  # Replace with an actual user ID from your data