From 8320b2ef8be8d63abbaf17c6cf9e2137ac8f2b43 Mon Sep 17 00:00:00 2001
From: Evaclaire Wamitu <evaclaire.wamitu@student.moringaschool.com>
Date: Mon, 29 Jul 2024 09:55:06 +0300
Subject: [PATCH] Add hybrid model

---
 movie_recommendor.ipynb | 733 ++++++++++++++--------------------------
 1 file changed, 259 insertions(+), 474 deletions(-)

diff --git a/movie_recommendor.ipynb b/movie_recommendor.ipynb
index d36f0af..2a3d64a 100644
--- a/movie_recommendor.ipynb
+++ b/movie_recommendor.ipynb
@@ -120,10 +120,9 @@
    "id": "f28f3f7d-5712-458c-bc61-7730778d795e",
    "metadata": {},
    "source": [
-    "1. To build a collaborative filtering model using user ratings to generate top 5 movie recommendations, leveraging algorithms such as Singular Value Decomposition (SVD) and k-Nearest Neighbors (k-NN).\n",
-    "2. To address the cold start problem for new users by integrating content-based filtering, utilizing features such as movie genres, directors, and cast.\n",
-    "3. To evaluate the hybrid recommendation system using appropriate metrics like Root Mean Square Error (RMSE), Mean Average Precision (MAP), and Normalized Discounted Cumulative Gain (NDCG) to ensure accuracy and relevance of the recommendations.\n",
-    "\n"
+    "1. To build a collaborative filtering model using user ratings to generate top 5 movie recommendations leveraging algorithms such as Singular Value Decomposition (SVD) and k-Nearest Neighbors (k-NN).\n",
+    "2. To address the cold start problem for new users by developing content-based filtering, utilizing features such as movie genres and titles.\n",
+    "3. To build a hybrid recommendation system that combines the collaborative and content-based filtering models, and to evaluate it using metrics such as Root Mean Square Error (RMSE) and Mean Average Precision (MAP) to ensure the accuracy and relevance of the recommendations."
    ]
   },
   {
@@ -134,10 +133,7 @@
     "## Success Metrics\n",
     "\n",
     "1. Root Mean Square Error (RMSE) < 0.9 for rating predictions\n",
-    "2. Mean Average Precision @5 (MAP@5) > 0.3 for recommended movies\n",
-    "3. Precision@5 of around 0.2 to 0.5\n",
-    "4. Recall@5 of around 0.2 to 0.5\n",
-    "5. F1 Score of around 0.3 to 0.7"
+    "2. Mean Average Precision @k (MAP@k) > 0.3 for recommended movies where k = 5\n"
    ]
   },
   {
@@ -2720,7 +2716,7 @@
    "id": "0c954d06-e38c-447b-901f-d4bdd36aad59",
    "metadata": {},
    "source": [
-    "The output reveals the results of tuning and evaluating recommendation models using grid search. For the SVD model, the optimal parameters were identified as having 100 factors, 30 epochs, a learning rate of 0.01 and regularization of 0.1 achieving the best RMSE of approximately 0.862. In contrast, the KNN model required extensive computation of similarity matrices for various configurations including Pearson, cosine and MSD (Mean Squared Difference) similarities. The best parameters for the KNN model were found to be 30 neighbors, a minimum of 10 neighbors and using the MSD similarity metric with a non-user-based approach resulting in a higher RMSE of about 0.917. Consequently, the SVD model emerged as the superior choice with the lowest RMSE and we therefore selected it as the best model overall."
+    "The output reveals the results of tuning and evaluating recommendation models using grid search. For the SVD model, the optimal parameters were identified as 100 factors, 30 epochs, a learning rate of 0.01 and regularization of 0.1, achieving the best RMSE of approximately 0.862. In contrast, the KNN model required extensive computation of similarity matrices for various configurations, including Pearson, cosine and MSD (Mean Squared Difference) similarities. The best parameters for the KNN model were found to be 30 neighbors, with a minimum of 10 neighbors, and the cosine similarity metric with an item-based (non-user-based) approach, resulting in a higher RMSE of about 0.917. Consequently, the SVD model emerged as the superior choice with the lowest RMSE, and we therefore selected it as the best model overall."
    ]
   },
   {
@@ -2983,90 +2979,11 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "57e1252e-44b5-4605-a71a-ff86b4496b3a",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 65,
-   "id": "ddc7d36d-6264-4ba1-a986-f6d9b59c2193",
+   "cell_type": "markdown",
+   "id": "8f9045e0-ca85-4ebf-a75d-a68c2a73006f",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>title</th>\n",
-       "      <th>genres</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>100813</th>\n",
-       "      <td>Black Butler: Book of the Atlantic</td>\n",
-       "      <td>action, animation, comedy, fantasy</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>100814</th>\n",
-       "      <td>No Game No Life: Zero</td>\n",
-       "      <td>animation, comedy, fantasy</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>100815</th>\n",
-       "      <td>Flint</td>\n",
-       "      <td>drama</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>100816</th>\n",
-       "      <td>Bungo Stray Dogs: Dead Apple</td>\n",
-       "      <td>action, animation</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>100817</th>\n",
-       "      <td>Andrew Dice Clay: Dice Rules</td>\n",
-       "      <td>comedy</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                     title                              genres\n",
-       "100813  Black Butler: Book of the Atlantic  action, animation, comedy, fantasy\n",
-       "100814               No Game No Life: Zero          animation, comedy, fantasy\n",
-       "100815                               Flint                               drama\n",
-       "100816        Bungo Stray Dogs: Dead Apple                   action, animation\n",
-       "100817        Andrew Dice Clay: Dice Rules                              comedy"
-      ]
-     },
-     "execution_count": 65,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
-    "content_df.tail()"
+    "### Summary"
    ]
   },
   {
@@ -3085,6 +3002,14 @@
     "## Content Based"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "5682dd7a-ffa5-4238-828d-1eda7c95e082",
+   "metadata": {},
+   "source": [
+    "A content-based recommender system is a type of recommendation algorithm that suggests items to users based on the characteristics or features of the items they have previously liked or interacted with. It analyzes the content or attributes of items (such as movies, books, or products) to find similarities and make recommendations."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 28,
@@ -3176,441 +3101,301 @@
   },
   {
    "cell_type": "markdown",
-   "id": "fcd47967-e4a5-4915-a4ad-5ba0ebf8cae4",
+   "id": "0e1fd17e-6eb7-49f5-9bb5-b6cb231f45b5",
    "metadata": {},
    "source": [
-    "## Hybrid System"
+    "### Summary"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "c07a19f0-fcde-450c-8fd0-eac2c0eac9f7",
-   "metadata": {},
-   "source": [
-    "The `HybridRecommender` class combines two types of recommendation systems namely content-based and collaborative filtering into a hybrid model. This class takes in two datasets (one for each recommendation model) and provides a unified recommendation list based on both models."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 71,
-   "id": "8867a083-b7bc-4f29-b78a-7a42dd3ca443",
+   "id": "ba4d1bad-c35d-4932-bdf9-4ded5bd19984",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "KeyError",
-     "evalue": "\"['rating', 'user_id'] not in index\"",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[71], line 99\u001b[0m\n\u001b[1;32m     94\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m     95\u001b[0m     \u001b[38;5;66;03m# # Load movie data\u001b[39;00m\n\u001b[1;32m     96\u001b[0m     \u001b[38;5;66;03m# df = pd.read_csv('movies_data.csv')  # Update with the correct path\u001b[39;00m\n\u001b[1;32m     98\u001b[0m     recommender \u001b[38;5;241m=\u001b[39m HybridMovieRecommender(df)\n\u001b[0;32m---> 99\u001b[0m     \u001b[43mrecommender\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_collaborative_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    100\u001b[0m     recommender\u001b[38;5;241m.\u001b[39mtrain_content_based_model()\n\u001b[1;32m    102\u001b[0m     \u001b[38;5;66;03m# Get hybrid recommendations for a sample user\u001b[39;00m\n",
-      "Cell \u001b[0;32mIn[71], line 10\u001b[0m, in \u001b[0;36mHybridMovieRecommender.train_collaborative_model\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtrain_collaborative_model\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m      9\u001b[0m     reader \u001b[38;5;241m=\u001b[39m Reader(rating_scale\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m5\u001b[39m))\n\u001b[0;32m---> 10\u001b[0m     data \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mload_from_df(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43muser_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmovieId\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m, reader)\n\u001b[1;32m     11\u001b[0m     trainset, _ \u001b[38;5;241m=\u001b[39m train_test_split(data, test_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.2\u001b[39m)\n\u001b[1;32m     12\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcollaborative_model \u001b[38;5;241m=\u001b[39m SVD()\n",
-      "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/frame.py:2908\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   2906\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m   2907\u001b[0m         key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 2908\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_listlike_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m   2910\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m   2911\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n",
-      "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1254\u001b[0m, in \u001b[0;36m_LocIndexer._get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m   1251\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1252\u001b[0m     keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m ax\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 1254\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_read_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mraise_missing\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m keyarr, indexer\n",
-      "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1304\u001b[0m, in \u001b[0;36m_LocIndexer._validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m   1302\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m raise_missing:\n\u001b[1;32m   1303\u001b[0m     not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(key) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mset\u001b[39m(ax))\n\u001b[0;32m-> 1304\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   1306\u001b[0m \u001b[38;5;66;03m# we skip the warning on Categorical\u001b[39;00m\n\u001b[1;32m   1307\u001b[0m \u001b[38;5;66;03m# as this check is actually done (check for\u001b[39;00m\n\u001b[1;32m   1308\u001b[0m \u001b[38;5;66;03m# non-missing values), but a bit later in the\u001b[39;00m\n\u001b[1;32m   1309\u001b[0m \u001b[38;5;66;03m# code, so we want to avoid warning & then\u001b[39;00m\n\u001b[1;32m   1310\u001b[0m \u001b[38;5;66;03m# just raising\u001b[39;00m\n\u001b[1;32m   1311\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ax\u001b[38;5;241m.\u001b[39mis_categorical():\n",
-      "\u001b[0;31mKeyError\u001b[0m: \"['rating', 'user_id'] not in index\""
-     ]
-    }
-   ],
    "source": [
-    "class HybridMovieRecommender:\n",
-    "    def __init__(self, df):\n",
-    "        self.df = df\n",
-    "        self.collaborative_model = None\n",
-    "        self.tfidf_matrix = None\n",
-    "        self.cosine_sim = None\n",
+    "The class above implements a content-based movie recommender system using genres as the primary feature. The code begins by importing the necessary libraries and preparing the dataset, ensuring that only unique movie titles are retained. The `ContentBasedModel` class encapsulates the core functionality. Within its `train_model` method, movie genres are converted into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency) vectorization, creating a matrix of genre features. The cosine similarity between all pairs of movies is then calculated from these TF-IDF representations, producing a similarity matrix that identifies movies with similar genres. The `get_recommendations` method uses this similarity matrix to return the top 5 most similar movies to a given title. The code demonstrates the class's usage by creating an instance, training the model and getting recommendations for 'Toy Story'. Finally, it saves the trained model, including the TF-IDF matrix, cosine similarity matrix and movie indices, to a pickle file for future use.\n",
+    "The recommender system outputs the top 5 movies most similar to 'Toy Story' based on genre similarity, i.e. 'Antz', 'Toy Story 2', 'The Adventures of Rocky and Bullwinkle', 'The Emperor's New Groove' and 'Monsters, Inc.'. Each recommendation shares key characteristics with 'Toy Story': they are all family-friendly animations with a focus on adventure and comedy. These recommendations are therefore relevant and likely to appeal to fans of 'Toy Story'.\n",
     "\n",
-    "    def train_collaborative_model(self):\n",
-    "        reader = Reader(rating_scale=(1, 5))\n",
-    "        data = Dataset.load_from_df(self.df[['user_id', 'movieId', 'rating']], reader)\n",
-    "        trainset, _ = train_test_split(data, test_size=0.2)\n",
-    "        self.collaborative_model = SVD()\n",
-    "        self.collaborative_model.fit(trainset)\n",
     "\n",
-    "    def train_content_based_model(self):\n",
-    "        self.df['clean_title'] = self.df['title'].apply(lambda x: re.sub(\"[^a-zA-Z0-9 ]\", \"\", x).lower())\n",
-    "        self.df['features'] = self.df['clean_title'] + ' ' + self.df['genres']\n",
-    "        \n",
-    "        tfidf = TfidfVectorizer(stop_words='english')\n",
-    "        self.tfidf_matrix = tfidf.fit_transform(self.df['features'])\n",
-    "        self.cosine_sim = cosine_similarity(self.tfidf_matrix)\n",
     "\n",
-    "    def get_collaborative_recommendations(self, user_id, n=5):\n",
-    "        user_movies = self.df[self.df['user_id'] == user_id]['movieId'].unique()\n",
-    "        all_movies = self.df['movieId'].unique()\n",
-    "        movies_to_predict = np.setdiff1d(all_movies, user_movies)\n",
-    "        \n",
-    "        predictions = [\n",
-    "            (movie_id, self.collaborative_model.predict(user_id, movie_id).est)\n",
-    "            for movie_id in movies_to_predict\n",
-    "        ]\n",
-    "        \n",
-    "        recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)\n",
-    "        return recommendations[:n]\n",
     "\n",
-    "    def get_content_based_recommendations(self, title, top_n=10):\n",
-    "        idx = self.df.index[self.df['title'] == title].tolist()\n",
-    "        if not idx:\n",
-    "            return []\n",
-    "\n",
-    "        idx = idx[0]\n",
-    "        sim_scores = list(enumerate(self.cosine_sim[idx]))\n",
-    "        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
-    "        sim_scores = sim_scores[1:top_n+1]\n",
-    "        movie_indices = [i[0] for i in sim_scores]\n",
-    "        \n",
-    "        return self.df[['movieId', 'title', 'genres']].iloc[movie_indices].reset_index(drop=True)\n",
     "\n",
-    "    def get_hybrid_recommendations(self, user_id, n=10):\n",
-    "        collaborative_recs = self.get_collaborative_recommendations(user_id, n=n)\n",
-    "        \n",
-    "        hybrid_recs = []\n",
-    "        for movie_id, collab_score in collaborative_recs:\n",
-    "            movie = self.df[self.df['movieId'] == movie_id].iloc[0]\n",
-    "            content_recs = self.get_content_based_recommendations(movie['title'], top_n=5)\n",
-    "            content_score = np.mean([self.cosine_sim[movie_id][rec['movieId']] for _, rec in content_recs.iterrows()])\n",
-    "            hybrid_score = 0.7 * collab_score + 0.3 * content_score\n",
-    "            hybrid_recs.append((movie_id, hybrid_score))\n",
-    "        \n",
-    "        hybrid_recs = sorted(hybrid_recs, key=lambda x: x[1], reverse=True)\n",
-    "        return hybrid_recs[:n]\n",
-    "\n",
-    "    def evaluate_model(self):\n",
-    "        reader = Reader(rating_scale=(1, 5))\n",
-    "        data = Dataset.load_from_df(self.df[['user_id', 'movieId', 'rating']], reader)\n",
-    "        \n",
-    "        # Collaborative Filtering Evaluation\n",
-    "        cv_results = cross_validate(self.collaborative_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)\n",
-    "        \n",
-    "        print(\"Collaborative Filtering Evaluation:\")\n",
-    "        print(f\"RMSE: {np.mean(cv_results['test_rmse']):.4f} (+/- {np.std(cv_results['test_rmse']):.4f})\")\n",
-    "        print(f\"MAE: {np.mean(cv_results['test_mae']):.4f} (+/- {np.std(cv_results['test_mae']):.4f})\")\n",
-    "        \n",
-    "        # Content-Based Filtering Evaluation\n",
-    "        sample_size = min(1000, len(self.df))\n",
-    "        sample = self.df.sample(sample_size)\n",
-    "        \n",
-    "        content_based_rmse = []\n",
-    "        content_based_mae = []\n",
-    "        \n",
-    "        for _, row in sample.iterrows():\n",
-    "            recs = self.get_content_based_recommendations(row['title'], top_n=10)\n",
-    "            if not recs.empty:\n",
-    "                pred_ratings = [self.cosine_sim[row['movieId']][rec['movieId']] for _, rec in recs.iterrows()]\n",
-    "                true_rating = row['rating']\n",
-    "                content_based_rmse.append(mean_squared_error([true_rating], pred_ratings, squared=False))\n",
-    "                content_based_mae.append(mean_absolute_error([true_rating], pred_ratings))\n",
-    "        \n",
-    "        print(\"\\nContent-Based Filtering Evaluation:\")\n",
-    "        print(f\"RMSE: {np.mean(content_based_rmse):.4f} (+/- {np.std(content_based_rmse):.4f})\")\n",
-    "        print(f\"MAE: {np.mean(content_based_mae):.4f} (+/- {np.std(content_based_mae):.4f})\")\n",
-    "\n",
-    "# Example usage\n",
-    "if __name__ == \"__main__\":\n",
-    "    # # Load movie data\n",
-    "    # df = pd.read_csv('movies_data.csv')  # Update with the correct path\n",
-    "    \n",
-    "    recommender = HybridMovieRecommender(df)\n",
-    "    recommender.train_collaborative_model()\n",
-    "    recommender.train_content_based_model()\n",
-    "    \n",
-    "    # Get hybrid recommendations for a sample user\n",
-    "    sample_user_id = 1\n",
-    "    recommendations = recommender.get_hybrid_recommendations(sample_user_id, n=5)\n",
-    "    \n",
-    "    print(f\"Hybrid Recommendations for User {sample_user_id}:\")\n",
-    "    for movie_id, score in recommendations:\n",
-    "        movie = df[df['movieId'] == movie_id].iloc[0]\n",
-    "        print(f\"{movie['title']} (Score: {score:.2f})\")\n",
-    "    \n",
-    "    # Evaluate the model\n",
-    "    recommender.evaluate_model()"
+    "\n"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ee92de36-5476-4199-a1f0-2ce55ec1e33b",
+   "cell_type": "markdown",
+   "id": "fcd47967-e4a5-4915-a4ad-5ba0ebf8cae4",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "\n",
-    "class HybridRecommender:\n",
-    "    def __init__(self, content_df, collab_df):\n",
-    "        self.content_model = ContentBasedModel(content_df)\n",
-    "        self.collab_model = CollabBasedModel(collab_df)\n",
-    "        self.content_model.train_model()\n",
-    "        self.collab_model.train_model()\n",
-    "\n",
-    "    def get_hybrid_recommendations(self, title, user_ratings, top_n=5, alpha=0.5):\n",
-    "        # Get recommendations from both models\n",
-    "        content_recs = self.content_model.get_recommendations(title, top_n=top_n)\n",
-    "        collab_recs = self.collab_model.get_recommendations(user_ratings, n=top_n)\n",
-    "        \n",
-    "        # Convert collaborative recommendations to DataFrame\n",
-    "        collab_recs_df = pd.DataFrame(collab_recs, columns=['movieId', 'predicted_rating'])\n",
-    "        \n",
-    "        # Merge content-based recommendations with collaborative predictions\n",
-    "        recommendations = content_recs.merge(collab_recs_df, left_on='movieId', right_on='movieId', how='left')\n",
-    "        \n",
-    "        # Normalize and weight recommendations\n",
-    "        recommendations['final_score'] = (alpha * recommendations['predicted_rating']) + ((1 - alpha) * recommendations['cosine_sim'])\n",
-    "        recommendations = recommendations.sort_values(by='final_score', ascending=False)\n",
-    "        \n",
-    "        return recommendations[['title', 'genres']].head(top_n).reset_index(drop=True)\n",
-    "\n",
-    "# Example usage\n",
-    "if __name__ == \"__main__\":\n",
-    "    content_df = pd.read_csv('content_movies.csv')  # Replace with actual path\n",
-    "    collab_df = pd.read_csv('collab_ratings.csv')  # Replace with actual path\n",
-    "\n",
-    "    hybrid_recommender = HybridRecommender(content_df, collab_df)\n",
-    "    user_ratings = [(1, 5), (2, 4)]  # Example user ratings\n",
-    "    recommendations = hybrid_recommender.get_hybrid_recommendations('Sommersby', user_ratings, top_n=5)\n",
-    "\n",
-    "    print(\"Hybrid Recommendations:\")\n",
-    "    print(recommendations)\n"
+    "## Hybrid System"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c07a19f0-fcde-450c-8fd0-eac2c0eac9f7",
+   "metadata": {},
+   "source": [
+    "The `HybridRecommender` class combines two types of recommendation systems, namely content-based and collaborative filtering, into a hybrid model. This class takes in two datasets (one for each recommendation model) and provides a unified recommendation list based on both models."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
-   "id": "4d918c38-f2d3-4d13-b0dc-4d6c36a57493",
+   "execution_count": 49,
+   "id": "29fa9bcb-c197-453e-b5f5-0bf14cc8943b",
    "metadata": {},
    "outputs": [
     {
-     "ename": "KeyError",
-     "evalue": "\"['rating', 'user_id'] not in index\"",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[66], line 85\u001b[0m\n\u001b[1;32m     82\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmovies_data/movies.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)  \u001b[38;5;66;03m# Update with the correct path\u001b[39;00m\n\u001b[1;32m     84\u001b[0m recommender \u001b[38;5;241m=\u001b[39m HybridMovieRecommender(df)\n\u001b[0;32m---> 85\u001b[0m \u001b[43mrecommender\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_models\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     87\u001b[0m \u001b[38;5;66;03m# Get hybrid recommendations for a sample user\u001b[39;00m\n\u001b[1;32m     88\u001b[0m sample_user_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
-      "Cell \u001b[0;32mIn[66], line 13\u001b[0m, in \u001b[0;36mHybridMovieRecommender.train_models\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     11\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtrain_models\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m     12\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtrain_content_based_model()\n\u001b[0;32m---> 13\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_collaborative_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
-      "Cell \u001b[0;32mIn[66], line 25\u001b[0m, in \u001b[0;36mHybridMovieRecommender.train_collaborative_model\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     23\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtrain_collaborative_model\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m     24\u001b[0m     reader \u001b[38;5;241m=\u001b[39m Reader(rating_scale\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m5\u001b[39m))\n\u001b[0;32m---> 25\u001b[0m     data \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mload_from_df(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43muser_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmovieId\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m, reader)\n\u001b[1;32m     26\u001b[0m     trainset, _ \u001b[38;5;241m=\u001b[39m train_test_split(data, test_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.2\u001b[39m)\n\u001b[1;32m     27\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcf_model \u001b[38;5;241m=\u001b[39m SVD()\n",
-      "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/frame.py:2908\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   2906\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m   2907\u001b[0m         key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 2908\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_listlike_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m   2910\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m   2911\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n",
-      "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1254\u001b[0m, in \u001b[0;36m_LocIndexer._get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m   1251\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1252\u001b[0m     keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m ax\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 1254\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_read_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraise_missing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mraise_missing\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m keyarr, indexer\n",
-      "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexing.py:1304\u001b[0m, in \u001b[0;36m_LocIndexer._validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m   1302\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m raise_missing:\n\u001b[1;32m   1303\u001b[0m     not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(key) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mset\u001b[39m(ax))\n\u001b[0;32m-> 1304\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   1306\u001b[0m \u001b[38;5;66;03m# we skip the warning on Categorical\u001b[39;00m\n\u001b[1;32m   1307\u001b[0m \u001b[38;5;66;03m# as this check is actually done (check for\u001b[39;00m\n\u001b[1;32m   1308\u001b[0m \u001b[38;5;66;03m# non-missing values), but a bit later in the\u001b[39;00m\n\u001b[1;32m   1309\u001b[0m \u001b[38;5;66;03m# code, so we want to avoid warning & then\u001b[39;00m\n\u001b[1;32m   1310\u001b[0m \u001b[38;5;66;03m# just raising\u001b[39;00m\n\u001b[1;32m   1311\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ax\u001b[38;5;241m.\u001b[39mis_categorical():\n",
-      "\u001b[0;31mKeyError\u001b[0m: \"['rating', 'user_id'] not in index\""
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Movie: Dick (1999)\n",
+      "Genre: comedy\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Rate this movie from 1 to 5 (or 'x' if you haven't watched it):  3\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Movie: Girl Who Kicked the Hornet's Nest, The (Luftslottet som sprängdes) (2009)\n",
+      "Genre: action, crime, mystery\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Rate this movie from 1 to 5 (or 'x' if you haven't watched it):  5\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Movie: Return to Treasure Island (1988)\n",
+      "Genre: adventure, animation, comedy\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Rate this movie from 1 to 5 (or 'x' if you haven't watched it):  5\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Movie: Center Stage (2000)\n",
+      "Genre: drama, musical\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Rate this movie from 1 to 5 (or 'x' if you haven't watched it):  x\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Movie: Sleepaway Camp (1983)\n",
+      "Genre: horror\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Rate this movie from 1 to 5 (or 'x' if you haven't watched it):  3\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Hybrid Recommended movies:\n",
+      "1. Shawshank Redemption, The (1994) - Hybrid score: 4.20\n",
+      "   Genre: crime, drama\n",
+      "2. Lawrence of Arabia (1962) - Hybrid score: 4.19\n",
+      "   Genre: adventure, drama, war\n",
+      "3. Cool Hand Luke (1967) - Hybrid score: 4.14\n",
+      "   Genre: drama\n",
+      "4. Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) - Hybrid score: 3.63\n",
+      "   Genre: action, adventure\n",
+      "5. Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) - Hybrid score: 3.14\n",
+      "   Genre: comedy, war\n"
      ]
     }
    ],
    "source": [
-    "from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
-    "import re\n",
+    "class HybridModel:\n",
+    "    '''\n",
+    "    A hybrid recommender system that combines collaborative filtering and content-based filtering.\n",
+    "    '''\n",
     "\n",
-    "class HybridMovieRecommender:\n",
-    "    def __init__(self, df):\n",
-    "        self.df = df\n",
-    "        self.cf_model = None\n",
-    "        self.tfidf_matrix = None\n",
-    "        self.cosine_sim = None\n",
+    "    def __init__(self, collab_df, content_df):\n",
+    "        '''\n",
+    "        Build both underlying recommenders and a one-row-per-movie lookup table.\n",
+    "        Args:\n",
+    "            collab_df (pd.DataFrame): Passed to CollabBasedModel and merged on 'movieId'.\n",
+    "            content_df (pd.DataFrame): Passed to ContentBasedModel and merged on 'movieId'.\n",
+    "        '''\n",
+    "        self.collab_model = CollabBasedModel(collab_df)\n",
+    "        self.content_model = ContentBasedModel(content_df)\n",
+    "        merged_df = pd.merge(collab_df, content_df, on='movieId')\n",
+    "        self.df = merged_df.drop_duplicates(subset=['movieId'])\n",
     "\n",
     "    def train_models(self):\n",
-    "        self.train_content_based_model()\n",
-    "        self.train_collaborative_model()\n",
+    "        '''\n",
+    "        Fit the collaborative model, then the content-based model.\n",
+    "        '''\n",
+    "        for model in (self.collab_model, self.content_model):\n",
+    "            model.train_model()\n",
     "\n",
-    "    def train_content_based_model(self):\n",
-    "        self.df['clean_title'] = self.df['title'].apply(lambda x: re.sub(\"[^a-zA-Z0-9 ]\", \"\", x).lower())\n",
-    "        self.df['features'] = self.df['clean_title'] + ' ' + self.df['genres']\n",
-    "        \n",
-    "        tfidf = TfidfVectorizer(stop_words='english')\n",
-    "        self.tfidf_matrix = tfidf.fit_transform(self.df['features'])\n",
-    "        self.cosine_sim = cosine_similarity(self.tfidf_matrix)\n",
+    "    def get_user_ratings(self, num_movies=5):\n",
+    "        '''\n",
+    "        Ask the user to rate randomly sampled movies, keeping only valid 1-5 ratings.\n",
     "\n",
-    "    def train_collaborative_model(self):\n",
-    "        reader = Reader(rating_scale=(1, 5))\n",
-    "        data = Dataset.load_from_df(self.df[['user_id', 'movieId', 'rating']], reader)\n",
-    "        trainset, _ = train_test_split(data, test_size=0.2)\n",
-    "        self.cf_model = SVD()\n",
-    "        self.cf_model.fit(trainset)\n",
+    "        Args:\n",
+    "            num_movies (int): Number of movies to show (one prompt per movie).\n",
     "\n",
-    "    def get_content_based_recommendations(self, title, top_n=10):\n",
-    "        idx = self.df.index[self.df['title'] == title].tolist()\n",
-    "        if not idx:\n",
-    "            return []\n",
+    "        Returns:\n",
+    "            list: (movieId, rating) tuples; 'x' and invalid answers are left out.\n",
+    "        '''\n",
+    "        # Initialize an empty list to store user ratings\n",
+    "        user_ratings = []\n",
     "\n",
-    "        idx = idx[0]\n",
-    "        sim_scores = list(enumerate(self.cosine_sim[idx]))\n",
-    "        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
-    "        sim_scores = sim_scores[1:top_n+1]\n",
-    "        movie_indices = [i[0] for i in sim_scores]\n",
+    "        for _ in range(num_movies):\n",
+    "            # Sample one movie at random from the DataFrame and show it to the user\n",
+    "            movie = self.df.sample(1).iloc[0]\n",
+    "            print(f\"\\nMovie: {movie['title']} ({movie['release_year']})\")\n",
+    "            print(f\"Genre: {movie['genres']}\")\n",
+    "            rating = input(\"Rate this movie from 1 to 5 (or 'x' if you haven't watched it): \").strip().lower()\n",
+    "            # Accept only numbers in the advertised 1-5 range; a bare float() would crash on typos\n",
+    "            try:\n",
+    "                value = float(rating)\n",
+    "            except ValueError:\n",
+    "                value = None\n",
+    "            if value is not None and 1 <= value <= 5:\n",
+    "                user_ratings.append((movie['movieId'], value))\n",
+    "            elif rating != 'x':\n",
+    "                print(f\"Ignoring invalid rating '{rating}'; expected 1-5 or 'x'.\")\n",
     "        \n",
-    "        return self.df.iloc[movie_indices][['movieId', 'title', 'genres']]\n",
+    "        # Return the list of user ratings\n",
+    "        return user_ratings\n",
     "\n",
-    "    def get_collaborative_recommendations(self, user_id, n=5):\n",
-    "        user_movies = self.df[self.df['user_id'] == user_id]['movieId'].unique()\n",
-    "        all_movies = self.df['movieId'].unique()\n",
-    "        movies_to_predict = np.setdiff1d(all_movies, user_movies)\n",
-    "        \n",
-    "        predictions = [\n",
-    "            (movie_id, self.cf_model.predict(user_id, movie_id).est)\n",
-    "            for movie_id in movies_to_predict\n",
-    "        ]\n",
-    "        \n",
-    "        predictions = sorted(predictions, key=lambda x: x[1], reverse=True)\n",
-    "        return predictions[:n]\n",
     "\n",
-    "    def get_hybrid_recommendations(self, user_id, n=5):\n",
-    "        cf_recs = self.get_collaborative_recommendations(user_id, n=n)\n",
-    "        \n",
-    "        hybrid_recs = []\n",
-    "        for movie_id, cf_score in cf_recs:\n",
-    "            movie = self.df[self.df['movieId'] == movie_id].iloc[0]\n",
-    "            cb_recs = self.get_content_based_recommendations(movie['title'], top_n=5)\n",
-    "            cb_score = np.mean([self.cosine_sim[movie_id][rec['movieId']] for _, rec in cb_recs.iterrows()])\n",
-    "            hybrid_score = 0.7 * cf_score + 0.3 * cb_score\n",
-    "            hybrid_recs.append((movie_id, hybrid_score))\n",
-    "        \n",
-    "        hybrid_recs = sorted(hybrid_recs, key=lambda x: x[1], reverse=True)\n",
-    "        return [(self.df[self.df['movieId'] == movie_id]['title'].iloc[0], score) for movie_id, score in hybrid_recs[:n]]\n",
+    "    def get_hybrid_recommendations(self, user_ratings, n=5, collab_weight=0.5):\n",
+    "        '''\n",
+    "        Re-score the collaborative model's top-n candidates with a content-based score.\n",
     "\n",
-    "    def evaluate_recommendations(self, test_set):\n",
-    "        predictions = [self.cf_model.predict(uid, iid).est for uid, iid, true_r, _ in test_set]\n",
-    "        true_ratings = [true_r for _, _, true_r, _ in test_set]\n",
-    "        \n",
-    "        rmse = np.sqrt(mean_squared_error(true_ratings, predictions))\n",
-    "        mae = mean_absolute_error(true_ratings, predictions)\n",
+    "        Args:\n",
+    "            user_ratings (list): List of tuples containing movie IDs and ratings.\n",
+    "            n (int): Number of recommendations to return.\n",
+    "            collab_weight (float): Weight for collaborative filtering (0 to 1).\n",
+    "\n",
+    "        Returns:\n",
+    "            list: (movie ID, hybrid score) tuples sorted by descending hybrid score.\n",
+    "        '''\n",
+    "        # Get collaborative filtering recommendations\n",
+    "        collab_recommendations = self.collab_model.get_recommendations(user_ratings, n)\n",
     "        \n",
-    "        return rmse, mae\n",
+    "        # Extract movie IDs from collaborative filtering recommendations\n",
+    "        collab_movie_ids = [rec[0] for rec in collab_recommendations]\n",
+    "        # Extract scores from collaborative filtering recommendations\n",
+    "        collab_scores = np.array([rec[1] for rec in collab_recommendations])\n",
+    "        \n",
+    "        # Initialize a list to store content-based scores\n",
+    "        content_scores = []\n",
+    "        # Loop through each collaborative candidate together with its collaborative score\n",
+    "        for movie_id, collab_score in zip(collab_movie_ids, collab_scores):\n",
+    "            # Get the title of the movie corresponding to the movie ID\n",
+    "            title = self.df[self.df['movieId'] == movie_id]['title'].values[0]\n",
+    "            # Get the top content-based recommendation for the movie\n",
+    "            content_rec = self.content_model.get_recommendations(title, k=1)\n",
+    "            # Mean 'rating' of self.df rows with that title; NaN if no recommendation or no such row\n",
+    "            content_score = np.nan\n",
+    "            if len(content_rec) > 0:\n",
+    "                content_score = self.df[self.df['title'] == content_rec.iloc[0]]['rating'].mean()\n",
+    "            # Fall back to the collaborative score so a NaN cannot poison the blend or the sort\n",
+    "            content_scores.append(collab_score if np.isnan(content_score) else content_score)\n",
+    "        content_scores = np.array(content_scores)\n",
+    "        \n",
+    "        # Combine collaborative and content-based scores using a weighted average\n",
+    "        hybrid_scores = collab_weight * collab_scores + (1 - collab_weight) * content_scores\n",
+    "        # Combine movie IDs with their hybrid scores and sort them in descending order of scores\n",
+    "        hybrid_recommendations = sorted(zip(collab_movie_ids, hybrid_scores), key=lambda x: x[1], reverse=True)\n",
+    "        \n",
+    "        # Return the top n hybrid recommendations\n",
+    "        return hybrid_recommendations[:n]\n",
     "\n",
-    "# Example usage\n",
-    "if __name__ == \"__main__\":\n",
-    "    # # # Load movie data\n",
-    "    # df = pd.read_csv('movies_data/movies.csv')  # Update with the correct path\n",
-    "    \n",
-    "    # recommender = HybridMovieRecommender(df)\n",
-    "    recommender.train_models()\n",
-    "    \n",
-    "    # Get hybrid recommendations for a sample user\n",
-    "    sample_user_id = 1\n",
-    "    recommendations = recommender.get_hybrid_recommendations(sample_user_id, n=5)\n",
-    "    \n",
-    "    print(f\"Hybrid Recommendations for User {sample_user_id}:\")\n",
-    "    for title, score in recommendations:\n",
-    "        print(f\"{title} (Score: {score:.2f})\")\n",
-    "    \n",
-    "    # Evaluate the model\n",
-    "    reader = Reader(rating_scale=(1, 5))\n",
-    "    data = Dataset.load_from_df(df[['user_id', 'movieId', 'rating']], reader)\n",
-    "    _, testset = train_test_split(data, test_size=0.2, random_state=42)\n",
-    "    \n",
-    "    rmse, mae = recommender.evaluate_recommendations(testset)\n",
-    "    print(f\"\\nModel Evaluation:\")\n",
-    "    print(f\"RMSE: {rmse:.4f}\")\n",
-    "    print(f\"MAE: {mae:.4f}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 73,
-   "id": "670bf220-0e8a-4dd5-b349-8cbf3aa7bcbe",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "IndexError",
-     "evalue": "index 0 is out of bounds for axis 0 with size 0",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[73], line 87\u001b[0m\n\u001b[1;32m     82\u001b[0m \u001b[38;5;66;03m# Example usage\u001b[39;00m\n\u001b[1;32m     83\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m     84\u001b[0m     \u001b[38;5;66;03m# # Load your data\u001b[39;00m\n\u001b[1;32m     85\u001b[0m     \u001b[38;5;66;03m# df = pd.read_csv('your_movie_data.csv')  # Replace with your actual data file\u001b[39;00m\n\u001b[0;32m---> 87\u001b[0m     recommender \u001b[38;5;241m=\u001b[39m \u001b[43mHybridRecommender\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     88\u001b[0m     recommender\u001b[38;5;241m.\u001b[39mrecommend_movies(user_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, n\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m)  \u001b[38;5;66;03m# Replace with an actual user ID from your data\u001b[39;00m\n",
-      "Cell \u001b[0;32mIn[73], line 17\u001b[0m, in \u001b[0;36mHybridRecommender.__init__\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcosine_sim \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m     16\u001b[0m \u001b[38;5;66;03m# Map DataFrame columns to expected names\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muser_col \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontains\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43muser\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcase\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m     18\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitem_col \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf\u001b[38;5;241m.\u001b[39mcolumns[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmovie\u001b[39m\u001b[38;5;124m'\u001b[39m, case\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)][\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrating_col 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf\u001b[38;5;241m.\u001b[39mcolumns[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m'\u001b[39m, case\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)][\u001b[38;5;241m0\u001b[39m]\n",
-      "File \u001b[0;32m~/anaconda3/envs/learn-env/lib/python3.8/site-packages/pandas/core/indexes/base.py:4101\u001b[0m, in \u001b[0;36mIndex.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   4099\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_scalar(key):\n\u001b[1;32m   4100\u001b[0m     key \u001b[38;5;241m=\u001b[39m com\u001b[38;5;241m.\u001b[39mcast_scalar_indexer(key, warn_float\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m-> 4101\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mgetitem\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mslice\u001b[39m):\n\u001b[1;32m   4104\u001b[0m     \u001b[38;5;66;03m# This case is separated from the conditional above to avoid\u001b[39;00m\n\u001b[1;32m   4105\u001b[0m     \u001b[38;5;66;03m# pessimization of basic indexing.\u001b[39;00m\n\u001b[1;32m   4106\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m promote(getitem(key))\n",
-      "\u001b[0;31mIndexError\u001b[0m: index 0 is out of bounds for axis 0 with size 0"
-     ]
-    }
-   ],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from surprise import Dataset, Reader, SVD\n",
-    "from surprise.model_selection import train_test_split\n",
-    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
-    "from sklearn.metrics.pairwise import cosine_similarity\n",
-    "import re\n",
     "\n",
-    "class HybridRecommender:\n",
-    "    def __init__(self, df):\n",
-    "        self.df = df\n",
-    "        self.collab_model = None\n",
-    "        self.tfidf_matrix = None\n",
-    "        self.cosine_sim = None\n",
-    "        \n",
-    "        # Map DataFrame columns to expected names\n",
-    "        self.user_col = self.df.columns[self.df.columns.str.contains('user', case=False)][0]\n",
-    "        self.item_col = self.df.columns[self.df.columns.str.contains('movie', case=False)][0]\n",
-    "        self.rating_col = self.df.columns[self.df.columns.str.contains('rating', case=False)][0]\n",
-    "        self.title_col = self.df.columns[self.df.columns.str.contains('title', case=False)][0]\n",
-    "        self.genre_col = self.df.columns[self.df.columns.str.contains('genre', case=False)][0]\n",
-    "\n",
-    "    def train_collaborative_model(self):\n",
-    "        reader = Reader(rating_scale=(self.df[self.rating_col].min(), self.df[self.rating_col].max()))\n",
-    "        data = Dataset.load_from_df(self.df[[self.user_col, self.item_col, self.rating_col]], reader)\n",
-    "        trainset, _ = train_test_split(data, test_size=0.2)\n",
-    "        self.collab_model = SVD()\n",
-    "        self.collab_model.fit(trainset)\n",
+    "    def print_recommendations(self, recommendations):\n",
+    "        '''\n",
+    "        Print each recommended movie's rank, title, release year, hybrid score and genres.\n",
     "\n",
-    "    def train_content_based_model(self):\n",
-    "        self.df['clean_title'] = self.df[self.title_col].apply(lambda x: re.sub(\"[^a-zA-Z0-9 ]\", \"\", str(x).lower()))\n",
-    "        self.df['features'] = self.df['clean_title'] + ' ' + self.df[self.genre_col].astype(str)\n",
-    "        \n",
-    "        tfidf = TfidfVectorizer(stop_words='english')\n",
-    "        self.tfidf_matrix = tfidf.fit_transform(self.df['features'])\n",
-    "        self.cosine_sim = cosine_similarity(self.tfidf_matrix)\n",
+    "        Args:\n",
+    "            recommendations (list): (movie ID, hybrid score) tuples, printed in the order given.\n",
+    "        '''\n",
+    "        for rank, (movie_id, score) in enumerate(recommendations, start=1):\n",
+    "            # Look up the first row of self.df carrying this movie ID\n",
+    "            is_match = self.df['movieId'] == movie_id\n",
+    "            movie = self.df[is_match].iloc[0]\n",
+    "            # First line: rank, title, release year and the score to two decimals\n",
+    "            print(f\"{rank}. {movie['title']} ({movie['release_year']}) - Hybrid score: {score:.2f}\")\n",
+    "            # Second line: the movie's genres, indented under the title\n",
+    "            print(f\"   Genre: {movie['genres']}\")\n",
     "\n",
-    "    def train_models(self):\n",
-    "        self.train_collaborative_model()\n",
-    "        self.train_content_based_model()\n",
     "\n",
-    "    def get_collaborative_recommendations(self, user_id, n=10):\n",
-    "        user_movies = self.df[self.df[self.user_col] == user_id][self.item_col].unique()\n",
-    "        all_movies = self.df[self.item_col].unique()\n",
-    "        movies_to_predict = np.setdiff1d(all_movies, user_movies)\n",
+    "    def recommend_movies(self, num_ratings=5, num_recommendations=5, collab_weight=0.5):\n",
+    "        '''\n",
+    "        Run the interactive flow: collect ratings, build hybrid recommendations, print them.\n",
+    "\n",
+    "        Args:\n",
+    "            num_ratings (int): Number of movies the user is asked to rate.\n",
+    "            num_recommendations (int): Number of recommendations to request.\n",
+    "            collab_weight (float): Weight for collaborative filtering (0 to 1).\n",
+    "        '''\n",
+    "        # Step 1: ask the user to rate num_ratings movies\n",
+    "        collected_ratings = self.get_user_ratings(num_ratings)\n",
     "        \n",
-    "        predictions = [\n",
-    "            (movie_id, self.collab_model.predict(user_id, movie_id).est)\n",
-    "            for movie_id in movies_to_predict\n",
-    "        ]\n",
+    "        # Step 2: turn those ratings into hybrid-scored recommendations\n",
+    "        top_picks = self.get_hybrid_recommendations(collected_ratings, num_recommendations, collab_weight)\n",
     "        \n",
-    "        return sorted(predictions, key=lambda x: x[1], reverse=True)[:n]\n",
+    "        # Step 3: print a heading followed by the recommended movies\n",
+    "        print(\"\\nHybrid Recommended movies:\")\n",
+    "        self.print_recommendations(top_picks)\n",
     "\n",
-    "    def get_content_based_recommendations(self, movie_id, n=10):\n",
-    "        idx = self.df[self.df[self.item_col] == movie_id].index[0]\n",
-    "        sim_scores = list(enumerate(self.cosine_sim[idx]))\n",
-    "        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
-    "        sim_scores = sim_scores[1:n+1]\n",
-    "        movie_indices = [i[0] for i in sim_scores]\n",
-    "        return self.df.iloc[movie_indices][[self.item_col, self.title_col, self.genre_col]]\n",
     "\n",
-    "    def get_hybrid_recommendations(self, user_id, n=10):\n",
-    "        collab_recs = self.get_collaborative_recommendations(user_id, n=n)\n",
-    "        \n",
-    "        hybrid_recs = []\n",
-    "        for movie_id, collab_score in collab_recs:\n",
-    "            content_recs = self.get_content_based_recommendations(movie_id, n=5)\n",
-    "            content_score = content_recs[self.item_col].apply(lambda x: self.cosine_sim[movie_id][x]).mean()\n",
-    "            hybrid_score = 0.7 * collab_score + 0.3 * content_score\n",
-    "            hybrid_recs.append((movie_id, hybrid_score))\n",
-    "        \n",
-    "        hybrid_recs = sorted(hybrid_recs, key=lambda x: x[1], reverse=True)\n",
-    "        return [(self.df[self.df[self.item_col] == movie_id][self.title_col].iloc[0], score) for movie_id, score in hybrid_recs[:n]]\n",
+    "# Build the hybrid recommender from the collaborative and content dataframes\n",
+    "hybrid_model = HybridModel(collab_df, content_df)\n",
+    "# Fit the collaborative filtering model and the content-based model\n",
+    "hybrid_model.train_models()\n",
     "\n",
-    "    def recommend_movies(self, user_id, n=10):\n",
-    "        self.train_models()\n",
-    "        recommendations = self.get_hybrid_recommendations(user_id, n)\n",
-    "        print(f\"\\nTop {n} Recommendations for User {user_id}:\")\n",
-    "        for i, (title, score) in enumerate(recommendations, 1):\n",
-    "            print(f\"{i}. {title} (Score: {score:.2f})\")\n",
+    "# Interactive step: prompts for 5 ratings via input(), then prints 5 recommendations,\n",
+    "# giving the collaborative and content-based scores equal weight (collab_weight=0.5)\n",
+    "hybrid_model.recommend_movies(num_ratings=5, num_recommendations=5, collab_weight=0.5)\n",
     "\n",
-    "# Example usage\n",
-    "if __name__ == \"__main__\":\n",
-    "    # # Load your data\n",
-    "    # df = pd.read_csv('your_movie_data.csv')  # Replace with your actual data file\n",
-    "    \n",
-    "    recommender = HybridRecommender(df)\n",
-    "    recommender.recommend_movies(user_id=1, n=5)  # Replace with an actual user ID from your data"
+    "# Persist the trained hybrid model to 'hybrid_model.pkl' in the working directory\n",
+    "with open('hybrid_model.pkl', 'wb') as model_file:\n",
+    "    pickle.dump(hybrid_model, model_file)"
    ]
   },
   {