From f338378938ddf3095c6d6db6dd4f110cdecdca80 Mon Sep 17 00:00:00 2001 From: Evaclaire Wamitu Date: Wed, 31 Jul 2024 09:32:47 +0300 Subject: [PATCH] Add comments and docstring --- .../movie_recommendor-checkpoint.ipynb | 1561 ++++++++++++++++- movie_recommendor.ipynb | 73 +- 2 files changed, 1569 insertions(+), 65 deletions(-) diff --git a/.ipynb_checkpoints/movie_recommendor-checkpoint.ipynb b/.ipynb_checkpoints/movie_recommendor-checkpoint.ipynb index 4aa8390..b433d82 100644 --- a/.ipynb_checkpoints/movie_recommendor-checkpoint.ipynb +++ b/.ipynb_checkpoints/movie_recommendor-checkpoint.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "9538a32c-c02a-4a83-8819-249bed447e92", + "id": "866a7435-710a-41fb-a929-5aeb9d0081fb", "metadata": {}, "source": [ "![attachment:logo.png](logo.png)" @@ -120,10 +120,9 @@ "id": "f28f3f7d-5712-458c-bc61-7730778d795e", "metadata": {}, "source": [ - "1. To build a collaborative filtering model using user ratings to generate top 5 movie recommendations, leveraging algorithms such as Singular Value Decomposition (SVD) and k-Nearest Neighbors (k-NN).\n", - "2. To address the cold start problem for new users by integrating content-based filtering, utilizing features such as movie genres, directors, and cast.\n", - "3. To evaluate the hybrid recommendation system using appropriate metrics like Root Mean Square Error (RMSE), Mean Average Precision (MAP), and Normalized Discounted Cumulative Gain (NDCG) to ensure accuracy and relevance of the recommendations.\n", - "\n" + "1. To build a collaborative filtering model using user ratings to generate top 5 movie recommendations leveraging algorithms such as Singular Value Decomposition (SVD) and K-Nearest Neighbors (k-NN).\n", + "2. To tackle the cold start problem for new users by developing a content-based filtering system utilizing movie genres and titles.\n", + "3. 
To build a weighted hybrid recommendation system by combining the collaborative and content-based filtering systems and evaluating the Root Mean Square Error (RMSE) to ensure accuracy and relevance of the recommendations." ] }, { @@ -131,13 +130,9 @@ "id": "5efa31bb-7862-49a4-9ce8-cc8fc893043a", "metadata": {}, "source": [ - "## Success Metrics\n", + "## Success Metric\n", "\n", - "1. Root Mean Square Error (RMSE) < 0.9 for rating predictions\n", - "2. Mean Average Precision @5 (MAP@5) > 0.3 for recommended movies\n", - "3. Precision@5 of around 0.2 to 0.5\n", - "4. Recall@5 of around 0.2 to 0.5\n", - "5. F1 Score of around 0.3 to 0.7" + "- Root Mean Square Error (RMSE) < 0.9 for rating predictions\n" ] }, { @@ -211,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "7018cb24-f44b-4c38-ab33-d45adfde8093", "metadata": {}, "outputs": [], @@ -242,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "d587021f", "metadata": {}, "outputs": [], @@ -371,7 +366,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "8e2b2e76", "metadata": {}, "outputs": [ @@ -678,7 +673,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "9d454adf", "metadata": {}, "outputs": [ @@ -839,7 +834,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "fa4b26e5", "metadata": {}, "outputs": [ @@ -955,7 +950,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "8b2e0ae5", "metadata": {}, "outputs": [], @@ -966,7 +961,7 @@ " Renames a column in the DataFrame.\n", "\n", " Args:\n", - " df (pandas.DataFrame): The DataFrame containing the column to rename.\n", + " df: The DataFrame containing the column to rename.\n", " current_name (str): The current name of the column.\n", " new_name (str): The new name for the column.\n", "\n", @@ -984,7 +979,7 @@ }, { "cell_type": "code", - "execution_count": 8, + 
"execution_count": 7, "id": "95a81b7b", "metadata": {}, "outputs": [ @@ -1148,7 +1143,7 @@ "[285783 rows x 5 columns]" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -1169,7 +1164,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "e9d0a39f", "metadata": {}, "outputs": [], @@ -1189,7 +1184,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "c9948e24", "metadata": {}, "outputs": [ @@ -1288,7 +1283,7 @@ "4 4.0 1995 " ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1299,7 +1294,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "4f6353d6-1523-4dc5-9288-ed63c622f686", "metadata": { "scrolled": true @@ -1400,7 +1395,7 @@ "4 5.0 4.0 1995 " ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1408,6 +1403,7 @@ "source": [ "# Genre Processing: Split the genres in the `movies.csv` dataset into lists for easier analysis\n", "data_explorer.merged_data['genres']=[row.strip().lower().replace('|',', ') for row in data_explorer.merged_data['genres']]\n", + "# Display first 5 rows\n", "data_explorer.merged_data.head()" ] }, @@ -1423,7 +1419,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "8d0fb7f0", "metadata": {}, "outputs": [ @@ -1464,7 +1460,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "b430f787", "metadata": {}, "outputs": [ @@ -1500,7 +1496,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "bebeff6f", "metadata": {}, "outputs": [ @@ -1531,7 +1527,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "9fc27ef7", "metadata": {}, "outputs": [ @@ -1561,7 +1557,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": 
"e4df2d45-99a6-4bc0-8cae-ae9989295dc6", "metadata": {}, "outputs": [ @@ -1571,7 +1567,7 @@ "dtype('int64')" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1579,6 +1575,7 @@ "source": [ "# Convert user_id from float to int\n", "data_explorer.merged_data['user_id'] = data_explorer.merged_data['user_id'].astype(int)\n", + "# Display converted data type\n", "data_explorer.merged_data['user_id'].dtype" ] }, @@ -1603,7 +1600,7 @@ "id": "99baee9b-276a-4882-b922-8338aec63e90", "metadata": {}, "source": [ - "In this section we will perform exploratory data analysis to identify patterns, trends and relationships within the data. This will involve visualizations as well as statistical techniques to summarize the main characteristics of the data." + "In this section we will perform exploratory data analysis to identify patterns, trends and relationships within the data. This will involve visualizations as well as statistical techniques to summarize the main characteristics of the data. The primary goals of EDA include understanding the data structure, identifying patterns and relationships, detecting anomalies, generating initial insights and informing further analysis. This process typically involves summarizing data with descriptive statistics and elaborating using data visualization techniques. EDA provides a foundational understanding of the data guiding the selection of appropriate models for further analysis." ] }, { @@ -1619,12 +1616,12 @@ "id": "b4715bd9-479f-4a1b-b005-57ae49bb8283", "metadata": {}, "source": [ - "This will involve analyzing and summarizing individual variables in our dataset to describe the basic features and patterns without considering relationships between variables. First step is to assign the variable 'df' to data_explorer.merged_data for ease of reference and then previewing the first five columns. 
" + "This involves analyzing and summarizing individual variables in our dataset to describe the basic features and patterns without considering relationships between variables. First step is to assign the variable 'df' to data_explorer.merged_data for ease of reference and then previewing the first five columns. " ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "8c2dfcb4-a0c7-4380-8c2f-4cdc12735c9b", "metadata": {}, "outputs": [ @@ -1723,12 +1720,13 @@ "4 17 4.5 1995 " ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# Sanity check\n", "df = data_explorer.merged_data\n", "df.head()" ] @@ -1752,7 +1750,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "724a5c86-19b6-4f93-80a6-ead1dd3c7916", "metadata": {}, "outputs": [ @@ -1886,7 +1884,7 @@ " '''\n", " Initializes the UnivariateAnalysis class with a DataFrame.\n", " \n", - " Parameters:\n", + " Args:\n", " df (DataFrame): A pandas DataFrame containing the movie data to analyze.\n", " '''\n", " self.df = df\n", @@ -2027,12 +2025,12 @@ "source": [ "## Bivariate Analysis\n", "\n", - "Bivariate analysis refers to the statistical examination of two variables to understand the relationship between them. We create a `BivariateAnalysis` class designed to perform various bivariate analyses on our dataset. In this instance we shall explore relationships between release years vs ratings, genres vs ratings, movie titles vs rating and movie titles vs total number of people who have rated them. ." + "Bivariate analysis refers to the statistical examination of two variables to understand the relationship between them. We create a `BivariateAnalysis` class designed to perform various bivariate analyses on our dataset. 
In this instance we shall explore relationships between release years vs ratings, genres vs ratings, movie titles vs rating and movie titles vs total number of users who have rated them." ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "id": "449c7ef5-e4da-4285-a38c-57f914dfa46e", "metadata": {}, "outputs": [ @@ -2233,7 +2231,7 @@ " and creates a bar plot to visualize the top n titles based on the total \n", " number of ratings received.\n", " \n", - " Parameters:\n", + " Args:\n", " top_n (int): The number of top-rated titles to display. Default is 20.\n", " '''\n", " \n", @@ -2308,37 +2306,1492 @@ "\n", "The `Top 20 Rated Titles by Number of Ratings` plot shows that 'Forrest Gump' leads the list with the highest number of ratings followed closely by 'The Shawshank Redemption' and 'Pulp Fiction'. The ratings range from around 200 ('Lord of the Rings: The Fellowship of the ring') to nearly 300 for the top-rated films indicating these are all highly popular and widely reviewed. \n", "\n", - "Thee `Correlation Heatmap` visualizes the relationships between the numeric variables in our dataset. It shows a strong positive correlation (0.98) between release year and decade, moderate positive correlations between movieId and both release year (0.51) and decade (0.5) and weak or negligible correlations among the other variables. The diagonal values of 1 represent perfect self-correlation." + "The `Correlation Heatmap` visualizes the relationships between the numeric variables in our dataset. It shows a strong positive correlation (0.98) between release year and decade, moderate positive correlations between movieId and both release year (0.51) and decade (0.5) and weak or negligible correlations among the other variables. The diagonal values of 1 represent perfect self-correlation." 
+ ] + }, + { + "cell_type": "markdown", + "id": "1cb08e0f-4bf9-45d8-be50-e16c6d8f562e", + "metadata": {}, + "source": [ + "# MODELING" + ] + }, + { + "cell_type": "markdown", + "id": "36de16dd-c541-4ac5-bda5-e500095eed4e", + "metadata": {}, + "source": [ + "## Collaborative Filtering" + ] + }, + { + "cell_type": "markdown", + "id": "e08b0b56-be71-470a-9a8c-1b31bcced076", + "metadata": {}, + "source": [ + "### Dummy model\n", + "A dummy or vanilla model is a simple model that is typically used as a reference or baseline against which more complex models are compared. Its purpose is to provide a reference point to evaluate the effectiveness of the more sophisticated algorithms. For our model, we evaluate the dummy model using the Surprise library. The data is prepared using the Reader and Dataset classes from `Surprise` to format our DataFrame containing movie ratings with a rating scale between 1 and 5. The dataset is then divided into training and test sets reserving 25% of the data for testing purposes. A `NormalPredictor` dummy model is then created which generates random predictions based on the observed distribution of ratings and this model is trained on the training set. The performance of the model is evaluated on the test set using the Root Mean Squared Error (RMSE) metric which provides a quantitative measure of its accuracy. Finally, the RMSE of the dummy model is printed showing the baseline performance for comparison with Singular Value Decomposition `SVD` and K-Nearest Neighbors `KNN` algorithms." 
] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "id": "4aa1079f-b0ea-4f89-aa54-efb6254c7753", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "1902" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE: 1.4304\n", + "Baseline Model RMSE: 1.4304275189371316\n" + ] } ], "source": [ - "df.release_year.min()" + "from surprise import NormalPredictor\n", + "from surprise import Reader, Dataset, SVD, KNNBasic\n", + "from surprise.model_selection import cross_validate, GridSearchCV\n", + "from surprise.model_selection import train_test_split\n", + "from surprise import accuracy\n", + "\n", + "# Prepare the data\n", + "reader = Reader(rating_scale=(1, 5))\n", + "data = Dataset.load_from_df(df[['user_id', 'movieId', 'rating']], reader)\n", + "\n", + "# Split the data\n", + "trainset, testset = train_test_split(data, test_size=0.25, random_state=42)\n", + "\n", + "# Create and evaluate the dummy model\n", + "dummy_model = NormalPredictor()\n", + "dummy_model.fit(trainset)\n", + "predictions = dummy_model.test(testset)\n", + "baseline_rmse = accuracy.rmse(predictions)\n", + "\n", + "# Print the RMSE\n", + "print(f\"Baseline Model RMSE: {baseline_rmse}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cea8df25-678d-4664-84bb-38d915eeffcf", + "metadata": {}, + "source": [ + "We find that the RMSE of the baseline model is approximately 1.43. The next step is to perform grid search cross validation to find the best parameters for the Singular Value Decomposition (SVD) and K-Nearest Neighbors (KNN) models. 
The `Surprise` library hosts a `GridSearchCV` feature that performs this task.\n", + "Grid searching the SVD model focuses on tuning the following hyperparameters:\n", + "\n", + "**`n_factors`**: Number of latent factors\n", + "**`n_epochs`**: Number of iterations for training \n", + "**`lr_all`**: Learning rate for all parameters \n", + "**`reg_all`**: Regularization term for all parameters\n", + "\n", + "Grid searching the KNN model focuses on tuning the following hyperparameters: \n", + "**`k`**: The number of neighbors to consider when making predictions.\n", + "**`min_k`**: The minimum number of neighbors required to make a prediction. If fewer neighbors are found, predictions are made based on default values.\n", + "**`sim_options`**: A dictionary specifying the similarity options. This includes:\n", + "`cosine` (Cosine Similarity)\n", + "`pearson` (Pearson Correlation)\n", + "**`user_based`**: Whether to use user-based or item-based filtering.\n", + "`True for user-based`\n", + "`False for item-based`\n", + "\n", + "These hyperparameters can be adjusted depending on the dataset and computational resources. After performing the grid search to find the best hyperparameters, the next step is to cross-validate the model with the best parameters. This ensures that the model's performance generalizes well to unseen data." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "aae5b17e-07a1-47e1-9c93-afe9fbff1fea", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tuning SVD...\n", + "Best SVD parameters: {'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}\n", + "Best SVD RMSE: 0.8621525273812124\n", + "Tuning KNN...\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson 
similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson 
similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson 
similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine 
similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the pearson similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine 
similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Best KNN parameters: {'k': 30, 'min_k': 5, 'sim_options': {'name': 'cosine', 'user_based': True}}\n", + "Best KNN RMSE: 0.974567962502892\n", + "\n", + "Best model: SVD\n", + "\n", + "Best RMSE: 0.8621525273812124\n" + ] + } + ], + "source": [ + "def grid_search_models(data):\n", + " '''\n", + " Performs a grid search to tune hyperparameters for SVD and KNN models, evaluates their performance using RMSE, and returns the best model based on RMSE.\n", + "\n", + " Parameters:\n", + " data (Dataset): The Surprise Dataset object containing the rating data.\n", + "\n", + " Returns:\n", + " tuple: A tuple containing the name of the best model ('SVD' or 'KNN') and its corresponding RMSE score.\n", + " '''\n", + " # Define parameter grids\n", + " svd_param_grid = {\n", + " 'n_factors': [20, 50, 100],\n", + " 'n_epochs': [10, 15, 30], \n", + " 'lr_all': [0.002, 0.005, 0.01],\n", + " 'reg_all': [0.02, 0.05, 0.1]\n", + " }\n", + " \n", + " knn_param_grid = {\n", + " 'k': [10, 20, 30],\n", + " 'min_k': [1, 5, 10],\n", + " 'sim_options': {'name': ['pearson', 'cosine'], 'user_based': [True, False]}\n", + " }\n", + " \n", + " # Perform grid search for SVD\n", + " print('Tuning SVD...')\n", + " gs_svd = GridSearchCV(SVD, svd_param_grid, measures=['rmse'], cv=3)\n", + " gs_svd.fit(data)\n", + " \n", + " svd_best_params = gs_svd.best_params['rmse']\n", + " svd_best_score = gs_svd.best_score['rmse']\n", + " \n", + " print(f\"Best SVD parameters: {svd_best_params}\")\n", + " print(f\"Best SVD RMSE: {svd_best_score}\")\n", + " \n", + " # Perform grid search for KNN\n", + " print('Tuning KNN...')\n", + " gs_knn = GridSearchCV(KNNBasic, knn_param_grid, measures=['rmse'], cv=3)\n", + " gs_knn.fit(data)\n", + " \n", + " knn_best_params = 
gs_knn.best_params['rmse']\n", + " knn_best_score = gs_knn.best_score['rmse']\n", + " \n", + " print(f\"Best KNN parameters: {knn_best_params}\")\n", + " print(f\"Best KNN RMSE: {knn_best_score}\")\n", + " \n", + " # Determine the best overall model\n", + " if svd_best_score < knn_best_score:\n", + " best_model_class = SVD(**svd_best_params)\n", + " best_model_params = svd_best_params\n", + " best_model_name = 'SVD'\n", + " best_score = svd_best_score\n", + " else:\n", + " best_model_class = KNNBasic(**knn_best_params)\n", + " best_model_params = knn_best_params\n", + " best_model_name = 'KNN'\n", + " best_score = knn_best_score\n", + " \n", + " # Print the best model and best RMSE score\n", + " print(f\"\\nBest model: {best_model_name}\")\n", + " print(f\"\\nBest RMSE: {best_score}\")\n", + " \n", + " return best_model_name, best_score\n", + "\n", + "# Instantiate\n", + "best_model_name, best_score = grid_search_models(data)" + ] + }, + { + "cell_type": "markdown", + "id": "aaab006c-6092-4bf0-bff0-6e49289c48c3", + "metadata": {}, + "source": [ + "### Summary" + ] + }, + { + "cell_type": "markdown", + "id": "0c954d06-e38c-447b-901f-d4bdd36aad59", + "metadata": {}, + "source": [ + "The output reveals the results of tuning and evaluating recommendation models using grid search. For the SVD model, the optimal parameters were identified as having 100 factors, 30 epochs, a learning rate of 0.01 and regularization of 0.1 achieving the best RMSE of approximately 0.862. In contrast, the KNN model required extensive computation of similarity matrices for various configurations covering Pearson and cosine similarities in both user-based and item-based modes. The best parameters for the KNN model were found to be 30 neighbors, a minimum of 5 neighbors and using the cosine similarity metric with a user-based approach resulting in a higher RMSE of about 0.975. 
Consequently, the SVD model emerged as the superior choice with the lowest RMSE and we therefore selected it as the best model overall." + ] + }, + { + "cell_type": "markdown", + "id": "3504e6c1-96da-4765-b42d-6de6f0594e14", + "metadata": {}, + "source": [ + "### Cross-validation of the best model" + ] + }, + { + "cell_type": "markdown", + "id": "c90a5694-30a2-47fb-8617-bd405616c182", + "metadata": {}, + "source": [ + "This step evaluates the performance of the best model identified from the previous grid search through cross-validation. The best model was identified as the `SVD` model. The cell below initializes the model with the corresponding optimal parameters, performs cross-validation using 5 folds and computes the mean RMSE from the results. It then prints the mean RMSE together with its standard deviation across the folds. The mean RMSE is approximately 0.857." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "04358685-5ec8-4d2a-99e5-fca4203c4911", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating RMSE of algorithm SVD on 5 split(s).\n", + "\n", + " Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std \n", + "RMSE (testset) 0.8654 0.8591 0.8506 0.8529 0.8548 0.8566 0.0052 \n", + "Fit time 8.94 8.49 8.08 8.03 8.22 8.35 0.33 \n", + "Test time 0.11 0.12 0.11 0.12 0.12 0.11 0.00 \n", + "SVD Model Mean RMSE: 0.8565943146879842\n", + "SVD Model Standard Deviation RMSE: 0.0052232662741193365\n" + ] + } + ], + "source": [ + "# Initialize SVD model with specified hyperparameters\n", + "svd_model = SVD(n_factors=100, n_epochs=30, lr_all=0.01, reg_all=0.1)\n", + "\n", + "# Perform cross-validation on the SVD model using 5 folds\n", + "# Measures RMSE (Root Mean Square Error) for evaluation\n", + "cross_val_results = cross_validate(svd_model, data, measures=['RMSE'], cv=5, verbose=True)\n", + "\n", + "# Print the mean RMSE from the cross-validation results\n", + 
"print(f\"SVD Model Mean RMSE: {np.mean(cross_val_results['test_rmse'])}\")\n", + "\n", + "# Print the standard deviation of RMSE from the results\n", + "print(f\"SVD Model Standard Deviation RMSE: {np.std(cross_val_results['test_rmse'])}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "cccdc46d-7e10-4265-9deb-7b8a532dd9b5", + "metadata": {}, + "source": [ + "The cross-validation results for the SVD model indicate strong and consistent performance. The model achieved an average RMSE of approximately 0.857 across five folds with a very low standard deviation of 0.005 demonstrating stable performance across different data splits. Overall these metrics suggest that the SVD model not only provides reliable predictions with low error but also maintains efficient and consistent training and prediction times. The next step is to build a class that will provide the top 5 recommendations." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "133e166f-e332-4457-8d10-6c5003684986", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Before Sunrise (1995)\n", + "Genre: drama, romance\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: To Wong Foo, Thanks for Everything! 
Julie Newmar (1995)\n", + "Genre: comedy\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Ferris Bueller's Day Off (1986)\n", + "Genre: comedy\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Miracle on 34th Street (1994)\n", + "Genre: drama\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): 4\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Toy Story (1995)\n", + "Genre: adventure, animation, children, comedy, fantasy\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Recommended movies:\n", + "1. Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) - Predicted rating: 4.31\n", + "Genre: comedy, war\n", + "2. Philadelphia Story, The (1940) - Predicted rating: 4.24\n", + "Genre: comedy, drama, romance\n", + "3. Pulp Fiction (1994) - Predicted rating: 4.22\n", + "Genre: comedy, crime, drama, thriller\n", + "4. Princess Bride, The (1987) - Predicted rating: 4.22\n", + "Genre: action, adventure, comedy, fantasy, romance\n", + "5. 
class CollabBasedModel:
    '''
    Collaborative-filtering movie recommender backed by a Surprise SVD model.

    NOTE(review): recommendations for a brand-new user are not personalised.
    The ratings collected from the user are only used to exclude movies they
    have already rated; the model itself is never trained on them, and per the
    Surprise SVD documentation an unknown user is predicted with zero user bias
    and zero user factors. Fold the new ratings into the training data and
    refit if personalised cold-start recommendations are required.
    '''

    def __init__(self, collab_df):
        '''
        Initializes the CollabBasedModel with a DataFrame of movie ratings.

        Args:
            collab_df (pd.DataFrame): Ratings data. Training reads the columns 'user_id',
                'movieId' and 'rating'; the interactive and printing helpers additionally
                read 'title', 'release_year' and 'genres'.
        '''
        self.df = collab_df
        self.model = None

    def train_model(self, **svd_params):
        '''
        Trains the SVD model on the movie ratings data.

        The data is split 80/20 and the model is fitted on the 80% training part.

        Args:
            **svd_params: Optional keyword arguments forwarded to ``SVD`` (for example the
                tuned ``n_factors``, ``n_epochs``, ``lr_all`` and ``reg_all``). With no
                arguments the library defaults are used, exactly as before.
        '''
        # Create a Reader object to parse the ratings data with a specified rating scale
        reader = Reader(rating_scale=(1, 5))

        # Load the data from the DataFrame into a Surprise Dataset object
        data = Dataset.load_from_df(self.df[['user_id', 'movieId', 'rating']], reader)

        # NOTE(review): the held-out 20% is discarded here; it is never used for evaluation
        trainset, _ = train_test_split(data, test_size=0.2)

        # Initialize and train the SVD (Singular Value Decomposition) model
        self.model = SVD(**svd_params)
        self.model.fit(trainset)

    @staticmethod
    def _ask_rating(prompt, skip_token):
        '''
        Prompts until the user enters a rating between 1 and 5 or the skip token.

        Returns:
            float or None: The rating, or None when the user skipped the movie.
        '''
        while True:
            answer = input(prompt).strip().lower()
            if answer == skip_token:
                return None
            try:
                value = float(answer)
            except ValueError:
                value = None
            # The comparison is False for NaN, so 'nan' is rejected as well
            if value is not None and 1 <= value <= 5:
                return value
            print(f"Please enter a number from 1 to 5, or '{skip_token}' to skip.")

    def get_user_ratings(self, num_movies=5):
        '''
        Collects ratings from the user for a specified number of movies.

        Invalid answers are re-prompted instead of raising, and a movie is never
        shown twice within one call.

        Args:
            num_movies (int): Number of movies to rate.

        Returns:
            list: List of tuples containing movieId and user rating.
        '''
        user_ratings = []
        shown_ids = []

        for _ in range(num_movies):
            # Sample from the rows of movies that have not been shown yet
            pool = self.df[~self.df['movieId'].isin(shown_ids)]
            if pool.empty:
                break
            movie = pool.sample(1).iloc[0]
            shown_ids.append(movie['movieId'])

            # Display the movie details to the user
            print(f"\nMovie: {movie['title']} ({movie['release_year']})")
            print(f"Genre: {movie['genres']}")

            # Prompt the user to rate the movie or indicate they haven't seen it
            rating = self._ask_rating(
                "Rate this movie from 1 to 5 (or 'n' if you haven't seen it): ", 'n'
            )
            if rating is not None:
                user_ratings.append((movie['movieId'], rating))

        return user_ratings

    def get_recommendations(self, user_ratings, n=5, genre=None):
        '''
        Provides movie recommendations with optional genre filtering.

        Args:
            user_ratings (list): List of tuples containing movieId and user rating.
                Only the movie IDs are used, to exclude already-rated movies.
            n (int): Number of recommendations to provide.
            genre (str, optional): Genre to filter recommendations by (case-insensitive substring).

        Returns:
            list: List of (movieId, predicted rating) tuples, best first.

        Raises:
            RuntimeError: If the model has not been trained yet.
        '''
        if self.model is None:
            raise RuntimeError("Model is not trained; call train_model() first.")

        # Generate a user ID that does not exist in the data yet
        new_user_id = self.df['user_id'].max() + 1

        # Identify movies that the new user has not yet rated
        rated_ids = [movie_id for movie_id, _ in user_ratings]
        movies_to_predict = self.df[~self.df['movieId'].isin(rated_ids)]['movieId'].unique()

        # Predict a rating for each candidate and sort best first
        predictions = [
            (movie_id, self.model.predict(new_user_id, movie_id).est)
            for movie_id in movies_to_predict
        ]
        recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)

        if not genre:
            return recommendations[:n]

        # Build the movieId -> genres lookup once instead of scanning the
        # DataFrame for every candidate movie (first row per movie wins).
        genres_by_id = (
            self.df.drop_duplicates(subset='movieId')
            .set_index('movieId')['genres']
            .to_dict()
        )
        wanted = genre.lower()
        genre_recommendations = []
        for movie_id, rating in recommendations:
            movie_genres = genres_by_id.get(movie_id)
            if isinstance(movie_genres, str) and wanted in movie_genres.lower():
                genre_recommendations.append((movie_id, rating))
                if len(genre_recommendations) == n:
                    break
        return genre_recommendations[:n]

    def print_recommendations(self, recommendations):
        '''
        Prints the recommended movies with their predicted ratings.

        Args:
            recommendations (list): List of (movieId, predicted rating) tuples.
        '''
        for i, (movie_id, predicted_rating) in enumerate(recommendations, 1):
            # Retrieve the movie details from the DataFrame using the movie_id
            movie = self.df[self.df['movieId'] == movie_id].iloc[0]

            print(f"{i}. {movie['title']} ({movie['release_year']}) - Predicted rating: {predicted_rating:.2f}")
            print(f"Genre: {movie['genres']}")

    def recommend_movies(self, num_ratings=5, num_recommendations=5, genre=None):
        '''
        Collects ratings interactively, then prints recommendations.

        Args:
            num_ratings (int): Number of movies to rate.
            num_recommendations (int): Number of recommendations to provide.
            genre (str, optional): Genre to filter recommendations by.
        '''
        user_ratings = self.get_user_ratings(num_ratings)
        recommendations = self.get_recommendations(user_ratings, num_recommendations, genre)

        print("\nRecommended movies:")
        self.print_recommendations(recommendations)
The system can filter recommendations by genre if specified. The recommend_movies method chains all steps together: prompting the user for ratings, generating recommendations and printing the output. In this case, the `recommend_movies` method uses the collected ratings to predict how the user would rate movies they haven’t rated yet. The method then sorts these predictions to find the highest-rated movies. The genre filter applied in this case is 'Comedy' and the recommendations are further filtered to include only movies that match the specified genre as illustrated by the output above." + ] + }, + { + "cell_type": "markdown", + "id": "db412a64-bbf2-4a51-8d39-4cca89fd7c29", + "metadata": {}, + "source": [ + "## Content Based" + ] + }, + { + "cell_type": "markdown", + "id": "5682dd7a-ffa5-4238-828d-1eda7c95e082", + "metadata": {}, + "source": [ + "A content based recommender system is a type of recommendation algorithm that suggests items to users based on the characteristics or features of the items they have previously liked or interacted with. It analyzes the content or attributes of items such as movies to find similarities and make recommendations. This content-based filtering approach recommends movies based on the similarity of their genres. It utilizes TF-IDF to represent genres as vectors, calculates cosine similarity to assess the similarity between movies and retrieves movies that are most similar to a given movie. \n", + "The TfidfVectorizer from sklearn.feature_extraction.text is used to convert the genre descriptions into numerical vectors. Each genre is transformed into a TF-IDF (Term Frequency-Inverse Document Frequency) matrix which captures the importance of genres across the dataset. The TF-IDF matrix represents how important each genre is in relation to the other genres in the dataset.\n", + "The cosine similarity matrix is then computed using cosine_similarity from sklearn.metrics.pairwise. 
class ContentBasedModel:
    '''
    A content-based recommender system for movies based on genre similarity.
    '''

    def __init__(self, df):
        '''
        Initialize the ContentBasedModel with a dataframe of movie information.

        Args:
            df (pd.DataFrame): Dataframe containing at least the columns 'title' and
                'genres'. Rows with a repeated title are dropped (first one kept) so
                the similarity matrix has one row per movie title.
        '''
        # Use the dataframe that was passed in rather than a notebook global
        self.df = df.drop_duplicates(subset='title')
        self.tfidf_matrix = None
        self.cosine_sim = None
        self.indices = None

    def train_model(self):
        '''
        Train the content-based model by creating a TF-IDF matrix and calculating cosine similarity.
        '''
        # Imported locally so the class can be defined without scikit-learn loaded
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity

        # Define the TF-IDF vectorizer
        tfidf = TfidfVectorizer(stop_words='english')

        # Fit and transform the genres
        self.tfidf_matrix = tfidf.fit_transform(self.df['genres'])

        # Calculate the cosine similarity matrix
        self.cosine_sim = cosine_similarity(self.tfidf_matrix, self.tfidf_matrix)

        # Map each title to its ROW POSITION in the similarity matrix. The
        # dataframe keeps its original index labels, which are not positions.
        positions = pd.Series(range(len(self.df)), index=self.df['title'])
        self.indices = positions[~positions.index.duplicated(keep='first')]

    def get_recommendations(self, title, k=5):
        '''
        Get movie recommendations based on genre similarity to the input movie.

        Args:
            title (str): Title of the movie to base recommendations on.
            k (int): Number of recommendations to return.

        Returns:
            pd.Series: Series of recommended movie titles, most similar first.

        Raises:
            RuntimeError: If the model has not been trained yet.
            KeyError: If the title is not known to the model.
        '''
        if self.cosine_sim is None or self.indices is None:
            raise RuntimeError("Model is not trained; call train_model() first.")
        if title not in self.indices.index:
            raise KeyError(f"Unknown movie title: {title!r}")

        # Row position of the queried movie in the similarity matrix
        idx = int(self.indices[title])
        scores = self.cosine_sim[idx]

        # Exclude the queried movie explicitly: with tied similarities (identical
        # genres) it is not guaranteed to sort first, so slicing off the first
        # element could drop a different movie and recommend the movie itself.
        ranked = sorted(
            (pos for pos in range(len(scores)) if pos != idx),
            key=lambda pos: scores[pos],
            reverse=True,
        )

        # Return the top k most similar movies
        return self.df['title'].iloc[ranked[:k]]
open('contentbased_model.pkl', 'wb') as f:\n", + " pickle.dump(content_model, f)\n" + ] + }, + { + "cell_type": "markdown", + "id": "0e1fd17e-6eb7-49f5-9bb5-b6cb231f45b5", + "metadata": {}, + "source": [ + "### Summary" + ] + }, + { + "cell_type": "markdown", + "id": "ba4d1bad-c35d-4932-bdf9-4ded5bd19984", + "metadata": {}, + "source": [ + "The class above implements a content based movie recommender system using genres as the primary feature. It begins by importing necessary libraries and preparing the dataset ensuring only unique movie titles are retained. The `ContentBasedModel` class encapsulates the core functionality. Within its `train_model` method, movie genres are converted into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency) vectorization creating a matrix of genres. The cosine similarity between all pairs of movies is then calculated based on these TF-IDF representations producing a similarity matrix that identifies movies with similar genres. The `get_recommendations` method uses this similarity matrix to provide the top 5 most similar movies to a given title. The code demonstrates the class's usage by creating an instance, training the model and getting recommendations for 'Toy Story'. Finally, it saves the trained model including the TF-IDF matrix, cosine similarity matrix and movie indices to a pickle file for future use.\n", + "The recommender system outputs the top 5 movies similar to 'Toy Story' based on genre similarity i.e. 'Antz','Toy Story 2', 'The Adventures of Rocky and Bullwinkle', 'The Emperor's New Groove' and 'Monsters, Inc.'. Each recommendation shares key characteristics with 'Toy Story' being that they are family-friendly animations with a focus on adventure and comedy. 
These recommendations are evidently relevant and appealing to fans of Toy Story.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "fcd47967-e4a5-4915-a4ad-5ba0ebf8cae4", + "metadata": {}, + "source": [ + "## Hybrid System" + ] + }, + { + "cell_type": "markdown", + "id": "c07a19f0-fcde-450c-8fd0-eac2c0eac9f7", + "metadata": {}, + "source": [ + "A hybrid recommendation system integrates multiple techniques such as collaborative filtering and content-based filtering to enhance recommendation accuracy. Collaborative filtering relies on user interactions to suggest items based on similar users' preferences while content-based filtering recommends items based on features and past user preferences. The hybrid system combines these approaches to address their individual weaknesses—like the cold start problem in collaborative filtering and limited feature scope in content-based filtering. It can use methods such as weighted hybrids where recommendations from both techniques are averaged with specific weights, switching hybrids which chooses methods based on conditions or feature augmentation where one technique is enhanced with features from another. 
This integration aims to provide more accurate and personalized recommendations by leveraging the strengths of each method and mitigating their limitations.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "29fa9bcb-c197-453e-b5f5-0bf14cc8943b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Dick (1999)\n", + "Genre: comedy\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Girl Who Kicked the Hornet's Nest, The (Luftslottet som sprängdes) (2009)\n", + "Genre: action, crime, mystery\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Return to Treasure Island (1988)\n", + "Genre: adventure, animation, comedy\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Center Stage (2000)\n", + "Genre: drama, musical\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): x\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Movie: Sleepaway Camp (1983)\n", + "Genre: horror\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Hybrid Recommended movies:\n", + "1. Shawshank Redemption, The (1994) - Hybrid score: 4.20\n", + " Genre: crime, drama\n", + "2. 
class HybridModel:
    '''
    A hybrid recommender system that combines collaborative filtering and content-based filtering.

    The collaborative model proposes candidates with predicted ratings. Each candidate's
    content score is the mean observed rating of its most genre-similar movie, and the
    two scores are blended with a configurable weight.
    '''

    def __init__(self, collab_df, content_df):
        '''
        Initialize the HybridModel with collaborative and content-based dataframes.

        Args:
            collab_df (pd.DataFrame): Dataframe for collaborative filtering
                (columns 'user_id', 'movieId', 'rating').
            content_df (pd.DataFrame): Dataframe for content-based filtering
                (columns 'movieId', 'title', 'genres', 'release_year').
        '''
        self.collab_model = CollabBasedModel(collab_df)
        self.content_model = ContentBasedModel(content_df)

        merged = pd.merge(collab_df, content_df, on='movieId')

        # Mean rating per title must be computed BEFORE de-duplicating: after
        # drop_duplicates each movie keeps a single, arbitrary user's rating.
        self.mean_rating_by_title = merged.groupby('title')['rating'].mean()

        # One row per movie, used for title/genre lookups and for sampling
        self.df = merged.drop_duplicates(subset=['movieId'])

    def train_models(self):
        '''
        Train both collaborative and content-based models.
        '''
        self.collab_model.train_model()
        self.content_model.train_model()

    @staticmethod
    def _ask_rating(prompt, skip_token):
        '''
        Prompts until the user enters a rating between 1 and 5 or the skip token.

        Returns:
            float or None: The rating, or None when the user skipped the movie.
        '''
        while True:
            answer = input(prompt).strip().lower()
            if answer == skip_token:
                return None
            try:
                value = float(answer)
            except ValueError:
                value = None
            # The comparison is False for NaN, so 'nan' is rejected as well
            if value is not None and 1 <= value <= 5:
                return value
            print(f"Please enter a number from 1 to 5, or '{skip_token}' to skip.")

    def get_user_ratings(self, num_movies=5):
        '''
        Get user ratings for a specified number of random movies.

        Invalid answers are re-prompted instead of raising, and a movie is never
        shown twice within one call.

        Args:
            num_movies (int): Number of movies to rate. Defaults to 5.

        Returns:
            list: List of tuples containing movie IDs and ratings.
        '''
        user_ratings = []
        shown_ids = []

        for _ in range(num_movies):
            # Sample one movie that has not been shown yet
            pool = self.df[~self.df['movieId'].isin(shown_ids)]
            if pool.empty:
                break
            movie = pool.sample(1).iloc[0]
            shown_ids.append(movie['movieId'])

            # Display the movie details to the user
            print(f"\nMovie: {movie['title']} ({movie['release_year']})")
            print(f"Genre: {movie['genres']}")

            # Prompt the user to rate the movie or indicate they haven't watched it
            rating = self._ask_rating(
                "Rate this movie from 1 to 5 (or 'x' if you haven't watched it): ", 'x'
            )
            if rating is not None:
                user_ratings.append((movie['movieId'], rating))

        return user_ratings

    def _content_score(self, movie_id):
        '''
        Returns the mean rating of the movie most similar to movie_id, or None if unavailable.
        '''
        titles = self.df.loc[self.df['movieId'] == movie_id, 'title']
        if titles.empty:
            return None
        try:
            similar = self.content_model.get_recommendations(titles.iloc[0], k=1)
        except (KeyError, IndexError):
            # Title unknown to the content model
            return None
        if len(similar) == 0:
            return None
        score = self.mean_rating_by_title.get(similar.iloc[0])
        if score is None or pd.isna(score):
            return None
        return float(score)

    def get_hybrid_recommendations(self, user_ratings, n=5, collab_weight=0.5):
        '''
        Get hybrid recommendations based on user ratings.

        Args:
            user_ratings (list): List of tuples containing movie IDs and ratings.
            n (int): Number of recommendations to return.
            collab_weight (float): Weight for collaborative filtering (0 to 1).

        Returns:
            list: List of tuples containing recommended movie IDs and hybrid scores, best first.

        Raises:
            ValueError: If collab_weight is outside the range 0 to 1.
        '''
        if not 0 <= collab_weight <= 1:
            raise ValueError("collab_weight must be between 0 and 1")

        # Candidates and their predicted ratings come from the collaborative model
        collab_recommendations = self.collab_model.get_recommendations(user_ratings, n)

        hybrid_recommendations = []
        for movie_id, collab_score in collab_recommendations:
            content_score = self._content_score(movie_id)
            if content_score is None:
                # No content evidence: fall back to the collaborative score
                content_score = collab_score

            # Combine collaborative and content-based scores using a weighted average
            hybrid_score = collab_weight * collab_score + (1 - collab_weight) * content_score
            hybrid_recommendations.append((movie_id, float(hybrid_score)))

        # Sort in descending order of hybrid score and return the top n
        hybrid_recommendations.sort(key=lambda x: x[1], reverse=True)
        return hybrid_recommendations[:n]

    def print_recommendations(self, recommendations):
        '''
        Print the recommended movies with their hybrid scores.

        Args:
            recommendations (list): List of tuples containing movie IDs and hybrid scores.
        '''
        for i, (movie_id, score) in enumerate(recommendations, 1):
            # Retrieve the movie details based on the movie ID
            movie = self.df[self.df['movieId'] == movie_id].iloc[0]

            print(f"{i}. {movie['title']} ({movie['release_year']}) - Hybrid score: {score:.2f}")
            print(f"   Genre: {movie['genres']}")

    def recommend_movies(self, num_ratings=5, num_recommendations=5, collab_weight=0.5):
        '''
        Get user ratings and provide hybrid movie recommendations.

        Args:
            num_ratings (int): Number of movies to rate.
            num_recommendations (int): Number of recommendations to provide.
            collab_weight (float): Weight for collaborative filtering (0 to 1).

        Returns:
            list: The (movie ID, hybrid score) tuples that were printed.
        '''
        user_ratings = self.get_user_ratings(num_ratings)
        recommendations = self.get_hybrid_recommendations(user_ratings, num_recommendations, collab_weight)

        print("\nHybrid Recommended movies:")
        self.print_recommendations(recommendations)
        return recommendations
"with open('hybrid_model.pkl', 'wb') as f:\n", + " pickle.dump(hybrid_model, f)" + ] + }, + { + "cell_type": "markdown", + "id": "79b94085-598d-41f5-9ed1-3eb9e3492352", + "metadata": {}, + "source": [ + "### Summary\n", + "The `HybridModel` class combines two types of recommendation systems namely content-based and collaborative filtering into a hybrid model to enhance the top-5 movie recommendations. This class takes in two datasets (one for each recommendation model) and provides a unified recommendation list based on both models. It initializes with separate dataframes for each filtering type and merges them for a comprehensive recommendation system. The class trains both collaborative and content-based models, allows users to rate movies on a scale of 1 to 5 (or enter 'x' if they have not seen the movie) and then generates hybrid recommendations by blending the two models' scores based on a specified weight, in this case 0.5, meaning each model contributes half of the final score. The model is designed for user interaction, training and evaluation with capabilities for saving and restoring its state." + ] + }, + { + "cell_type": "markdown", + "id": "edacbc2c-dcaf-40ec-b840-1805755548c4", + "metadata": {}, + "source": [ + "#### Evaluating RMSE of the hybrid model\n", + "Evaluating the RMSE (Root Mean Squared Error) of the hybrid model involves measuring the accuracy of its predictions by comparing the predicted ratings to actual ratings. RMSE quantifies the average magnitude of the prediction errors, where a lower RMSE indicates better predictive accuracy. To calculate RMSE, you first predict ratings for a set of items, then compute the difference between these predicted ratings and the actual ratings, square these differences, average them and finally take the square root of the result. For a hybrid model, RMSE helps assess how well the model integrates these methods in predicting user preferences with the goal of achieving a lower RMSE to reflect more precise and reliable recommendations." 
def evaluate_rmse(hybrid_model, test_size=0.2, collab_weight=None, random_state=42):
    '''
    Evaluate the RMSE of the hybrid model on a held-out split of the ratings.

    For every held-out (user, movie, rating) triple the predicted rating is
    ``w * collaborative estimate + (1 - w) * content score``. The content score
    is the training-fold mean rating of the movie most similar by genre; when it
    is unavailable the collaborative estimate is used on its own.

    Side effect: the hybrid model's collaborative model is refitted on the
    training fold only, so that no held-out rating is seen during training.

    Args:
        hybrid_model: The hybrid recommendation model to be evaluated.
        test_size (float): The proportion of the dataset to include in the test split.
        collab_weight (float, optional): The weight for collaborative filtering in the hybrid
            model. Defaults to None, which means 0.5.
        random_state (int, optional): The seed used by the random number generator for
            reproducibility. Defaults to 42.

    Returns:
        float: The RMSE of the hybrid model (NaN if the test split is empty).
    '''
    # Imported locally so the function is self-contained
    from surprise import Dataset, Reader
    from surprise.model_selection import train_test_split

    # Honour the requested weight instead of a hard-coded 0.5
    weight = 0.5 if collab_weight is None else collab_weight

    # Extract collaborative data from the model
    ratings = hybrid_model.collab_model.df
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['user_id', 'movieId', 'rating']], reader)

    # Split the data into training and test sets
    trainset, testset = train_test_split(data, test_size=test_size, random_state=random_state)

    # Train both models, then refit the collaborative one on the training fold:
    # train_models() fits on its own random split, which may overlap the test fold.
    hybrid_model.train_models()
    collab_algo = hybrid_model.collab_model.model
    collab_algo.fit(trainset)

    # Mean training-fold rating per raw movie ID
    train_means = {
        trainset.to_raw_iid(inner_iid): float(np.mean([r for _, r in item_ratings]))
        for inner_iid, item_ratings in trainset.ir.items()
    }

    # Title <-> movie ID lookups from the hybrid model's one-row-per-movie frame
    catalog = hybrid_model.df.drop_duplicates(subset='movieId')
    title_by_id = dict(zip(catalog['movieId'], catalog['title']))
    id_by_title = dict(zip(catalog['title'], catalog['movieId']))

    content_cache = {}

    def content_score(movie_id):
        '''Training-fold mean rating of the most genre-similar movie, or None.'''
        if movie_id not in content_cache:
            score = None
            title = title_by_id.get(movie_id)
            if title is not None:
                try:
                    similar = hybrid_model.content_model.get_recommendations(title, k=1)
                except (KeyError, IndexError):
                    similar = None
                if similar is not None and len(similar) > 0:
                    score = train_means.get(id_by_title.get(similar.iloc[0]))
            content_cache[movie_id] = score
        return content_cache[movie_id]

    # Generate predictions for the test set
    true_ratings = []
    pred_ratings = []

    print('Starting prediction generation...')
    for user_id, movie_id, true_rating in testset:
        collab_est = collab_algo.predict(user_id, movie_id).est
        content_est = content_score(movie_id)
        if content_est is None:
            content_est = collab_est
        true_ratings.append(true_rating)
        pred_ratings.append(weight * collab_est + (1 - weight) * content_est)

    print('Finished prediction generation.')

    if not true_ratings:
        return float('nan')

    # Calculate RMSE using numpy vectorized operations
    errors = np.array(true_ratings) - np.array(pred_ratings)
    return float(np.sqrt(np.mean(errors ** 2)))
RMSE measures prediction error, so a lower value is preferred. An RMSE of 1.2463 is above the project's success target of an RMSE below 0.9, which suggests that the model's predictions are only somewhat accurate and there is room for improvement. It indicates that while the hybrid model integrates multiple recommendation techniques, its predictions still have a notable level of error, and further refinement or optimization may be needed to enhance accuracy. " + ] + }, + { + "cell_type": "markdown", + "id": "b9dd9949-f5af-40fc-97a7-6274b47ae5fb", + "metadata": {}, + "source": [ + "#### Optimizing hybrid model parameters" + ] + }, + { + "cell_type": "markdown", + "id": "358c0076-07da-4785-bbfc-e033f45b9c57", + "metadata": {}, + "source": [ + "In this next step, different collaborative filtering weights are tested to determine their impact on the hybrid model's performance. The weights list contains values representing the proportion of influence from collaborative filtering in the hybrid model. The loop iterates over each weight, passes it to the `evaluate_rmse` function as `collab_weight` and evaluates the RMSE for each setting. The RMSE values are then printed alongside their corresponding weights, providing insights into how varying the balance between collaborative and content-based filtering affects the model's prediction accuracy. This process helps identify the optimal weight for achieving the best performance of the hybrid recommendation system."
+ ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "3b94a93f-7012-4b75-b72e-63b04b8ae40b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting prediction generation...\n", + "Finished prediction generation.\n", + "RMSE with collab_weight 0.2: 1.2559\n", + "Starting prediction generation...\n", + "Finished prediction generation.\n", + "RMSE with collab_weight 0.4: 1.2523\n", + "Starting prediction generation...\n", + "Finished prediction generation.\n", + "RMSE with collab_weight 0.6: 1.1263\n", + "Starting prediction generation...\n", + "Finished prediction generation.\n", + "RMSE with collab_weight 0.8: 1.1221\n" + ] + } + ], + "source": [ + "# List of different collaborative filtering weights to test\n", + "weights = [0.2, 0.4, 0.6, 0.8]\n", + "\n", + "# Iterate over each weight value\n", + "for weight in weights:\n", + " \n", + " # Evaluate the RMSE of the hybrid model with the current collaborative filtering weight\n", + " rmse = evaluate_rmse(hybrid_model, collab_weight=weight)\n", + " \n", + " # Print the RMSE result with the corresponding collaborative filtering weight\n", + " print(f\"RMSE with collab_weight {weight}: {rmse:.4f}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "215ef4ab-e717-40f1-9599-d86b5adbf0cf", + "metadata": {}, + "source": [ + "### Summary\n", + "The `evaluate_rmse` function evaluates the performance of the hybrid recommender model by calculating the Root Mean Squared Error (RMSE). It prepares test data, generates predictions using the hybrid model with a specified weight for collaborative filtering and then merges these predictions with the true ratings. By computing the RMSE, the function quantifies the accuracy of the model's predictions. As the weight assigned to collaborative filtering increases, starting from 0.2 up to 0.8, the RMSE values decrease indicating improved prediction accuracy. 
With a weight of 0.2 the RMSE is 1.2559 and with a weight of 0.4 it slightly improves to 1.2523. The RMSE significantly drops to 1.1263 with a weight of 0.6 and further decreases to 1.1221 with a weight of 0.8. This suggests that higher collaborative filtering weight tends to enhance the model's accuracy leading to lower prediction errors and helps in identifying the most effective balance between collaborative and content-based filtering for better performance.\n", + "The function below plots the different RMSE scores against the different collaborative weights in order to visualize the outcome.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "4a98a25c-2ed9-419c-9f04-e04670db469c", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def plot_rmse_vs_weight(weights, rmse_scores):\n", + " '''\n", + " Plot RMSE against collaborative filtering weights.\n", + "\n", + " Args:\n", + " weights (list): A list of collaborative filtering weights.\n", + " rmse_scores (list): A list of RMSE scores corresponding to the weights.\n", + "\n", + " Returns:\n", + " None\n", + " '''\n", + " plt.figure(figsize=(10, 6))\n", + " plt.plot(weights, rmse_scores, marker='o', linestyle='--', color='purple')\n", + "\n", + " # Add titles and labels\n", + " plt.title('RMSE vs. Collaborative Filtering Weight')\n", + " plt.xlabel('Collaborative Filtering Weight')\n", + " plt.ylabel('RMSE Score')\n", + " plt.grid(True)\n", + "\n", + " # Show the plot\n", + " plt.show()\n", + "\n", + "# Initializing weights and scores\n", + "weights = [0.2, 0.4, 0.6, 0.8]\n", + "rmse_scores = [1.2559, 1.2523, 1.1263, 1.1221]\n", + "plot_rmse_vs_weight(weights, rmse_scores)\n" + ] + }, + { + "cell_type": "markdown", + "id": "53328243-2794-41fc-aff0-0899b13c0652", + "metadata": {}, + "source": [ + "## Deployment on Streamlit" + ] + }, + { + "cell_type": "markdown", + "id": "1c0548a9-2cec-43b9-9cb1-03523fa09506", + "metadata": {}, + "source": [ + "Streamlit is a Python library that simplifies the creation of web applications for data science and machine learning projects. In this deployment, Streamlit is used to create an interactive movie recommendation system. The app features the collaborative filtering model using the SVD algorithm which was trained on user-movie ratings data. The model was chosen due to its better accuracy score as continuous improvement is performed on the hybrid model. 
\n" + ] + }, + { + "cell_type": "markdown", + "id": "986c0d7d-209c-4860-83fa-cec0add9783d", + "metadata": {}, + "source": [ + "# CONCLUSIONS" + ] + }, + { + "cell_type": "markdown", + "id": "f7cdfb43-e9bf-44c6-b095-8e41e7d91c6a", + "metadata": {}, + "source": [ + "After evaluating the collaborative filtering, content-based filtering and hybrid models, we can conclude that:\n", + "\n", + "The collaborative filtering model, with an RMSE of 0.86, performs exceptionally well by accurately predicting user preferences based on interaction data. In contrast, the hybrid model, which leverages all the features included in both the collaborative and content-based models, shows a higher RMSE, indicating it may not capture user preferences as effectively. The hybrid model, in theory, benefits from integrating collaborative filtering more heavily, achieving lower RMSE scores as the collaborative weight increases. This suggests that a hybrid approach with a focus on collaborative filtering provides superior accuracy and recommendation quality compared to content-based filtering alone. Note, however, that `evaluate_rmse` currently passes a fixed `collab_weight=0.5` to `get_hybrid_recommendations` regardless of the weight it is given, so this trend should be re-verified once the supplied weight is actually forwarded.\n", + "By refining the collaborative filtering weight and exploring additional evaluation methods, the recommendation system can be further optimized to deliver more accurate and relevant movie recommendations." + ] + }, + { + "cell_type": "markdown", + "id": "8b70bea7-8f23-449c-929b-439f1f9c67a4", + "metadata": {}, + "source": [ + "# RECOMMENDATIONS" + ] + }, + { + "cell_type": "markdown", + "id": "71a175f9-408c-4272-b74e-ed1406e7d38f", + "metadata": {}, + "source": [ + "1. Experiment further with collaborative filtering weights in finer increments around the optimal value (e.g., between 0.6 and 0.8) to provide additional insights into achieving even better performance.\n", + "\n", + "2. Implement cross-validation to ensure that the observed improvements in RMSE are consistent across different subsets of the data. 
This helps in verifying that the results are not due to random chance or overfitting.\n", + "\n", + "3. Enhance the content-based model by incorporating more detailed item features such as plot summaries, which could provide value, especially for users with limited interaction history.\n", + "\n", + "4. Explore other hyperparameters and configurations for both the collaborative filtering and content-based components of the hybrid model to potentially enhance performance.\n", + "\n", + "5. Evaluate the model using additional metrics such as Mean Average Precision (MAP) or Precision@K to gain a more comprehensive understanding of its recommendation quality.\n", + "\n", + "6. Incorporate user feedback and real-world testing to validate the model's effectiveness in practical scenarios and ensure it aligns with user preferences and expectations.\n", + "\n", + "7. Regularly evaluate the recommendation system with updated data and metrics to ensure it adapts to changing user preferences and content.\n", + "\n", + "8. Integrate additional techniques such as deep learning-based models to further enhance the system's capabilities and address any remaining limitations."
+ ] } ], "metadata": { diff --git a/movie_recommendor.ipynb b/movie_recommendor.ipynb index 752b0b8..b433d82 100644 --- a/movie_recommendor.ipynb +++ b/movie_recommendor.ipynb @@ -961,7 +961,7 @@ " Renames a column in the DataFrame.\n", "\n", " Args:\n", - " df (pandas.DataFrame): The DataFrame containing the column to rename.\n", + " df: The DataFrame containing the column to rename.\n", " current_name (str): The current name of the column.\n", " new_name (str): The new name for the column.\n", "\n", @@ -1403,6 +1403,7 @@ "source": [ "# Genre Processing: Split the genres in the `movies.csv` dataset into lists for easier analysis\n", "data_explorer.merged_data['genres']=[row.strip().lower().replace('|',', ') for row in data_explorer.merged_data['genres']]\n", + "# Display first 5 rows\n", "data_explorer.merged_data.head()" ] }, @@ -1574,6 +1575,7 @@ "source": [ "# Convert user_id from float to int\n", "data_explorer.merged_data['user_id'] = data_explorer.merged_data['user_id'].astype(int)\n", + "# Display converted data type\n", "data_explorer.merged_data['user_id'].dtype" ] }, @@ -1724,6 +1726,7 @@ } ], "source": [ + "# Sanity check\n", "df = data_explorer.merged_data\n", "df.head()" ] @@ -1881,7 +1884,7 @@ " '''\n", " Initializes the UnivariateAnalysis class with a DataFrame.\n", " \n", - " Parameters:\n", + " Args:\n", " df (DataFrame): A pandas DataFrame containing the movie data to analyze.\n", " '''\n", " self.df = df\n", @@ -2228,7 +2231,7 @@ " and creates a bar plot to visualize the top n titles based on the total \n", " number of ratings received.\n", " \n", - " Parameters:\n", + " Args:\n", " top_n (int): The number of top-rated titles to display. 
Default is 20.\n", " '''\n", " \n", @@ -2366,6 +2369,7 @@ "predictions = dummy_model.test(testset)\n", "baseline_rmse = accuracy.rmse(predictions)\n", "\n", + "# Print the RMSE\n", "print(f\"Baseline Model RMSE: {baseline_rmse}\")" ] }, @@ -2757,10 +2761,17 @@ } ], "source": [ + "# Initialize SVD model with specified hyperparameters\n", "svd_model = SVD(n_factors=100, n_epochs=30, lr_all=0.01, reg_all=0.1)\n", + "\n", + "# Perform cross-validation on the SVD model using 5 folds\n", + "# Measures RMSE (Root Mean Square Error) for evaluation\n", "cross_val_results = cross_validate(svd_model, data, measures=['RMSE'], cv=5, verbose=True)\n", "\n", + "# Print the mean RMSE from the cross-validation results\n", "print(f\"SVD Model Mean RMSE: {np.mean(cross_val_results['test_rmse'])}\")\n", + "\n", + "# Print the standard deviation of RMSE from the results\n", "print(f\"SVD Model Standard Deviation RMSE: {np.std(cross_val_results['test_rmse'])}\")\n" ] }, @@ -2891,8 +2902,8 @@ " '''\n", " Initializes the MovieRecommender with a DataFrame containing movie data.\n", "\n", - " Parameters:\n", - " df (pd.DataFrame): DataFrame containing movie information with columns 'user_id', 'movieId', 'rating', 'title', 'release_year', and 'genres'.\n", + " Args:\n", + " df (pd.DataFrame): DataFrame containing movie information with columns 'user_id', 'movieId', 'rating', 'title', 'release_year', and 'genres'.\n", " '''\n", " self.df = collab_df\n", " self.model = None\n", @@ -2920,6 +2931,12 @@ " def get_user_ratings(self, num_movies=5):\n", " '''\n", " Collects ratings from the user for a specified number of movies.\n", + "\n", + " Args:\n", + " num_movies (int): Number of movies to rate.\n", + "\n", + " Returns:\n", + " list: List of tuples containing movieId and user rating.\n", " '''\n", " \n", " # Initialize an empty list to store user ratings\n", @@ -2948,7 +2965,16 @@ " def get_recommendations(self, user_ratings, n=5, genre=None):\n", " '''\n", " Provides movie recommendations 
based on user ratings and optional genre filtering.\n", + "\n", + " Args:\n", + " user_ratings (list): List of tuples containing movieId and user rating.\n", + " n (int): Number of recommendations to provide.\n", + " genre (str, optional): Genre to filter recommendations by.\n", + "\n", + " Returns:\n", + " list: List of recommended movies with their predicted ratings.\n", " '''\n", + " \n", " # Generate a unique user ID for a new user who is providing ratings for the first time\n", " new_user_id = self.df['user_id'].max() + 1\n", " \n", @@ -2979,8 +3005,10 @@ " def print_recommendations(self, recommendations):\n", " '''\n", " Prints the recommended movies with their predicted ratings.\n", + "\n", + " Args:\n", + " recommendations (list): List of recommended movies with their predicted ratings.\n", " '''\n", - " \n", " # Enumerate through the sorted recommendations with an index starting at 1\n", " for i, (movie_id, predicted_rating) in enumerate(recommendations, 1):\n", " # Retrieve the movie details from the DataFrame using the movie_id\n", @@ -2995,6 +3023,11 @@ " def recommend_movies(self, num_ratings=5, num_recommendations=5, genre=None):\n", " '''\n", " Recommends movies based on user input ratings and optionally filters by genre.\n", + " \n", + " Args:\n", + " num_ratings (int): Number of movies to rate.\n", + " num_recommendations (int): Number of recommendations to provide.\n", + " genre (str, optional): Genre to filter recommendations by.\n", " '''\n", " \n", " # Retrieve the user's ratings based on the number of ratings specified\n", @@ -3105,6 +3138,9 @@ " def train_model(self):\n", " '''\n", " Train the content-based model by creating a TF-IDF matrix and calculating cosine similarity.\n", + "\n", + " Args: \n", + " self (ContentBasedModel): The instance of the ContentBasedModel class.\n", " '''\n", " # Define the TF-IDF vectorizer\n", " tfidf = TfidfVectorizer(stop_words='english')\n", @@ -3341,7 +3377,7 @@ " Get user ratings for a specified number of 
random movies.\n", "\n", " Args:\n", - " num_movies (int): Number of movies to rate.\n", + " num_movies (int): Number of movies to rate. Defaults to 5.\n", "\n", " Returns:\n", " list: List of tuples containing movie IDs and ratings.\n", @@ -3453,6 +3489,9 @@ " num_ratings (int): Number of movies to rate.\n", " num_recommendations (int): Number of recommendations to provide.\n", " collab_weight (float): Weight for collaborative filtering (0 to 1).\n", + " \n", + " Returns:\n", + " list: A list of recommended movies based on the hybrid model.\n", " '''\n", " # Get user ratings for a specified number of movies\n", " user_ratings = self.get_user_ratings(num_ratings)\n", @@ -3518,9 +3557,18 @@ "from surprise.model_selection import train_test_split\n", "\n", "def evaluate_rmse(hybrid_model, test_size=0.2, collab_weight = None, random_state=42):\n", - " \"\"\"\n", + " '''\n", " Evaluate the RMSE of the hybrid model.\n", - " \"\"\"\n", + "\n", + " Args:\n", + " hybrid_model: The hybrid recommendation model to be evaluated.\n", + " test_size (float): The proportion of the dataset to include in the test split.\n", + " collab_weight (float, optional): The weight for collaborative filtering in the hybrid model. Defaults to None.\n", + " random_state (int, optional): The seed used by the random number generator for reproducibility. 
Defaults to 42.\n", + "\n", + " Returns:\n", + " float: The RMSE of the hybrid model.\n", + " '''\n", " # Extract collaborative data from the model\n", " collab_df = hybrid_model.collab_model.df\n", " \n", @@ -3656,8 +3704,11 @@ " Plot RMSE against collaborative filtering weights.\n", "\n", " Args:\n", - " weights (list): List of collaborative filtering weights.\n", - " rmse_scores (list): List of RMSE scores corresponding to the weights.\n", + " weights (list): A list of collaborative filtering weights.\n", + " rmse_scores (list): A list of RMSE scores corresponding to the weights.\n", + "\n", + " Returns:\n", + " None\n", " '''\n", " plt.figure(figsize=(10, 6))\n", " plt.plot(weights, rmse_scores, marker='o', linestyle='--', color='purple')\n",