diff --git a/examples/01_prepare_data/data_split.ipynb b/examples/01_prepare_data/data_split.ipynb index 8b74bb3ae2..a9fee08ff7 100644 --- a/examples/01_prepare_data/data_split.ipynb +++ b/examples/01_prepare_data/data_split.ipynb @@ -113,7 +113,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|████████████████████████████████████████████████████████████████████████████| 1.93k/1.93k [00:08<00:00, 217KB/s]\n" + "100%|██████████████████████████████████████████████████████████████████████████| 1.93k/1.93k [00:01<00:00, 1.82kKB/s]\n" ] } ], diff --git a/examples/02_model_collaborative_filtering/baseline_deep_dive.ipynb b/examples/02_model_collaborative_filtering/baseline_deep_dive.ipynb index 4c658fc6b5..66dca5045d 100644 --- a/examples/02_model_collaborative_filtering/baseline_deep_dive.ipynb +++ b/examples/02_model_collaborative_filtering/baseline_deep_dive.ipynb @@ -59,8 +59,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Feb 11 2019, 15:03:47) [MSC v.1915 64 bit (AMD64)]\n", - "Pandas version: 0.24.1\n" + "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n", + "[GCC 11.2.0]\n", + "Pandas version: 1.5.3\n" ] } ], @@ -69,6 +70,7 @@ "\n", "import itertools\n", "import pandas as pd\n", + "import scrapbook as sb\n", "\n", "from recommenders.utils.notebook_utils import is_jupyter\n", "from recommenders.datasets import movielens\n", @@ -79,8 +81,8 @@ " map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n", ")\n", "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"Pandas version: {}\".format(pd.__version__))" + "print(f\"System version: {sys.version}\")\n", + "print(f\"Pandas version: {pd.__version__}\")" ] }, { @@ -100,7 +102,8 @@ }, "outputs": [], "source": [ - "MOVIELENS_DATA_SIZE = '100k'" + "MOVIELENS_DATA_SIZE = \"100k\"\n", + "TOP_K = 10" ] }, { @@ -108,6 +111,13 @@ "execution_count": 3, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████| 4.81k/4.81k [00:09<00:00, 495KB/s]\n" + ] + }, { "data": { "text/html": [ @@ -184,14 +194,15 @@ "4 166 346 1.0 886397596" ] }, + "execution_count": 3, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ "data = movielens.load_pandas_df(\n", - " size=MOVIELENS_DATA_SIZE,\n", - " header=['UserId', 'MovieId', 'Rating', 'Timestamp']\n", + " size=MOVIELENS_DATA_SIZE, \n", + " header=[\"UserId\", \"MovieId\", \"Rating\", \"Timestamp\"]\n", ")\n", "\n", "data.head()" @@ -284,22 +295,23 @@ "4 5 2.868217" ] }, + "execution_count": 5, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ "# Calculate avg ratings from the training set\n", - "users_ratings = train.groupby(['UserId'])['Rating'].mean()\n", + "users_ratings = train.groupby([\"UserId\"])[\"Rating\"].mean()\n", "users_ratings = users_ratings.to_frame().reset_index()\n", - "users_ratings.rename(columns = {'Rating': 'AvgRating'}, inplace = True)\n", + "users_ratings.rename(columns={\"Rating\": \"AvgRating\"}, inplace=True)\n", "\n", "users_ratings.head()" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -384,15 +396,16 @@ "12219 1 63 2.0 878543196 3.69697" ] }, + "execution_count": 6, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ "# Generate prediction for the test set\n", - "baseline_predictions = pd.merge(test, users_ratings, on=['UserId'], how='inner')\n", + "baseline_predictions = pd.merge(test, users_ratings, on=[\"UserId\"], how=\"inner\")\n", "\n", - "baseline_predictions.loc[baseline_predictions['UserId'] == 1].head()" + "baseline_predictions.loc[baseline_predictions[\"UserId\"] == 1].head()" ] }, { @@ -404,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -419,13 +432,13 @@ } ], "source": [ - "baseline_predictions = baseline_predictions[['UserId', 'MovieId', 'AvgRating']]\n", + "baseline_predictions = baseline_predictions[[\"UserId\", \"MovieId\", \"AvgRating\"]]\n", "\n", "cols = {\n", - " 'col_user': 'UserId',\n", - " 'col_item': 'MovieId',\n", - " 'col_rating': 'Rating',\n", - " 'col_prediction': 'AvgRating',\n", + " \"col_user\": \"UserId\",\n", + " \"col_item\": \"MovieId\",\n", + " \"col_rating\": \"Rating\",\n", + " \"col_prediction\": \"AvgRating\",\n", "}\n", "\n", "eval_rmse = rmse(test, baseline_predictions, **cols)\n", @@ -459,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -526,19 +539,20 @@ "4 288 371" ] }, + "execution_count": 8, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "item_counts = train['MovieId'].value_counts().to_frame().reset_index()\n", - "item_counts.columns = ['MovieId', 'Count']\n", + "item_counts = train[\"MovieId\"].value_counts().to_frame().reset_index()\n", + "item_counts.columns = [\"MovieId\", \"Count\"]\n", "item_counts.head()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -551,7 +565,7 @@ } ], "source": [ - "user_item_col = ['UserId', 'MovieId']\n", + "user_item_col = [\"UserId\", \"MovieId\"]\n", "\n", "# Cross join users and items\n", "test_users = test['UserId'].unique()\n", @@ -568,7 +582,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -641,8 +655,9 @@ "4 50 419 598" ] }, + "execution_count": 10, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ @@ -653,29 +668,27 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "MAP:\t0.055007\n", - "NDCG@K:\t0.252864\n", + "MAP:\t0.055008\n", + "NDCG@K:\t0.252867\n", "Precision@K:\t0.224628\n", "Recall@K:\t0.111736\n" ] } ], "source": [ - "k = 10\n", - "\n", - "cols['col_prediction'] = 'Count'\n", + "cols[\"col_prediction\"] = \"Count\"\n", "\n", - "eval_map = map_at_k(test, baseline_recommendations, k=k, **cols)\n", - "eval_ndcg = ndcg_at_k(test, baseline_recommendations, k=k, **cols)\n", - "eval_precision = precision_at_k(test, baseline_recommendations, k=k, **cols)\n", - "eval_recall = recall_at_k(test, baseline_recommendations, k=k, **cols)\n", + "eval_map = map_at_k(test, baseline_recommendations, k=TOP_K, **cols)\n", + "eval_ndcg = ndcg_at_k(test, baseline_recommendations, k=TOP_K, **cols)\n", + "eval_precision = precision_at_k(test, baseline_recommendations, k=TOP_K, **cols)\n", + "eval_recall = recall_at_k(test, baseline_recommendations, k=TOP_K, **cols)\n", "\n", "print(\"MAP:\\t%f\" % eval_map,\n", " \"NDCG@K:\\t%f\" % eval_ndcg,\n", @@ -699,87 +712,157 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { - "application/papermill.record+json": { - "map": 0.055007342636635974 + "application/scrapbook.scrap.json+json": { + "data": 0.05500831263949166, + "encoder": "json", + "name": "map", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "map" } }, - "metadata": {}, "output_type": "display_data" }, { "data": { - "application/papermill.record+json": { - "ndcg": 0.25286402361020544 + "application/scrapbook.scrap.json+json": { + "data": 0.2528673962200594, + "encoder": "json", + "name": "ndcg", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "ndcg" } }, - "metadata": {}, "output_type": "display_data" }, { "data": { - "application/papermill.record+json": { - "precision": 0.22462845010615715 + "application/scrapbook.scrap.json+json": { + "data": 0.22462845010615715, + "encoder": "json", + "name": "precision", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "precision" } }, - "metadata": {}, "output_type": "display_data" }, { "data": { - "application/papermill.record+json": { - "recall": 0.1117356507425933 + "application/scrapbook.scrap.json+json": { + "data": 0.1117356507425933, + "encoder": "json", + "name": "recall", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "recall" } }, - "metadata": {}, "output_type": "display_data" }, { "data": { - "application/papermill.record+json": { - "rmse": 1.044885130655045 + "application/scrapbook.scrap.json+json": { + "data": 1.044885130655045, + "encoder": "json", + "name": "rmse", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "rmse" } }, - "metadata": {}, "output_type": "display_data" }, { "data": { - "application/papermill.record+json": { - "mae": 0.8369250150730534 + "application/scrapbook.scrap.json+json": { + "data": 0.8369250150730534, + "encoder": "json", + "name": "mae", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "mae" } }, - "metadata": {}, "output_type": "display_data" }, { "data": { - "application/papermill.record+json": { - "exp_var": 0.1364955485850292 + "application/scrapbook.scrap.json+json": { + "data": 0.1364955485850292, + "encoder": "json", + "name": "exp_var", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "exp_var" } }, - "metadata": {}, "output_type": "display_data" }, { "data": { - "application/papermill.record+json": { - "rsquared": 0.13649128638749664 + "application/scrapbook.scrap.json+json": { + "data": 0.13649128638749664, + "encoder": "json", + "name": "rsquared", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "rsquared" } }, - "metadata": {}, "output_type": "display_data" } ], "source": [ "if is_jupyter():\n", - " # Record results with papermill for unit-tests\n", - " import papermill as pm\n", - " import scrapbook as sb\n", + " # Record results with papermill and scrapbook for tests\n", " sb.glue(\"map\", eval_map)\n", " sb.glue(\"ndcg\", eval_ndcg)\n", " sb.glue(\"precision\", eval_precision)\n", @@ -799,20 +882,14 @@ "[[1](https://dl.acm.org/citation.cfm?id=1401944)] Yehuda Koren,\tFactorization meets the neighborhood: a multifaceted collaborative filtering model, KDD '08 pp. 426-434 2008. \n", "[[2](https://surprise.readthedocs.io/en/stable/basic_algorithms.html)] Surprise lib, Basic algorithms" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { + "celltoolbar": "Tags", "kernelspec": { - "display_name": "reco_base", + "display_name": "Python (recommenders)", "language": "python", - "name": "reco_base" + "name": "recommenders" }, "language_info": { "codemirror_mode": { @@ -824,9 +901,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/02_model_collaborative_filtering/sar_deep_dive.ipynb b/examples/02_model_collaborative_filtering/sar_deep_dive.ipynb index 77bcc4e0d2..f46fc49486 100644 --- a/examples/02_model_collaborative_filtering/sar_deep_dive.ipynb +++ b/examples/02_model_collaborative_filtering/sar_deep_dive.ipynb @@ -107,36 +107,25 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", - "[GCC 7.3.0]\n", - "Pandas version: 0.24.2\n" - ] - } - ], + "outputs": [], "source": [ "# set the environment path to find Recommenders\n", "import sys\n", - "\n", - "import itertools\n", "import logging\n", - "import os\n", - "\n", + "import scipy\n", "import numpy as np\n", "import pandas as pd\n", - "import papermill as pm\n", + "import scrapbook as sb\n", "\n", "from recommenders.datasets import movielens\n", "from recommenders.datasets.python_splitters import python_stratified_split\n", "from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n", "from recommenders.models.sar import SAR\n", "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"Pandas version: {}\".format(pd.__version__))" + "print(f\"System version: {sys.version}\")\n", + "print(f\"Pandas version: {pd.__version__}\")\n", + "print(f\"NumPy version: {np.__version__}\")\n", + "print(f\"SciPy version: {scipy.__version__}\")" ] }, { @@ -149,11 +138,25 @@ }, "outputs": [], "source": [ - "# top k items to recommend\n", + "# Top k items to recommend\n", "TOP_K = 10\n", "\n", "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = '100k'" + "MOVIELENS_DATA_SIZE = \"100k\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# set log level to INFO\n", + "logging.basicConfig(\n", + " level=logging.DEBUG,\n", + " format=\"%(asctime)s %(levelname)-8s %(message)s\",\n", + " datefmt=\"%Y-%m-%d %H:%M:%S\",\n", + ")" ] }, { @@ -172,112 +175,18 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "4.93MB [00:02, 2.36MB/s] \n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
UserIdMovieIdRatingTimestampTitle
01962423.0881250949Kolya (1996)
1632423.0875747190Kolya (1996)
22262425.0883888671Kolya (1996)
31542423.0879138235Kolya (1996)
43062425.0876503793Kolya (1996)
\n", - "
" - ], - "text/plain": [ - " UserId MovieId Rating Timestamp Title\n", - "0 196 242 3.0 881250949 Kolya (1996)\n", - "1 63 242 3.0 875747190 Kolya (1996)\n", - "2 226 242 5.0 883888671 Kolya (1996)\n", - "3 154 242 3.0 879138235 Kolya (1996)\n", - "4 306 242 5.0 876503793 Kolya (1996)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "data = movielens.load_pandas_df(\n", " size=MOVIELENS_DATA_SIZE,\n", - " header=['UserId', 'MovieId', 'Rating', 'Timestamp'],\n", - " title_col='Title'\n", + " header=[\"UserId\", \"MovieId\", \"Rating\", \"Timestamp\"],\n", + " title_col=\"Title\",\n", ")\n", "\n", - "# Convert the float precision to 32-bit in order to reduce memory consumption \n", - "data.loc[:, 'Rating'] = data['Rating'].astype(np.float32)\n", + "# Convert the float precision to 32-bit in order to reduce memory consumption\n", + "data[\"Rating\"] = data[\"Rating\"].astype(np.float32)\n", "\n", "data.head()" ] @@ -293,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -308,11 +217,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "train, test = python_stratified_split(data, ratio=0.75, col_user=header[\"col_user\"], col_item=header[\"col_item\"], seed=42)" + "train, test = python_stratified_split(\n", + " data, ratio=0.75, col_user=header[\"col_user\"], col_item=header[\"col_item\"], seed=42\n", + ")\n" ] }, { @@ -331,14 +242,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "# set log level to INFO\n", - "logging.basicConfig(level=logging.DEBUG, \n", - " format='%(asctime)s %(levelname)-8s %(message)s')\n", - "\n", "model = SAR(\n", " similarity_type=\"jaccard\", \n", " time_decay_coefficient=30, \n", @@ -350,21 +257,21 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2019-05-28 22:40:09,133 INFO Collecting user affinity matrix\n", - "2019-05-28 22:40:09,137 INFO Calculating time-decayed affinities\n", - "2019-05-28 22:40:09,178 INFO Creating index columns\n", - "2019-05-28 22:40:09,188 INFO Building user affinity sparse matrix\n", - "2019-05-28 22:40:09,194 INFO Calculating item co-occurrence\n", - "2019-05-28 22:40:09,412 INFO Calculating item similarity\n", - "2019-05-28 22:40:09,413 INFO Using jaccard based similarity\n", - "2019-05-28 22:40:09,534 INFO Done training\n" + "2023-07-04 09:49:54 INFO Collecting user affinity matrix\n", + "2023-07-04 09:49:54 INFO Calculating time-decayed affinities\n", + "2023-07-04 09:49:54 INFO Creating index columns\n", + "2023-07-04 09:49:54 INFO Building user affinity sparse matrix\n", + "2023-07-04 09:49:54 INFO Calculating item co-occurrence\n", + "2023-07-04 09:49:55 INFO Calculating item similarity\n", + "2023-07-04 09:49:55 INFO Using jaccard based similarity\n", + "2023-07-04 09:49:55 INFO Done training\n" ] } ], @@ -374,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 18, "metadata": { "scrolled": true }, @@ -383,8 +290,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-05-28 22:40:09,546 INFO Calculating recommendation scores\n", - "2019-05-28 22:40:09,641 INFO Removing seen items\n" + "2023-07-04 09:49:57 INFO Calculating recommendation scores\n", + "2023-07-04 09:49:57 INFO Removing seen items\n" ] } ], @@ -401,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 19, "metadata": { "scrolled": true }, @@ -435,74 +342,74 @@ " \n", " \n", " \n", - " 9424\n", + " 9420\n", " 943\n", - " 82\n", - " 21.313228\n", - " Jurassic Park (1993)\n", + " 176\n", + " 21.325644\n", + " Aliens (1986)\n", " \n", " \n", - " 9425\n", + " 9421\n", " 943\n", - " 403\n", - " 21.158839\n", - " Batman (1989)\n", + " 89\n", + " 20.901408\n", + " Blade Runner (1982)\n", " \n", " \n", - " 9426\n", + " 9422\n", " 943\n", - " 568\n", - " 20.962922\n", - " Speed (1994)\n", + " 82\n", + " 20.688100\n", + " Jurassic Park (1993)\n", " \n", " \n", - " 9428\n", + " 9423\n", " 943\n", - " 423\n", - " 20.162170\n", - " E.T. the Extra-Terrestrial (1982)\n", + " 172\n", + " 20.287318\n", + " Empire Strikes Back, The (1980)\n", " \n", " \n", - " 9427\n", + " 9424\n", " 943\n", - " 89\n", - " 19.890513\n", - " Blade Runner (1982)\n", + " 423\n", + " 20.256682\n", + " E.T. the Extra-Terrestrial (1982)\n", " \n", " \n", - " 9429\n", + " 9425\n", " 943\n", - " 393\n", - " 19.832944\n", - " Mrs. Doubtfire (1993)\n", + " 195\n", + " 20.250996\n", + " Terminator, The (1984)\n", " \n", " \n", - " 9423\n", + " 9426\n", " 943\n", - " 11\n", - " 19.570244\n", - " Seven (Se7en) (1995)\n", + " 202\n", + " 20.145059\n", + " Groundhog Day (1993)\n", " \n", " \n", - " 9422\n", + " 9427\n", " 943\n", - " 71\n", - " 19.553877\n", - " Lion King, The (1994)\n", + " 68\n", + " 19.983884\n", + " Crow, The (1994)\n", " \n", " \n", - " 9421\n", + " 9428\n", " 943\n", - " 202\n", - " 19.422129\n", - " Groundhog Day (1993)\n", + " 566\n", + " 19.820856\n", + " Clear and Present Danger (1994)\n", " \n", " \n", - " 9420\n", + " 9429\n", " 943\n", - " 238\n", - " 19.115604\n", - " Raising Arizona (1987)\n", + " 550\n", + " 19.804157\n", + " Die Hard: With a Vengeance (1995)\n", " \n", " \n", "\n", @@ -510,27 +417,31 @@ ], "text/plain": [ " UserId MovieId Prediction Title\n", - "9424 943 82 21.313228 Jurassic Park (1993)\n", - "9425 943 403 21.158839 Batman (1989)\n", - "9426 943 568 20.962922 Speed (1994)\n", - "9428 943 423 20.162170 E.T. the Extra-Terrestrial (1982)\n", - "9427 943 89 19.890513 Blade Runner (1982)\n", - "9429 943 393 19.832944 Mrs. Doubtfire (1993)\n", - "9423 943 11 19.570244 Seven (Se7en) (1995)\n", - "9422 943 71 19.553877 Lion King, The (1994)\n", - "9421 943 202 19.422129 Groundhog Day (1993)\n", - "9420 943 238 19.115604 Raising Arizona (1987)" + "9420 943 176 21.325644 Aliens (1986)\n", + "9421 943 89 20.901408 Blade Runner (1982)\n", + "9422 943 82 20.688100 Jurassic Park (1993)\n", + "9423 943 172 20.287318 Empire Strikes Back, The (1980)\n", + "9424 943 423 20.256682 E.T. the Extra-Terrestrial (1982)\n", + "9425 943 195 20.250996 Terminator, The (1984)\n", + "9426 943 202 20.145059 Groundhog Day (1993)\n", + "9427 943 68 19.983884 Crow, The (1994)\n", + "9428 943 566 19.820856 Clear and Present Danger (1994)\n", + "9429 943 550 19.804157 Die Hard: With a Vengeance (1995)" ] }, + "execution_count": 19, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "top_k_with_titles = (top_k.join(data[['MovieId', 'Title']].drop_duplicates().set_index('MovieId'), \n", - " on='MovieId', \n", - " how='inner').sort_values(by=['UserId', 'Prediction'], ascending=False))\n", - "display(top_k_with_titles.head(10))" + "top_k_with_titles = top_k.join(\n", + " data[[\"MovieId\", \"Title\"]].drop_duplicates().set_index(\"MovieId\"),\n", + " on=\"MovieId\",\n", + " how=\"inner\",\n", + ").sort_values(by=[\"UserId\", \"Prediction\"], ascending=False)\n", + "\n", + "top_k_with_titles.head(10)" ] }, { @@ -541,23 +452,25 @@ "\n", "It should be known that the recommendation scores generated by multiplying the item similarity matrix $S$ and the user affinity matrix $A$ **DOES NOT** have the same scale with the original explicit ratings in the movielens dataset. That is to say, SAR algorithm is meant for the task of *recommending relevent items to users* rather than *predicting explicit ratings for user-item pairs*. \n", "\n", - "To this end, ranking metrics like precision@k, recall@k, etc., are more applicable to evaluate SAR algorithm. The following illustrates how to evaluate SAR model by using the evaluation functions provided in the `recommenders`." + "To this end, ranking metrics like precision@k, recall@k, etc., are more applicable to evaluate SAR algorithm. The following illustrates how to evaluate SAR model by using the evaluation functions provided in Recommenders library." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# all ranking metrics have the same arguments\n", "args = [test, top_k]\n", - "kwargs = dict(col_user='UserId', \n", - " col_item='MovieId', \n", - " col_rating='Rating', \n", - " col_prediction='Prediction', \n", - " relevancy_method='top_k', \n", - " k=TOP_K)\n", + "kwargs = dict(\n", + " col_user=\"UserId\",\n", + " col_item=\"MovieId\",\n", + " col_rating=\"Rating\",\n", + " col_prediction=\"Prediction\",\n", + " relevancy_method=\"top_k\",\n", + " k=TOP_K,\n", + ")\n", "\n", "eval_map = map_at_k(*args, **kwargs)\n", "eval_ndcg = ndcg_at_k(*args, **kwargs)\n", @@ -567,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -576,10 +489,10 @@ "text": [ "Model:\n", "Top K:\t\t 10\n", - "MAP:\t\t 0.095544\n", - "NDCG:\t\t 0.350232\n", - "Precision@K:\t 0.305726\n", - "Recall@K:\t 0.164690\n" + "MAP:\t\t 0.113796\n", + "NDCG:\t\t 0.384809\n", + "Precision@K:\t 0.331707\n", + "Recall@K:\t 0.182571\n" ] } ], @@ -592,6 +505,19 @@ " f\"Recall@K:\\t {eval_recall:f}\", sep='\\n')" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Record results for tests - ignore this cell\n", + "sb.glue(\"map\", eval_map)\n", + "sb.glue(\"ndcg\", eval_ndcg)\n", + "sb.glue(\"precision\", eval_precision)\n", + "sb.glue(\"recall\", eval_recall)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -606,10 +532,11 @@ } ], "metadata": { + "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python (reco_base)", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "reco_base" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -621,7 +548,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/examples/02_model_collaborative_filtering/surprise_svd_deep_dive.ipynb b/examples/02_model_collaborative_filtering/surprise_svd_deep_dive.ipynb index c5e72c7e26..69310d1bb7 100644 --- a/examples/02_model_collaborative_filtering/surprise_svd_deep_dive.ipynb +++ b/examples/02_model_collaborative_filtering/surprise_svd_deep_dive.ipynb @@ -91,17 +91,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Feb 11 2019, 15:03:47) [MSC v.1915 64 bit (AMD64)]\n", - "Surprise version: 1.0.6\n" + "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n", + "[GCC 11.2.0]\n", + "Surprise version: 1.1.3\n" ] } ], "source": [ - "import sys\n", - "import os\n", + "import sys\n", "import surprise\n", - "import papermill as pm\n", "import scrapbook as sb\n", "import pandas as pd\n", "\n", @@ -112,8 +111,8 @@ " recall_at_k, get_top_k_items)\n", "from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions\n", "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"Surprise version: {}\".format(surprise.__version__))" + "print(f\"System version: {sys.version}\")\n", + "print(f\"Surprise version: {surprise.__version__}\")" ] }, { @@ -126,6 +125,9 @@ }, "outputs": [], "source": [ + "# Top k items to recommend\n", + "TOP_K = 10\n", + "\n", "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", "MOVIELENS_DATA_SIZE = '100k'" ] @@ -142,6 +144,13 @@ "execution_count": 3, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████| 4.81k/4.81k [00:07<00:00, 646KB/s]\n" + ] + }, { "data": { "text/html": [ @@ -263,7 +272,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -332,7 +341,7 @@ "Processing epoch 27\n", "Processing epoch 28\n", "Processing epoch 29\n", - "Took 19.879321813583374 seconds for training.\n" + "Took 2.276676100009354 seconds for training.\n" ] } ], @@ -342,7 +351,7 @@ "with Timer() as train_time:\n", " svd.fit(train_set)\n", "\n", - "print(\"Took {} seconds for training.\".format(train_time.interval))" + "print(f\"Took {train_time.interval} seconds for training.\")" ] }, { @@ -356,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -388,33 +397,33 @@ " \n", " \n", " 0\n", - " 600.0\n", - " 651.0\n", - " 4.119\n", + " 877\n", + " 381\n", + " 3.698217\n", " \n", " \n", " 1\n", - " 607.0\n", - " 494.0\n", - " 3.728\n", + " 815\n", + " 602\n", + " 3.590957\n", " \n", " \n", " 2\n", - " 875.0\n", - " 1103.0\n", - " 4.225\n", + " 94\n", + " 431\n", + " 3.841149\n", " \n", " \n", " 3\n", - " 648.0\n", - " 238.0\n", - " 4.225\n", + " 416\n", + " 875\n", + " 2.642248\n", " \n", " \n", " 4\n", - " 113.0\n", - " 273.0\n", - " 4.043\n", + " 500\n", + " 182\n", + " 4.384139\n", " \n", " \n", "\n", @@ -422,14 +431,14 @@ ], "text/plain": [ " userID itemID prediction\n", - "0 600.0 651.0 4.119\n", - "1 607.0 494.0 3.728\n", - "2 875.0 1103.0 4.225\n", - "3 648.0 238.0 4.225\n", - "4 113.0 273.0 4.043" + "0 877 381 3.698217\n", + "1 815 602 3.590957\n", + "2 94 431 3.841149\n", + "3 416 875 2.642248\n", + "4 500 182 4.384139" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -450,14 +459,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Took 28.51998782157898 seconds for prediction.\n" + "Took 16.501801499980502 seconds for prediction.\n" ] } ], @@ -465,12 +474,12 @@ "with Timer() as test_time:\n", " all_predictions = compute_ranking_predictions(svd, train, usercol='userID', itemcol='itemID', remove_seen=True)\n", " \n", - "print(\"Took {} seconds for prediction.\".format(test_time.interval))" + "print(f\"Took {test_time.interval} seconds for prediction.\")" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -502,33 +511,33 @@ " \n", " \n", " 75000\n", - " 496\n", - " 101\n", - " 2.981\n", + " 811\n", + " 755\n", + " 4.090273\n", " \n", " \n", " 75001\n", - " 496\n", - " 471\n", - " 3.196\n", + " 811\n", + " 287\n", + " 4.557071\n", " \n", " \n", " 75002\n", - " 496\n", - " 121\n", - " 3.282\n", + " 811\n", + " 181\n", + " 4.571596\n", " \n", " \n", " 75003\n", - " 496\n", - " 238\n", - " 3.577\n", + " 811\n", + " 96\n", + " 4.458827\n", " \n", " \n", " 75004\n", - " 496\n", - " 243\n", - " 1.930\n", + " 811\n", + " 83\n", + " 4.559237\n", " \n", " \n", "\n", @@ -536,14 +545,14 @@ ], "text/plain": [ " userID itemID prediction\n", - "75000 496 101 2.981\n", - "75001 496 471 3.196\n", - "75002 496 121 3.282\n", - "75003 496 238 3.577\n", - "75004 496 243 1.930" + "75000 811 755 4.090273\n", + "75001 811 287 4.557071\n", + "75002 811 181 4.571596\n", + "75003 811 96 4.458827\n", + "75004 811 83 4.559237" ] }, - "execution_count": 12, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -569,22 +578,22 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "RMSE:\t\t0.957953\n", - "MAE:\t\t0.754764\n", - "rsquared:\t0.286992\n", - "exp var:\t0.287030\n", + "RMSE:\t\t0.948771\n", + "MAE:\t\t0.747003\n", + "rsquared:\t0.288045\n", + "exp var:\t0.288157\n", "----\n", - "MAP:\t0.013018\n", - "NDCG:\t0.099960\n", - "Precision@K:\t0.095122\n", - "Recall@K:\t0.032043\n" + "MAP:\t\t0.015624\n", + "NDCG:\t\t0.110465\n", + "Precision@K:\t0.100425\n", + "Recall@K:\t0.035267\n" ] } ], @@ -594,11 +603,10 @@ "eval_rsquared = rsquared(test, predictions)\n", "eval_exp_var = exp_var(test, predictions)\n", "\n", - "k = 10\n", - "eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=k)\n", - "eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)\n", - "eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=k)\n", - "eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=k)\n", + "eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", + "eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", + "eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", + "eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", "\n", "\n", "print(\"RMSE:\\t\\t%f\" % eval_rmse,\n", @@ -608,65 +616,195 @@ "\n", "print('----')\n", "\n", - "print(\"MAP:\\t%f\" % eval_map,\n", - " \"NDCG:\\t%f\" % eval_ndcg,\n", + "print(\"MAP:\\t\\t%f\" % eval_map,\n", + " \"NDCG:\\t\\t%f\" % eval_ndcg,\n", " \"Precision@K:\\t%f\" % eval_precision,\n", " \"Recall@K:\\t%f\" % eval_recall, sep='\\n')" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": {}, "outputs": [ { - "data": {}, - "metadata": {}, + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.9487710439747563, + "encoder": "json", + "name": "rmse", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "rmse" + } + }, "output_type": "display_data" }, { - "data": {}, - "metadata": {}, + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.7470034925349859, + "encoder": "json", + "name": "mae", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "mae" + } + }, "output_type": "display_data" }, { - "data": {}, - "metadata": {}, + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.28804512193443, + "encoder": "json", + "name": "rsquared", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "rsquared" + } + }, "output_type": "display_data" }, { - "data": {}, - "metadata": {}, + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.28815720397413125, + "encoder": "json", + "name": "exp_var", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "exp_var" + } + }, "output_type": "display_data" }, { - "data": {}, - "metadata": {}, + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.015624359303961253, + "encoder": "json", + "name": "map", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "map" + } + }, "output_type": "display_data" }, { - "data": {}, - "metadata": {}, + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.1104645586650869, + "encoder": "json", + "name": "ndcg", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "ndcg" + } + }, "output_type": "display_data" }, { - "data": {}, - "metadata": {}, + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.10042462845010618, + "encoder": "json", + "name": "precision", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "precision" + } + }, "output_type": "display_data" }, { - "data": {}, - "metadata": {}, + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.03526739062158758, + "encoder": "json", + "name": "recall", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "recall" + } + }, "output_type": "display_data" }, { - "data": {}, - "metadata": {}, + "data": { + "application/scrapbook.scrap.json+json": { + "data": 2.276676100009354, + "encoder": "json", + "name": "train_time", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "train_time" + } + }, "output_type": "display_data" }, { - "data": {}, - "metadata": {}, + "data": { + "application/scrapbook.scrap.json+json": { + "data": 16.501801499980502, + "encoder": "json", + "name": "test_time", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "test_time" + } + }, "output_type": "display_data" } ], @@ -699,7 +837,7 @@ "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -713,9 +851,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.5" + "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +}