diff --git a/Projekty/Projekt2/Spytek_Zolkowski/milestone3_pres.pdf b/Projekty/Projekt2/Spytek_Zolkowski/milestone3_pres.pdf new file mode 100644 index 000000000..594038476 Binary files /dev/null and b/Projekty/Projekt2/Spytek_Zolkowski/milestone3_pres.pdf differ diff --git a/Projekty/Projekt2/Spytek_Zolkowski/notebook_final2.ipynb b/Projekty/Projekt2/Spytek_Zolkowski/notebook_final2.ipynb new file mode 100644 index 000000000..9dfdfc677 --- /dev/null +++ b/Projekty/Projekt2/Spytek_Zolkowski/notebook_final2.ipynb @@ -0,0 +1,835 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Projekt 2 - EDA\n", + "**Mikołaj Spytek, Artur Żółkowski**\n", + "\n", + "W tym projekcie zajmujemy się klasteryzacją danych dotyczących aktywności użytkowników sklepu internetowego.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from sklearn.cluster import AgglomerativeClustering\n", + "from sklearn.mixture import GaussianMixture\n", + "from sklearn.cluster import DBSCAN\n", + "from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score, adjusted_mutual_info_score, normalized_mutual_info_score\n", + "from sklearn.manifold import TSNE\n", + "import sklearn\n", + "import seaborn as sns\n", + "from sklearn.cluster import KMeans\n", + "\n", + "import random\n", + "random.seed(42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv(\"data/online_shoppers_intention.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Przygotowanie danych" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def encode(data, col, max_val):\n", + " data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n", + " data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "months = {\"Jan\": 1, \"Feb\": 2, \"Mar\": 3, \"Apr\": 4, \"May\": 5, \"June\": 6, \n", + " \"Jul\": 7, \"Aug\": 8, \"Sep\": 9, \"Oct\": 10, \"Nov\": 11, \"Dec\": 12}\n", + "data[\"Month\"] = data[\"Month\"].map(months)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = encode(data, 'Month', 12)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ax = data.plot.scatter('Month_sin', 'Month_cos').set_aspect('equal')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_vars = [\"Administrative\", \"Administrative_Duration\", \"Informational\", \"Informational_Duration\", \"ProductRelated\", \n", + " \"ProductRelated_Duration\", \"BounceRates\", \"ExitRates\", \"PageValues\", \"SpecialDay\", \"Month_sin\", \"Month_cos\"]\n", + "cat_vars = [\"OperatingSystems\", \"Browser\", \"Region\", \"VisitorType\", \"Weekend\", \"TrafficType\"]\n", + "log_vars = ['Administrative', 'Administrative_Duration', 'Informational',\n", + " 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',\n", + " 'BounceRates', 'ExitRates', 'PageValues']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import FunctionTransformer, StandardScaler, OrdinalEncoder\n", + "\n", + "scaler=StandardScaler()\n", + "\n", + "preprocessor = ColumnTransformer(\n", + " transformers= [\n", + " ('log', FunctionTransformer(np.log1p), log_vars),\n", + " ('cat', OrdinalEncoder(), cat_vars)\n", + " ],\n", + " remainder = 'passthrough'\n", + ")\n", + "transformed_data = preprocessor.fit_transform(data.drop(['Month', 'Revenue'], axis=1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transformed_data = scaler.fit_transform(transformed_data)\n", + "transformed_data = pd.DataFrame(transformed_data, columns = data.drop(['Month', 'Revenue'], axis=1).columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Klastrowania" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def count_clustering_scores(X, cluster_num, model, score_fun):\n", + " if isinstance(cluster_num, int):\n", + " cluster_num_iter = [cluster_num]\n", + " else:\n", + " cluster_num_iter = cluster_num\n", + " \n", + " scores = [] \n", + " for k in cluster_num_iter:\n", + " model_instance = model(n_clusters=k)\n", + " labels = model_instance.fit_predict(X)\n", + " wcss = score_fun(X, labels)\n", + " scores.append(wcss)\n", + " \n", + " if isinstance(cluster_num, int):\n", + " return scores[0]\n", + " else:\n", + " return scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_num_seq = range(2, 20)\n", + "davies_vec = count_clustering_scores(transformed_data, cluster_num_seq, KMeans, davies_bouldin_score)\n", + "plt.figure(figsize=(12,8))\n", + "plt.plot(cluster_num_seq, davies_vec, 'bx-')\n", + "plt.xlabel('k')\n", + "plt.ylabel('davies-bouldin score')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_num_seq = range(2, 20)\n", + "silhouette_vec = count_clustering_scores(transformed_data, cluster_num_seq, KMeans, silhouette_score)\n", + "plt.figure(figsize=(12,8))\n", + "plt.plot(cluster_num_seq, silhouette_vec, 'bx-')\n", + "plt.xlabel('k')\n", + "plt.ylabel('Silhouette score')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pierwszy przykładowy model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_km = KMeans(n_clusters = 12, random_state = 42)\n", + "labels_km = model_km.fit_predict(transformed_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transformed_data[\"cluster\"] = labels_km\n", + "data[\"cluster\"] = labels_km" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tSNE = TSNE(learning_rate = 300, random_state = 42, verbose = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tSNE_td = tSNE.fit_transform(transformed_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(12,8))\n", + "sns.scatterplot(x = tSNE_td[:,0],\n", + " y = tSNE_td[:,1], \n", + " hue = labels_km,\n", + " style = data[\"Revenue\"],\n", + " alpha=0.5,\n", + " palette=sns.color_palette(\"hls\", 12), \n", + " legend=True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(4, 3, figsize=(14, 14))\n", + "for i, feature in enumerate(num_vars):\n", + " m, n = divmod(i, 3)\n", + " sns.boxplot(x=\"cluster\", y=feature, data=data, ax = ax[m, n])\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = data.groupby(\"cluster\").agg(['sum', 'count'])\n", + "results[\"Revenue\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Porównaniue wyników różnych modeli" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "algorithms = {\n", + " \"KMeans\": KMeans(random_state=42),\n", + " \"Agglomerative - ward linkage\": AgglomerativeClustering(linkage=\"ward\"),\n", + " \"Agglomerative - single linkage\": AgglomerativeClustering(linkage=\"single\"),\n", + " \"GMM - spherical covariance\": GaussianMixture(covariance_type = \"spherical\", random_state = 42)\n", + "}\n", + "\n", + "# scores = {\n", + "# \"Silhouette\": silhouette_score(),\n", + "# \"Calinski_Harabasz\": calinski_harabasz_score(),\n", + "# \"Davies_Bouldin\": davies_bouldin_score()\n", + "# }\n", + "\n", + "\n", + "silhouette_scores = pd.DataFrame()\n", + "calinski_harabasz_scores = pd.DataFrame()\n", + "davies_bouldin_scores = pd.DataFrame()\n", + "stability_scores= pd.DataFrame()\n", + "indices = [k for k in range(len(transformed_data))]\n", + "\n", + "\n", + "for i in range (2, 13):\n", + " for name in algorithms:\n", + " model = algorithms[name]\n", + " if \"KMeans\" in name or \"Agglomerative\" in name:\n", + " model.n_clusters = i\n", + " else:\n", + " model.n_components = i\n", + " labels = model.fit_predict(transformed_data)\n", + " silhouette_scores.loc[name, i] = silhouette_score(transformed_data, labels)\n", + " calinski_harabasz_scores.loc[name, i] = calinski_harabasz_score(transformed_data, labels)\n", + " davies_bouldin_scores.loc[name, i] = davies_bouldin_score(transformed_data, labels)\n", + " stability = []\n", + " for j in range(5):\n", + " resampled = sklearn.utils.resample(indices)\n", + " resampled_pred = model.fit_predict(transformed_data.loc[resampled])\n", + " stability.append(normalized_mutual_info_score(labels[resampled], resampled_pred))\n", + " stability_scores.loc[name,i] = np.mean(stability)\n", + " print(\"Doing {} with {} clusters\".format(name, i))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "silhouette_scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(silhouette_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "calinski_harabasz_scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(calinski_harabasz_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "davies_bouldin_scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(davies_bouldin_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stability_scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(stability_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transformed_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "minPts = 38\n", + "nbrs = sklearn.neighbors.NearestNeighbors(n_neighbors=minPts).fit(transformed_data)\n", + "distances, indices = nbrs.kneighbors(transformed_data)\n", + "distanceDec = sorted(distances[:,minPts-1], reverse=True)\n", + "fig = plt.figure(figsize=(9,6))\n", + "ax1 = fig.add_subplot()\n", + "\n", + "plt.xlabel('Indeks punktu po sortowaniu')\n", + "plt.ylabel('Dystans od 37 najbliższego sąsiada')\n", + "ax1.plot(list(range(1,transformed_data.shape[0]+1)), distanceDec)\n", + "plt.xscale('log')\n", + "plt.grid(axis='y')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "db = DBSCAN(eps=4.2, min_samples=38)\n", + "\n", + "db_labels = db.fit_predict(transformed_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "set(db_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(12,8))\n", + "sns.scatterplot(x = tSNE_td[:,0],\n", + " y = tSNE_td[:,1], \n", + " hue = db_labels,\n", + " alpha=0.5,\n", + " palette=sns.color_palette(\"Set2\", 3), \n", + " legend=True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(db_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "db = DBSCAN(eps=2.2, min_samples=38)\n", + "\n", + "db_labels = db.fit_predict(transformed_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "set(db_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(12,8))\n", + "sns.scatterplot(x = tSNE_td[:,0],\n", + " y = tSNE_td[:,1], \n", + " hue = db_labels,\n", + " alpha=0.5,\n", + " palette=sns.color_palette(\"hls\", 14), \n", + " legend=True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(db_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Analiza wybranego modelu" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "km = KMeans(n_clusters=5, random_state=42)\n", + "\n", + "\n", + "labels = km.fit_predict(transformed_data)\n", + "\n", + "\n", + "transformed_data[\"cluster\"] = labels\n", + "data[\"cluster\"] = labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(12,8))\n", + "sns.scatterplot(x = tSNE_td[:,0],\n", + " y = tSNE_td[:,1], \n", + " hue = labels,\n", + " style = data[\"Revenue\"],\n", + " alpha=0.5,\n", + " palette=sns.color_palette(\"hls\", 5), \n", + " legend=True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = data.groupby(\"cluster\").agg(['sum', 'count'])\n", + "results[\"Revenue\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(4, 3, figsize=(14, 14))\n", + "for i, feature in enumerate(num_vars):\n", + " m, n = divmod(i, 3)\n", + " sns.boxplot(x=\"cluster\", y=feature, data=data, ax = ax[m, n], palette=sns.color_palette(\"hls\", 5))\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "sns.countplot(x=\"VisitorType\", hue=\"cluster\", data=data, palette=sns.color_palette(\"hls\", 5))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.countplot(x=\"Revenue\", hue=\"cluster\", data=data, palette=sns.color_palette(\"hls\", 5))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.spatial import distance\n", + "\n", + "def min_interclust_dist(X, label):\n", + " clusters = set(label)\n", + " global_min_dist = np.inf\n", + " for cluster_i in clusters:\n", + " cluster_i_idx = np.where(label == cluster_i)\n", + " for cluster_j in clusters:\n", + " if cluster_i != cluster_j:\n", + " cluster_j_idx = np.where(label == cluster_j)\n", + " interclust_min_dist = np.min(distance.cdist(X[cluster_i_idx], X[cluster_j_idx]))\n", + " global_min_dist = np.min([global_min_dist, interclust_min_dist])\n", + " return global_min_dist\n", + "\n", + "def _inclust_mean_dists(X, label):\n", + " clusters = set(label)\n", + " inclust_dist_list = []\n", + " for cluster_i in clusters:\n", + " cluster_i_idx = np.where(label == cluster_i)\n", + " inclust_dist = np.mean(distance.pdist(X[cluster_i_idx]))\n", + " inclust_dist_list.append(inclust_dist)\n", + " return inclust_dist_list\n", + "\n", + "def mean_inclust_dist(X, label):\n", + " inclust_dist_list = _inclust_mean_dists(X, label)\n", + " return np.mean(inclust_dist_list)\n", + "\n", + "def std_dev_of_inclust_dist(X, label):\n", + " inclust_dist_list = _inclust_mean_dists(X, label)\n", + " return np.std(inclust_dist_list)\n", + "\n", + "def mean_dist_to_center(X, label):\n", + " clusters = set(label)\n", + " inclust_dist_list = []\n", + " for cluster_i in clusters:\n", + " cluster_i_idx = np.where(label == cluster_i)\n", + " cluster_i_mean = np.mean(X[cluster_i_idx], axis=0, keepdims=True)\n", + " inclust_dist = np.mean(distance.cdist(X[cluster_i_idx], cluster_i_mean))\n", + " inclust_dist_list.append(inclust_dist)\n", + " return np.mean(inclust_dist_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "min_interclust_dist(transformed_data.to_numpy(), labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mean_inclust_dist(transformed_data.to_numpy(), labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "std_dev_of_inclust_dist(transformed_data.to_numpy(), labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mean_dist_to_center(transformed_data.to_numpy(), labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 9 klastrów - bonus" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "km = KMeans(n_clusters=9, random_state=42)\n", + "\n", + "\n", + "labels = km.fit_predict(transformed_data)\n", + "\n", + "\n", + "transformed_data[\"cluster\"] = labels\n", + "data[\"cluster\"] = labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(12,8))\n", + "sns.scatterplot(x = tSNE_td[:,0],\n", + " y = tSNE_td[:,1], \n", + " hue = labels,\n", + " style = data[\"Revenue\"],\n", + " alpha=0.5,\n", + " palette=sns.color_palette(\"hls\", 9), \n", + " legend=True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = data.groupby(\"cluster\").agg(['sum', 'count'])\n", + "results[\"Revenue\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(4, 3, figsize=(14, 14))\n", + "for i, feature in enumerate(num_vars):\n", + " m, n = divmod(i, 3)\n", + " sns.boxplot(x=\"cluster\", y=feature, data=data, ax = ax[m, n], palette=sns.color_palette(\"hls\", 9))\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.countplot(x=\"VisitorType\", hue=\"cluster\", data=data, palette=sns.color_palette(\"hls\", 9))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.countplot(x=\"Revenue\", hue=\"cluster\", data=data, palette=sns.color_palette(\"hls\", 9))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Projekty/Projekt2/Spytek_Zolkowski/notebook_final2.pdf b/Projekty/Projekt2/Spytek_Zolkowski/notebook_final2.pdf new file mode 100644 index 000000000..a9bd8986b Binary files /dev/null and b/Projekty/Projekt2/Spytek_Zolkowski/notebook_final2.pdf differ