diff --git a/publications/umap_paper_benchmarks/README.md b/publications/umap_paper_benchmarks/README.md new file mode 100644 index 00000000..63355cc9 --- /dev/null +++ b/publications/umap_paper_benchmarks/README.md @@ -0,0 +1,34 @@ +Datasets are not included in this repository and need to be downloaded separately. + +The necessary dependencies for reproducing the benchmarks have been captured in `conda` environment yaml files. + +To install dependencies for cuml and UMAP-learn benchmarks: +``` +conda env create --name cuml_umap_benchmarks -f conda/umap_paper_cuml_cuda10.2.yml +``` + +To install dependencies for GPUMAP benchmarks: +``` +conda env create --name gpumap_benchmarks -f conda/umap_paper_gpumap_cuda10.0.yml +``` + +You can run the notebooks using jupyter lab after activating the matching environment (`cuml_umap_benchmarks` shown here; use `gpumap_benchmarks` for the GPUMAP benchmarks): +``` +conda activate cuml_umap_benchmarks +python -m ipykernel install --user +jupyter lab +``` + +# Datasets + +- PEN Digits - Uses sklearn.datasets.load_digits +- GoogleNews Word2Vec - Downloaded from https://code.google.com/archive/p/word2vec/ and loaded using Gensim library +- Fashion MNIST - Downloaded from https://github.com/zalandoresearch/fashion-mnist +- CIFAR-100 - Downloaded from https://www.cs.toronto.edu/~kriz/cifar.html +- Shuttle - Downloaded from https://archive.ics.uci.edu/ml/datasets/Statlog+(Shuttle) +- MNIST - Uses datasets submodule to download and load +- TASIC2018 - Data from https://portal.brain-map.org/atlases-and-data/rnaseq (see dedicated notebook) +- scRNA - Dataset downloaded from https://cells.ucsc.edu/ +- COIL-20 - Uses datasets submodule to download and load + + diff --git a/publications/umap_paper_benchmarks/__init__.py b/publications/umap_paper_benchmarks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/publications/umap_paper_benchmarks/conda/umap_paper_cuml_cuda10.2.yml b/publications/umap_paper_benchmarks/conda/umap_paper_cuml_cuda10.2.yml new file mode 100644 index 00000000..edc3a4ba --- /dev/null +++ 
b/publications/umap_paper_benchmarks/conda/umap_paper_cuml_cuda10.2.yml @@ -0,0 +1,37 @@ +channels: + - rapidsai-nightly + - nvidia + - conda-forge + - facebook + - pytorch + - defaults +dependencies: + - python=3.7 + - cudatoolkit=10.2 + - cudf + - dask-cuda + - dask-cudf + - cuml + - cugraph + - opencv + - scikit-image + - chainer + - scipy + - gensim + - ucx-py + - joblib + - matplotlib + - umap-learn + - numba + - ucx-proc=*=gpu + - scikit-learn + - cupy + - ipykernel + - jupyterlab + - pip + - pip: + - jupyter-server-proxy + - git+https://github.com/dask/dask.git + - git+https://github.com/dask/distributed.git + - git+https://github.com/overshiki/datasets.git + - wget diff --git a/publications/umap_paper_benchmarks/conda/umap_paper_gpumap_cuda10.0.yml b/publications/umap_paper_benchmarks/conda/umap_paper_gpumap_cuda10.0.yml new file mode 100644 index 00000000..a8f67bb8 --- /dev/null +++ b/publications/umap_paper_benchmarks/conda/umap_paper_gpumap_cuda10.0.yml @@ -0,0 +1,39 @@ +channels: + - rapidsai-nightly + - nvidia + - conda-forge + - facebook + - pytorch + - defaults +dependencies: + - python=3.7 + - cudatoolkit=10.0 + - cudf + - dask-cuda + - dask-cudf + - cuml + - cugraph + - opencv + - scikit-image + - chainer + - scipy + - ucx-py + - joblib + - matplotlib + - umap-learn + - numba + - ucx-proc=*=gpu + - scikit-learn + - cupy + - ipykernel + - jupyterlab + - pip + - pip: + - jupyter-server-proxy + - faiss==1.5.3 + - faiss-gpu==1.5.3 + - git+https://github.com/dask/dask.git + - git+https://github.com/dask/distributed.git + - git+https://github.com/p3732/gpumap.git + - git+https://github.com/overshiki/datasets.git + - wget diff --git a/publications/umap_paper_benchmarks/notebooks/__init__.py b/publications/umap_paper_benchmarks/notebooks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/publications/umap_paper_benchmarks/notebooks/benchmarks_to_csv.ipynb b/publications/umap_paper_benchmarks/notebooks/benchmarks_to_csv.ipynb new file 
mode 100644 index 00000000..daf22d10 --- /dev/null +++ b/publications/umap_paper_benchmarks/notebooks/benchmarks_to_csv.ipynb @@ -0,0 +1,880 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle \n", + "import pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "benchmarks = pickle.load(open(\"results/results.pickle\", \"rb\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "algos = [\"umapcuml\", \"umaplearn\", \"umapgpumap\"]\n", + "types = [\"unsupervised\", \"supervised\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "datasets = list(benchmarks.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(datasets)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['digits', 'fashion_mnist', 'cifar100', 'coil20', 'shuttle', 'mnist', 'scrna']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'digits': {'umapcuml': [{'unsupervised': {'time': 0.4452664852142334,\n", + " 'trust': 0.9876636433389232},\n", + " 'supervised': {'time': 0.47392821311950684, 'trust': 0.9879378905549919},\n", + " 'xform': {'time': 0.15683841705322266, 'trust': 0.9847805133487961}},\n", + " {'unsupervised': {'time': 0.18512916564941406, 'trust': 0.9874644094493787},\n", + " 'supervised': {'time': 0.2050032615661621, 'trust': 
0.9864168160345743},\n", + " 'xform': {'time': 0.42592787742614746, 'trust': 0.9856609318173406}},\n", + " {'unsupervised': {'time': 0.44254136085510254, 'trust': 0.9869448789027268},\n", + " 'supervised': {'time': 0.4717979431152344, 'trust': 0.9879864913274599},\n", + " 'xform': {'time': 0.14838576316833496, 'trust': 0.9858305953573715}},\n", + " {'unsupervised': {'time': 0.36017560958862305, 'trust': 0.9863258882554476},\n", + " 'supervised': {'time': 0.474459171295166, 'trust': 0.9875988980757733},\n", + " 'xform': {'time': 0.15300321578979492, 'trust': 0.9862826829215756}}],\n", + " 'umaplearn': [{'unsupervised': {'time': 9.263211965560913,\n", + " 'trust': 0.9871246641182629},\n", + " 'supervised': {'time': 7.278768062591553, 'trust': 0.9874444797866585},\n", + " 'xform': {'time': 6.69830322265625, 'trust': 0.9876964551341049}},\n", + " {'unsupervised': {'time': 5.6170947551727295, 'trust': 0.9879530939808027},\n", + " 'supervised': {'time': 6.764050483703613, 'trust': 0.9872420044514459},\n", + " 'xform': {'time': 6.6580493450164795, 'trust': 0.9871366260984057}},\n", + " {'unsupervised': {'time': 5.205284118652344, 'trust': 0.9875342573753868},\n", + " 'supervised': {'time': 6.607924222946167, 'trust': 0.9872283276419821},\n", + " 'xform': {'time': 6.466777563095093, 'trust': 0.9877209855584185}},\n", + " {'unsupervised': {'time': 5.228588342666626, 'trust': 0.987210531059637},\n", + " 'supervised': {'time': 6.372365951538086, 'trust': 0.9876400330669284},\n", + " 'xform': {'time': 6.500810861587524, 'trust': 0.9877023942990708}}],\n", + " 'umapgpumap': [{'unsupervised': {'time': 4.263961553573608,\n", + " 'trust': 0.951669271325105},\n", + " 'supervised': {'time': 3.118447780609131, 'trust': 0.9531886310162015}},\n", + " {'unsupervised': {'time': 1.9332420825958252, 'trust': 0.9558078864163978},\n", + " 'supervised': {'time': 2.453421115875244, 'trust': 0.9512448510681191}},\n", + " {'unsupervised': {'time': 1.8781499862670898, 'trust': 
0.9556659738337958},\n", + " 'supervised': {'time': 2.3237812519073486, 'trust': 0.9522937410611908}},\n", + " {'unsupervised': {'time': 1.8561744689941406, 'trust': 0.9461403688179619},\n", + " 'supervised': {'time': 2.3167366981506348, 'trust': 0.9548070534694238}}]},\n", + " 'fashion_mnist': {'umapgpumap': [{'unsupervised': {'time': 6.479771852493286,\n", + " 'trust': 0.9745745792397836},\n", + " 'supervised': {'time': 6.876313924789429, 'trust': 0.9676141851918549}},\n", + " {'unsupervised': {'time': 3.3777499198913574, 'trust': 0.9750927790516971},\n", + " 'supervised': {'time': 6.480828523635864, 'trust': 0.9696567894031254}},\n", + " {'unsupervised': {'time': 3.3025641441345215, 'trust': 0.9748744066270485},\n", + " 'supervised': {'time': 6.351450681686401, 'trust': 0.9664509547458497}},\n", + " {'unsupervised': {'time': 3.4731950759887695, 'trust': 0.9748966109477026},\n", + " 'supervised': {'time': 6.198164224624634, 'trust': 0.9668087698207842}}],\n", + " 'umapcuml': [{'unsupervised': {'time': 0.589625358581543,\n", + " 'trust': 0.97601825773189},\n", + " 'supervised': {'time': 1.0603547096252441, 'trust': 0.9757726834610587},\n", + " 'xform': {'time': 0.44457364082336426, 'trust': 0.9736058731016267}},\n", + " {'unsupervised': {'time': 0.40508532524108887, 'trust': 0.9739971370030214},\n", + " 'supervised': {'time': 1.025468349456787, 'trust': 0.9762932845375539},\n", + " 'xform': {'time': 0.45213890075683594, 'trust': 0.969764299951311}},\n", + " {'unsupervised': {'time': 0.41647815704345703, 'trust': 0.9773246413616931},\n", + " 'supervised': {'time': 1.0376453399658203, 'trust': 0.9755241428390393},\n", + " 'xform': {'time': 0.46546101570129395, 'trust': 0.9742983522553708}},\n", + " {'unsupervised': {'time': 0.4108717441558838, 'trust': 0.974691792331139},\n", + " 'supervised': {'time': 1.0243778228759766, 'trust': 0.9758944423768267},\n", + " 'xform': {'time': 0.4740877151489258, 'trust': 0.9738181965837619}}],\n", + " 'umaplearn': [{'unsupervised': 
{'time': 51.31654953956604,\n", + " 'trust': 0.9768837692805357},\n", + " 'supervised': {'time': 52.09980010986328, 'trust': 0.9775639865804937},\n", + " 'xform': {'time': 0.261110782623291, 'trust': 0.9782571167003489}},\n", + " {'unsupervised': {'time': 44.576194524765015, 'trust': 0.9777756288426009},\n", + " 'supervised': {'time': 52.18143153190613, 'trust': 0.9772261758275473},\n", + " 'xform': {'time': 0.2587544918060303, 'trust': 0.9785438356844148}},\n", + " {'unsupervised': {'time': 44.49176001548767, 'trust': 0.9783708581947695},\n", + " 'supervised': {'time': 50.79117822647095, 'trust': 0.9771812216019369},\n", + " 'xform': {'time': 0.2576425075531006, 'trust': 0.9768428079316486}},\n", + " {'unsupervised': {'time': 43.096691846847534, 'trust': 0.9781400324149109},\n", + " 'supervised': {'time': 57.29015898704529, 'trust': 0.9781969233047202},\n", + " 'xform': {'time': 0.2584860324859619, 'trust': 0.9787408533525421}}]},\n", + " 'cifar100': {'umaplearn': [{'unsupervised': {'time': 106.26967883110046,\n", + " 'trust': 0.8441760104151036},\n", + " 'supervised': {'time': 99.81106400489807, 'trust': 0.8474044828110034},\n", + " 'xform': {'time': 99.87579226493835, 'trust': 0.8498454070541855}},\n", + " {'unsupervised': {'time': 106.59346199035645, 'trust': 0.844329741901062},\n", + " 'supervised': {'time': 97.1928915977478, 'trust': 0.8490670250262601},\n", + " 'xform': {'time': 98.72296786308289, 'trust': 0.8435752178705541}},\n", + " {'unsupervised': {'time': 107.33383107185364, 'trust': 0.8472284890319066},\n", + " 'supervised': {'time': 100.0251407623291, 'trust': 0.8463326112046655},\n", + " 'xform': {'time': 100.32673573493958, 'trust': 0.8508993518441328}},\n", + " {'unsupervised': {'time': 103.20404744148254, 'trust': 0.8423912181521435},\n", + " 'supervised': {'time': 96.66665291786194, 'trust': 0.8450828778624399},\n", + " 'xform': {'time': 100.74139332771301, 'trust': 0.8453433263121233}}],\n", + " 'umapcuml': [{'unsupervised': {'time': 
0.813828706741333,\n", + " 'trust': 0.827138247643078},\n", + " 'supervised': {'time': 1.107100248336792, 'trust': 0.834695580398412},\n", + " 'xform': {'time': 0.991875410079956, 'trust': 0.8296861796281167}},\n", + " {'unsupervised': {'time': 1.0972464084625244, 'trust': 0.8342801452408608},\n", + " 'supervised': {'time': 1.087634563446045, 'trust': 0.8380606268143529},\n", + " 'xform': {'time': 1.041929006576538, 'trust': 0.8325708448609004}},\n", + " {'unsupervised': {'time': 0.9525892734527588, 'trust': 0.8270282343602084},\n", + " 'supervised': {'time': 1.0747671127319336, 'trust': 0.8382352637585149},\n", + " 'xform': {'time': 1.0186042785644531, 'trust': 0.8290310706696826}},\n", + " {'unsupervised': {'time': 1.1712286472320557, 'trust': 0.825273720981932},\n", + " 'supervised': {'time': 1.0567560195922852, 'trust': 0.8315560307932855},\n", + " 'xform': {'time': 0.9979662895202637, 'trust': 0.8287983224865828}}],\n", + " 'umapgpumap': [{'unsupervised': {'time': 8.464476108551025,\n", + " 'trust': 0.8346008593479353},\n", + " 'supervised': {'time': 6.0653767585754395, 'trust': 0.8309824029952222}},\n", + " {'unsupervised': {'time': 5.399772882461548, 'trust': 0.8401050404506913},\n", + " 'supervised': {'time': 5.690048456192017, 'trust': 0.8296486555401793}},\n", + " {'unsupervised': {'time': 5.15649676322937, 'trust': 0.8347203629909984},\n", + " 'supervised': {'time': 6.0474982261657715, 'trust': 0.8307973040035718}},\n", + " {'unsupervised': {'time': 5.723386287689209, 'trust': 0.8329430399430893},\n", + " 'supervised': {'time': 6.012655973434448, 'trust': 0.8284376570463122}}]},\n", + " 'coil20': {'umapcuml': [{'unsupervised': {'time': 2.064183473587036,\n", + " 'trust': 0.9921860542617423},\n", + " 'supervised': {'time': 0.394045352935791, 'trust': 0.9866257285867378},\n", + " 'xform': {'time': 0.17340397834777832, 'trust': 0.9927199482474712}},\n", + " {'unsupervised': {'time': 0.22162199020385742, 'trust': 0.9926057267571029},\n", + " 'supervised': 
{'time': 0.23158931732177734, 'trust': 0.9862040957683159},\n", + " 'xform': {'time': 0.37630367279052734, 'trust': 0.9909749666745079}},\n", + " {'unsupervised': {'time': 0.4388558864593506, 'trust': 0.9909937857758959},\n", + " 'supervised': {'time': 0.43929457664489746, 'trust': 0.986986493374108},\n", + " 'xform': {'time': 0.1761476993560791, 'trust': 0.9907711910922921}},\n", + " {'unsupervised': {'time': 0.30503273010253906, 'trust': 0.9927721908570533},\n", + " 'supervised': {'time': 0.4133152961730957, 'trust': 0.9864624010036854},\n", + " 'xform': {'time': 0.18416094779968262, 'trust': 0.9922615920437021}}],\n", + " 'umapgpumap': [{'unsupervised': {'time': 2.5237808227539062,\n", + " 'trust': 0.9562308541781019},\n", + " 'supervised': {'time': 8.3171067237854, 'trust': 0.9332845866332105}},\n", + " {'unsupervised': {'time': 2.702404499053955, 'trust': 0.9561813560208056},\n", + " 'supervised': {'time': 8.395845174789429, 'trust': 0.932599486395358}},\n", + " {'unsupervised': {'time': 2.562446117401123, 'trust': 0.9565478057450535},\n", + " 'supervised': {'time': 7.961192607879639, 'trust': 0.9290572806398495}},\n", + " {'unsupervised': {'time': 2.5401978492736816, 'trust': 0.9438459578138477},\n", + " 'supervised': {'time': 8.168630838394165, 'trust': 0.9308458794009253}}],\n", + " 'umaplearn': [{'unsupervised': {'time': 13.982086896896362,\n", + " 'trust': 0.9936500235238768},\n", + " 'supervised': {'time': 12.38850736618042, 'trust': 0.9868479965498315},\n", + " 'xform': {'time': 12.405791997909546, 'trust': 0.9872499934655898}},\n", + " {'unsupervised': {'time': 10.334022283554077, 'trust': 0.9923627447136621},\n", + " 'supervised': {'time': 12.462889194488525, 'trust': 0.986517094017094},\n", + " 'xform': {'time': 12.799014806747437, 'trust': 0.9875311037925717}},\n", + " {'unsupervised': {'time': 10.396621227264404, 'trust': 0.9921312959042317},\n", + " 'supervised': {'time': 12.415169477462769, 'trust': 0.9855754528346271},\n", + " 'xform': {'time': 
12.686092615127563, 'trust': 0.9872973025954678}},\n", + " {'unsupervised': {'time': 10.12751817703247, 'trust': 0.9892843840664941},\n", + " 'supervised': {'time': 12.08709454536438, 'trust': 0.9860139444313756},\n", + " 'xform': {'time': 11.974581480026245, 'trust': 0.9857327034161897}}]},\n", + " 'shuttle': {'umapcuml': [{'unsupervised': {'time': 0.7837574481964111,\n", + " 'trust': 0.4907984035370634},\n", + " 'supervised': {'time': 0.5476815700531006, 'trust': 0.9999999959879131},\n", + " 'xform': {'time': 0.4578070640563965, 'trust': 0.9999987639175375}},\n", + " {'unsupervised': {'time': 0.5676589012145996, 'trust': 0.9999980015103526},\n", + " 'supervised': {'time': 0.4594261646270752, 'trust': 0.9999976843618024},\n", + " 'xform': {'time': 0.2353208065032959, 'trust': 0.9999980647629775}},\n", + " {'unsupervised': {'time': 0.34301042556762695, 'trust': 0.9999991344960172},\n", + " 'supervised': {'time': 0.4870455265045166, 'trust': 0.9999997347595531},\n", + " 'xform': {'time': 0.44645023345947266, 'trust': 0.9999988579663878}},\n", + " {'unsupervised': {'time': 0.6358237266540527, 'trust': 0.9999971075067181},\n", + " 'supervised': {'time': 0.7296688556671143, 'trust': 0.9999981510643496},\n", + " 'xform': {'time': 0.45848917961120605, 'trust': 0.9999989773328898}}],\n", + " 'umaplearn': [{'unsupervised': {'time': 43.14485430717468,\n", + " 'trust': 0.9995937495025617},\n", + " 'supervised': {'time': 51.45029878616333, 'trust': 1.0},\n", + " 'xform': {'time': 61.346752882003784, 'trust': 0.9999996712025631}},\n", + " {'unsupervised': {'time': 36.14379930496216, 'trust': 0.9999990541436014},\n", + " 'supervised': {'time': 55.80541253089905, 'trust': 0.9999988794103004},\n", + " 'xform': {'time': 60.803152561187744, 'trust': 1.0}},\n", + " {'unsupervised': {'time': 39.73343586921692, 'trust': 1.0},\n", + " 'supervised': {'time': 49.26429724693298, 'trust': 0.7057494722624743},\n", + " 'xform': {'time': 48.41824674606323, 'trust': 0.6532779119313101}},\n", + 
" {'unsupervised': {'time': 36.481457471847534, 'trust': 0.7662410042817691},\n", + " 'supervised': {'time': 44.17238092422485, 'trust': 1.0},\n", + " 'xform': {'time': 50.555405616760254, 'trust': 1.0}}],\n", + " 'umapgpumap': [{'unsupervised': {'time': 12.086997747421265,\n", + " 'trust': 0.9732011740073436},\n", + " 'supervised': {'time': 19.82458734512329, 'trust': 0.9668076059384338}},\n", + " {'unsupervised': {'time': 8.946580171585083, 'trust': 0.9763382049528245},\n", + " 'supervised': {'time': 12.691536664962769, 'trust': 0.9639454594176171}},\n", + " {'unsupervised': {'time': 7.189095497131348, 'trust': 0.9778424900551579},\n", + " 'supervised': {'time': 23.83462929725647, 'trust': 0.9640114737970411}},\n", + " {'unsupervised': {'time': 8.034754753112793, 'trust': 0.9698157821284997},\n", + " 'supervised': {'time': 12.253328800201416, 'trust': 0.9658356159126269}}]},\n", + " 'mnist': {'umapcuml': [{'unsupervised': {'time': 0.8702592849731445,\n", + " 'trust': 0.9559595178151625},\n", + " 'supervised': {'time': 1.011225700378418, 'trust': 0.9553705150307618},\n", + " 'xform': {'time': 0.4377431869506836, 'trust': 0.954108408575139}},\n", + " {'unsupervised': {'time': 0.6537017822265625, 'trust': 0.9574214731202516},\n", + " 'supervised': {'time': 0.8979723453521729, 'trust': 0.9560532148990446},\n", + " 'xform': {'time': 0.4433917999267578, 'trust': 0.9531227299909409}},\n", + " {'unsupervised': {'time': 0.6645529270172119, 'trust': 0.956424851544944},\n", + " 'supervised': {'time': 0.8808555603027344, 'trust': 0.9563548094547354},\n", + " 'xform': {'time': 0.4418752193450928, 'trust': 0.952863784209873}},\n", + " {'unsupervised': {'time': 0.642711877822876, 'trust': 0.9567874069133908},\n", + " 'supervised': {'time': 0.879960298538208, 'trust': 0.9574419861872977},\n", + " 'xform': {'time': 0.44603633880615234, 'trust': 0.954058174077841}}],\n", + " 'umaplearn': [{'unsupervised': {'time': 52.32333850860596,\n", + " 'trust': 0.9575792140876225},\n", + " 
'supervised': {'time': 88.2114098072052, 'trust': 0.9563557643207674},\n", + " 'xform': {'time': 85.75131511688232, 'trust': 0.9566137627757307}},\n", + " {'unsupervised': {'time': 51.839635133743286, 'trust': 0.9594030397578331},\n", + " 'supervised': {'time': 92.17171335220337, 'trust': 0.9564347384460339},\n", + " 'xform': {'time': 94.09139847755432, 'trust': 0.9575536319478022}},\n", + " {'unsupervised': {'time': 54.40650486946106, 'trust': 0.9590946155285452},\n", + " 'supervised': {'time': 90.77351665496826, 'trust': 0.9569490322586621},\n", + " 'xform': {'time': 88.45775318145752, 'trust': 0.9581549449881714}},\n", + " {'unsupervised': {'time': 51.73101210594177, 'trust': 0.9590602315702501},\n", + " 'supervised': {'time': 85.51447701454163, 'trust': 0.9584785717302744},\n", + " 'xform': {'time': 101.88577651977539, 'trust': 0.9568694395073667}}],\n", + " 'umapgpumap': [{'unsupervised': {'time': 10.581817388534546,\n", + " 'trust': 0.9442834214597077},\n", + " 'supervised': {'time': 25.554595947265625, 'trust': 0.9450770215619692}},\n", + " {'unsupervised': {'time': 10.227915525436401, 'trust': 0.9419907738521989},\n", + " 'supervised': {'time': 25.033493280410767, 'trust': 0.9447635175706243}},\n", + " {'unsupervised': {'time': 9.917372941970825, 'trust': 0.9436392852630545},\n", + " 'supervised': {'time': 25.709108352661133, 'trust': 0.9469258037618123}},\n", + " {'unsupervised': {'time': 11.701467275619507, 'trust': 0.9397684622623858},\n", + " 'supervised': {'time': 19.324782371520996, 'trust': 0.9448813560198076}}]},\n", + " 'scrna': {'umapcuml': [{'unsupervised': {'time': 3.8234424591064453,\n", + " 'trust': 0.6178009579665475},\n", + " 'xform': {'time': 3.4701781272888184, 'trust': 0.5337737424809548}},\n", + " {'unsupervised': {'time': 3.8994381427764893, 'trust': 0.6201815623705682},\n", + " 'xform': {'time': 3.9311397075653076, 'trust': 0.9708745134888634}},\n", + " {'unsupervised': {'time': 4.300644397735596, 'trust': 0.6187572904540646},\n", + " 
'xform': {'time': 3.5282957553863525, 'trust': 0.8702917658612157}},\n", + " {'unsupervised': {'time': 4.389519453048706, 'trust': 0.9781377849332045},\n", + " 'xform': {'time': 3.602308988571167, 'trust': 0.6201129287721822}}],\n", + " 'umapgpumap': [{'unsupervised': {'time': 13.082006216049194,\n", + " 'trust': 0.6168243354282248}},\n", + " {'unsupervised': {'time': 10.061619758605957, 'trust': 0.9434946335833367}},\n", + " {'unsupervised': {'time': 10.218225479125977, 'trust': 0.6203727995014007}},\n", + " {'unsupervised': {'time': 10.199477195739746,\n", + " 'trust': 0.9425912295012873}}],\n", + " 'umaplearn': [{'unsupervised': {'time': 222.38141894340515,\n", + " 'trust': 0.6233079229256522},\n", + " 'xform': {'time': 2.1147098541259766, 'trust': 0.9412946245569216}},\n", + " {'unsupervised': {'time': 225.683984041214, 'trust': 0.6231010898670823},\n", + " 'xform': {'time': 2.2269630432128906, 'trust': 0.9502004104337913}},\n", + " {'unsupervised': {'time': 227.73396110534668, 'trust': 0.6238778233428524},\n", + " 'xform': {'time': 2.115278482437134, 'trust': 0.9481981978106048}},\n", + " {'unsupervised': {'time': 219.88730216026306, 'trust': 0.623651432003607},\n", + " 'xform': {'time': 2.3570568561553955, 'trust': 0.9415185740893797}}]}}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "final_results = {}\n", + "\n", + "for dataset in datasets:\n", + " d = benchmarks[dataset]\n", + " final_results[dataset] = {}\n", + " for algo in algos:\n", + " if algo in d:\n", + " a = d[algo]\n", + " summary = {}\n", + " for bench in a:\n", + " for t in types:\n", + " if t not in summary:\n", + " summary[t] = { \"sum\": 0, \"count\": 0, \"sum_squared\": 0, \"trust\": 0}\n", + " if t in bench:\n", + " time = bench[t][\"time\"]\n", + " summary[t][\"sum\"] += time\n", + " 
summary[t][\"count\"] += 1\n", + " summary[t][\"sum_squared\"] += time**2\n", + " summary[t][\"trust\"] = max(summary[t][\"trust\"], bench[t][\"trust\"])\n", + "\n", + " final_results[dataset][algo] = {}\n", + " for t in types:\n", + " if summary[t][\"count\"] > 0:\n", + " mean = (summary[t][\"sum\"]) / summary[t][\"count\"]\n", + " var = ((summary[t][\"sum_squared\"]) / summary[t][\"count\"]) - (mean**2)\n", + " trust = summary[t][\"trust\"]\n", + " else:\n", + " mean = 0\n", + " var = 0\n", + " trust = 0\n", + " final_results[dataset][algo][t] = {\"mean\": mean, \"var\": var, \"trust\": trust}\n", + " \n", + " else:\n", + " print(algo + \" not in \" + dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'digits': {'umapcuml': {'unsupervised': {'mean': 0.35827815532684326,\n", + " 'var': 0.011162857575371277,\n", + " 'trust': 0.9876636433389232},\n", + " 'supervised': {'mean': 0.40629714727401733,\n", + " 'var': 0.013507401316314116,\n", + " 'trust': 0.9879864913274599}},\n", + " 'umaplearn': {'unsupervised': {'mean': 6.328544795513153,\n", + " 'var': 2.8975128262992627,\n", + " 'trust': 0.9879530939808027},\n", + " 'supervised': {'mean': 6.755777180194855,\n", + " 'var': 0.1106131444620786,\n", + " 'trust': 0.9876400330669284}},\n", + " 'umapgpumap': {'unsupervised': {'mean': 2.482882022857666,\n", + " 'var': 1.0582028882037378,\n", + " 'trust': 0.9558078864163978},\n", + " 'supervised': {'mean': 2.5530967116355896,\n", + " 'var': 0.10950217290617204,\n", + " 'trust': 0.9548070534694238}}},\n", + " 'fashion_mnist': {'umapcuml': {'unsupervised': {'mean': 0.45551514625549316,\n", + " 'var': 0.006011408943862762,\n", + " 'trust': 0.9773246413616931},\n", + " 'supervised': {'mean': 1.036961555480957,\n", + " 'var': 0.0002095378332569453,\n", + " 'trust': 0.9762932845375539}},\n", + " 'umaplearn': {'unsupervised': {'mean': 45.870298981666565,\n", + " 'var': 10.23240442609631,\n", + 
" 'trust': 0.9783708581947695},\n", + " 'supervised': {'mean': 53.09064221382141,\n", + " 'var': 6.182976974198482,\n", + " 'trust': 0.9781969233047202}},\n", + " 'umapgpumap': {'unsupervised': {'mean': 4.158320248126984,\n", + " 'var': 1.800035649938632,\n", + " 'trust': 0.9750927790516971},\n", + " 'supervised': {'mean': 6.476689338684082,\n", + " 'var': 0.06324447576008652,\n", + " 'trust': 0.9696567894031254}}},\n", + " 'cifar100': {'umapcuml': {'unsupervised': {'mean': 1.008723258972168,\n", + " 'var': 0.018844815007184934,\n", + " 'trust': 0.8342801452408608},\n", + " 'supervised': {'mean': 1.081564486026764,\n", + " 'var': 0.0003376463217925618,\n", + " 'trust': 0.8382352637585149}},\n", + " 'umaplearn': {'unsupervised': {'mean': 105.85025483369827,\n", + " 'var': 2.482921346083458,\n", + " 'trust': 0.8472284890319066},\n", + " 'supervised': {'mean': 98.42393732070923,\n", + " 'var': 2.272873735988469,\n", + " 'trust': 0.8490670250262601}},\n", + " 'umapgpumap': {'unsupervised': {'mean': 6.186033010482788,\n", + " 'var': 1.7708737036427635,\n", + " 'trust': 0.8401050404506913},\n", + " 'supervised': {'mean': 5.953894853591919,\n", + " 'var': 0.023564399280502357,\n", + " 'trust': 0.8309824029952222}}},\n", + " 'coil20': {'umapcuml': {'unsupervised': {'mean': 0.7574235200881958,\n", + " 'var': 0.5752119048886044,\n", + " 'trust': 0.9927721908570533},\n", + " 'supervised': {'mean': 0.3695611357688904,\n", + " 'var': 0.006603219726240894,\n", + " 'trust': 0.986986493374108}},\n", + " 'umaplearn': {'unsupervised': {'mean': 11.210062146186829,\n", + " 'var': 2.571288658299423,\n", + " 'trust': 0.9936500235238768},\n", + " 'supervised': {'mean': 12.338415145874023,\n", + " 'var': 0.02176407274296821,\n", + " 'trust': 0.9868479965498315}},\n", + " 'umapgpumap': {'unsupervised': {'mean': 2.5822073221206665,\n", + " 'var': 0.005004079547958895,\n", + " 'trust': 0.9565478057450535},\n", + " 'supervised': {'mean': 8.210693836212158,\n", + " 'var': 
0.02740621988587577,\n", + " 'trust': 0.9332845866332105}}},\n", + " 'shuttle': {'umapcuml': {'unsupervised': {'mean': 0.5825626254081726,\n", + " 'var': 0.025230869766499353,\n", + " 'trust': 0.9999991344960172},\n", + " 'supervised': {'mean': 0.5559555292129517,\n", + " 'var': 0.011077821222144735,\n", + " 'trust': 0.9999999959879131}},\n", + " 'umaplearn': {'unsupervised': {'mean': 38.87588673830032,\n", + " 'var': 8.039266967871527,\n", + " 'trust': 1.0},\n", + " 'supervised': {'mean': 50.173097372055054,\n", + " 'var': 17.54718326385637,\n", + " 'trust': 1.0}},\n", + " 'umapgpumap': {'unsupervised': {'mean': 9.064357042312622,\n", + " 'var': 3.431728740054808,\n", + " 'trust': 0.9778424900551579},\n", + " 'supervised': {'mean': 17.151020526885986,\n", + " 'var': 23.923241572835195,\n", + " 'trust': 0.9668076059384338}}},\n", + " 'mnist': {'umapcuml': {'unsupervised': {'mean': 0.7078064680099487,\n", + " 'var': 0.008856602310757467,\n", + " 'trust': 0.9574214731202516},\n", + " 'supervised': {'mean': 0.9175034761428833,\n", + " 'var': 0.0029794700764256277,\n", + " 'trust': 0.9574419861872977}},\n", + " 'umaplearn': {'unsupervised': {'mean': 52.57512265443802,\n", + " 'var': 1.1677051461933843,\n", + " 'trust': 0.9594030397578331},\n", + " 'supervised': {'mean': 89.16777920722961,\n", + " 'var': 6.465818109702013,\n", + " 'trust': 0.9584785717302744}},\n", + " 'umapgpumap': {'unsupervised': {'mean': 10.60714328289032,\n", + " 'var': 0.45444580430584836,\n", + " 'trust': 0.9442834214597077},\n", + " 'supervised': {'mean': 23.90549498796463,\n", + " 'var': 7.056965841371607,\n", + " 'trust': 0.9469258037618123}}},\n", + " 'scrna': {'umapcuml': {'unsupervised': {'mean': 4.103261113166809,\n", + " 'var': 0.06018657014949724,\n", + " 'trust': 0.9781377849332045},\n", + " 'supervised': {'mean': 0, 'var': 0, 'trust': 0}},\n", + " 'umaplearn': {'unsupervised': {'mean': 223.92166656255722,\n", + " 'var': 9.071952858837903,\n", + " 'trust': 0.6238778233428524},\n", + " 
'supervised': {'mean': 0, 'var': 0, 'trust': 0}},\n", + " 'umapgpumap': {'unsupervised': {'mean': 10.890332162380219,\n", + " 'var': 1.604801846075489,\n", + " 'trust': 0.9434946335833367},\n", + " 'supervised': {'mean': 0, 'var': 0, 'trust': 0}}}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_results" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "data = []\n", + "for dataset in datasets:\n", + " for algo in algos:\n", + " for t in types:\n", + " if algo in final_results[dataset]:\n", + " b = final_results[dataset][algo][t]\n", + " data.append((dataset, algo, t, b[\"mean\"], b[\"var\"], b[\"trust\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "\n", + "with open('benchmark_results.csv','w') as out:\n", + " csv_out=csv.writer(out)\n", + " csv_out.writerow(['dataset','impl', 'bench', 'mean', 'var', 'max_trust'])\n", + " for row in data:\n", + " csv_out.writerow(row)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('digits',\n", + " 'umapcuml',\n", + " 'unsupervised',\n", + " 0.35827815532684326,\n", + " 0.011162857575371277,\n", + " 0.9876636433389232),\n", + " ('digits',\n", + " 'umapcuml',\n", + " 'supervised',\n", + " 0.40629714727401733,\n", + " 0.013507401316314116,\n", + " 0.9879864913274599),\n", + " ('digits',\n", + " 'umaplearn',\n", + " 'unsupervised',\n", + " 6.328544795513153,\n", + " 2.8975128262992627,\n", + " 0.9879530939808027),\n", + " ('digits',\n", + " 'umaplearn',\n", + " 'supervised',\n", + " 6.755777180194855,\n", + " 0.1106131444620786,\n", + " 0.9876400330669284),\n", + " ('digits',\n", + " 'umapgpumap',\n", + " 'unsupervised',\n", + " 2.482882022857666,\n", + " 1.0582028882037378,\n", + " 0.9558078864163978),\n", + " ('digits',\n", + " 
'umapgpumap',\n", + " 'supervised',\n", + " 2.5530967116355896,\n", + " 0.10950217290617204,\n", + " 0.9548070534694238),\n", + " ('fashion_mnist',\n", + " 'umapcuml',\n", + " 'unsupervised',\n", + " 0.45551514625549316,\n", + " 0.006011408943862762,\n", + " 0.9773246413616931),\n", + " ('fashion_mnist',\n", + " 'umapcuml',\n", + " 'supervised',\n", + " 1.036961555480957,\n", + " 0.0002095378332569453,\n", + " 0.9762932845375539),\n", + " ('fashion_mnist',\n", + " 'umaplearn',\n", + " 'unsupervised',\n", + " 45.870298981666565,\n", + " 10.23240442609631,\n", + " 0.9783708581947695),\n", + " ('fashion_mnist',\n", + " 'umaplearn',\n", + " 'supervised',\n", + " 53.09064221382141,\n", + " 6.182976974198482,\n", + " 0.9781969233047202),\n", + " ('fashion_mnist',\n", + " 'umapgpumap',\n", + " 'unsupervised',\n", + " 4.158320248126984,\n", + " 1.800035649938632,\n", + " 0.9750927790516971),\n", + " ('fashion_mnist',\n", + " 'umapgpumap',\n", + " 'supervised',\n", + " 6.476689338684082,\n", + " 0.06324447576008652,\n", + " 0.9696567894031254),\n", + " ('cifar100',\n", + " 'umapcuml',\n", + " 'unsupervised',\n", + " 1.008723258972168,\n", + " 0.018844815007184934,\n", + " 0.8342801452408608),\n", + " ('cifar100',\n", + " 'umapcuml',\n", + " 'supervised',\n", + " 1.081564486026764,\n", + " 0.0003376463217925618,\n", + " 0.8382352637585149),\n", + " ('cifar100',\n", + " 'umaplearn',\n", + " 'unsupervised',\n", + " 105.85025483369827,\n", + " 2.482921346083458,\n", + " 0.8472284890319066),\n", + " ('cifar100',\n", + " 'umaplearn',\n", + " 'supervised',\n", + " 98.42393732070923,\n", + " 2.272873735988469,\n", + " 0.8490670250262601),\n", + " ('cifar100',\n", + " 'umapgpumap',\n", + " 'unsupervised',\n", + " 6.186033010482788,\n", + " 1.7708737036427635,\n", + " 0.8401050404506913),\n", + " ('cifar100',\n", + " 'umapgpumap',\n", + " 'supervised',\n", + " 5.953894853591919,\n", + " 0.023564399280502357,\n", + " 0.8309824029952222),\n", + " ('coil20',\n", + " 'umapcuml',\n", + " 
'unsupervised',\n", + " 0.7574235200881958,\n", + " 0.5752119048886044,\n", + " 0.9927721908570533),\n", + " ('coil20',\n", + " 'umapcuml',\n", + " 'supervised',\n", + " 0.3695611357688904,\n", + " 0.006603219726240894,\n", + " 0.986986493374108),\n", + " ('coil20',\n", + " 'umaplearn',\n", + " 'unsupervised',\n", + " 11.210062146186829,\n", + " 2.571288658299423,\n", + " 0.9936500235238768),\n", + " ('coil20',\n", + " 'umaplearn',\n", + " 'supervised',\n", + " 12.338415145874023,\n", + " 0.02176407274296821,\n", + " 0.9868479965498315),\n", + " ('coil20',\n", + " 'umapgpumap',\n", + " 'unsupervised',\n", + " 2.5822073221206665,\n", + " 0.005004079547958895,\n", + " 0.9565478057450535),\n", + " ('coil20',\n", + " 'umapgpumap',\n", + " 'supervised',\n", + " 8.210693836212158,\n", + " 0.02740621988587577,\n", + " 0.9332845866332105),\n", + " ('shuttle',\n", + " 'umapcuml',\n", + " 'unsupervised',\n", + " 0.5825626254081726,\n", + " 0.025230869766499353,\n", + " 0.9999991344960172),\n", + " ('shuttle',\n", + " 'umapcuml',\n", + " 'supervised',\n", + " 0.5559555292129517,\n", + " 0.011077821222144735,\n", + " 0.9999999959879131),\n", + " ('shuttle',\n", + " 'umaplearn',\n", + " 'unsupervised',\n", + " 38.87588673830032,\n", + " 8.039266967871527,\n", + " 1.0),\n", + " ('shuttle',\n", + " 'umaplearn',\n", + " 'supervised',\n", + " 50.173097372055054,\n", + " 17.54718326385637,\n", + " 1.0),\n", + " ('shuttle',\n", + " 'umapgpumap',\n", + " 'unsupervised',\n", + " 9.064357042312622,\n", + " 3.431728740054808,\n", + " 0.9778424900551579),\n", + " ('shuttle',\n", + " 'umapgpumap',\n", + " 'supervised',\n", + " 17.151020526885986,\n", + " 23.923241572835195,\n", + " 0.9668076059384338),\n", + " ('mnist',\n", + " 'umapcuml',\n", + " 'unsupervised',\n", + " 0.7078064680099487,\n", + " 0.008856602310757467,\n", + " 0.9574214731202516),\n", + " ('mnist',\n", + " 'umapcuml',\n", + " 'supervised',\n", + " 0.9175034761428833,\n", + " 0.0029794700764256277,\n", + " 
0.9574419861872977),\n", + " ('mnist',\n", + " 'umaplearn',\n", + " 'unsupervised',\n", + " 52.57512265443802,\n", + " 1.1677051461933843,\n", + " 0.9594030397578331),\n", + " ('mnist',\n", + " 'umaplearn',\n", + " 'supervised',\n", + " 89.16777920722961,\n", + " 6.465818109702013,\n", + " 0.9584785717302744),\n", + " ('mnist',\n", + " 'umapgpumap',\n", + " 'unsupervised',\n", + " 10.60714328289032,\n", + " 0.45444580430584836,\n", + " 0.9442834214597077),\n", + " ('mnist',\n", + " 'umapgpumap',\n", + " 'supervised',\n", + " 23.90549498796463,\n", + " 7.056965841371607,\n", + " 0.9469258037618123),\n", + " ('scrna',\n", + " 'umapcuml',\n", + " 'unsupervised',\n", + " 4.103261113166809,\n", + " 0.06018657014949724,\n", + " 0.9781377849332045),\n", + " ('scrna', 'umapcuml', 'supervised', 0, 0, 0),\n", + " ('scrna',\n", + " 'umaplearn',\n", + " 'unsupervised',\n", + " 223.92166656255722,\n", + " 9.071952858837903,\n", + " 0.6238778233428524),\n", + " ('scrna', 'umaplearn', 'supervised', 0, 0, 0),\n", + " ('scrna',\n", + " 'umapgpumap',\n", + " 'unsupervised',\n", + " 10.890332162380219,\n", + " 1.604801846075489,\n", + " 0.9434946335833367),\n", + " ('scrna', 'umapgpumap', 'supervised', 0, 0, 0)]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "scale_results_noknn = pickle.load(open(\"results/scale_results_precompute_knn.pickle\", \"rb\"))[\"umapcuml\"]\n", + "scale_results_knn = pickle.load(open(\"results/scale_results_with_knn.pickle\", \"rb\"))[\"umapcuml\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "xaxis = sorted([int(a) for a in scale_results_knn.keys()])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "knn_with = { int(k): 
v[0][\"unsupervised\"][\"time\"] for k,v in scale_results_knn.items() }" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "knn_without = { int(k): v[0][\"unsupervised\"][\"time\"] for k,v in scale_results_noknn.items() }" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "with open('googlenews_knn_and_without.csv','w') as out:\n", + " csv_out=csv.writer(out)\n", + " csv_out.writerow(['n_samples', 'with_knn', 'without_knn'])\n", + " for n_samples in xaxis:\n", + " csv_out.writerow((n_samples, knn_with[n_samples], knn_without[n_samples]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/publications/umap_paper_benchmarks/notebooks/umap_benchmark.ipynb b/publications/umap_paper_benchmarks/notebooks/umap_benchmark.ipynb new file mode 100644 index 00000000..28d2b410 --- /dev/null +++ b/publications/umap_paper_benchmarks/notebooks/umap_benchmark.ipynb @@ -0,0 +1,853 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# UMAP Experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"..\")\n", + "\n", + "import datasets\n", + "\n", + "from umap_bench.funcs import build_and_train\n", + "from umap_bench.funcs import draw_chart\n", + "from umap_bench.funcs import _run_build_and_train_once\n", + "from umap_bench.funcs import 
store_results\n", + "from umap_bench.funcs import maybe_load_results\n", + "from umap_bench.funcs import maybe_get_results\n", + "\n", + "from umap_bench.funcs import perform_n_samples_test\n", + "from umap_bench.funcs import perform_n_components_test\n", + "\n", + "from umap_bench import loaders\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "import pickle\n", + "import rmm\n", + "import time\n", + "import numpy as np\n", + "\n", + "from cuml.metrics import trustworthiness\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from umap import UMAP as UMAP_LEARN\n", + "from cuml.manifold import UMAP as UMAP_CUML\n", + "\n", + "import os\n", + "os.getcwd()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define the number of cores for the multi-core CPU UMAP implementation to use" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"NUMBA_NUM_THREADS\"] = \"80\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the GPUMAP project is no longer being maintained, we make a best effort to provide reproducibility of benchmarks. We make it optional so the other implementations may still be evaluated if GPUMAP is not installed. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "has_gpumap = True\n", + "try:\n", + " from gpumap import GPUMAP as UMAP_GPUMAP\n", + "except ImportError:\n", + " has_gpumap = False\n", + " \n", + "has_gpumap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "RESULTS_FILE=\"results/results.pickle\"\n", + "SCALE_RESULTS_FILE=\"results/scale_results.pickle\"\n", + "\n", + "POOL_SIZE_GB=15 # Number of GB to use for device memory pool\n", + "\n", + "TRUST_BATCH_SIZE=5000 # Number of rows to use per batch for computing trustworthiness\n", + "\n", + "KEY_UMAPCUML = \"umapcuml\"\n", + "KEY_UMAPLEARN = \"umaplearn\"\n", + "KEY_UMAPGPUMAP = \"umapgpumap\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rmm.reinitialize(\n", + " pool_allocator=True, # default is False\n", + " managed_memory=False, # default is False\n", + " initial_pool_size=int(1024*1024*1024*POOL_SIZE_GB), # pool of POOL_SIZE_GB GiB. Default is 1/2 total GPU memory\n", + " devices=0, # GPU device IDs to register. 
By default registers only GPU 0.\n", + " logging=False, # default is False -- has perf overhead\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results = maybe_load_results(RESULTS_FILE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pen Digits Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "KEY_DIGITS = \"digits\"\n", + "\n", + "X, y = loaders.load_digits()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_digits = maybe_get_results(final_results, KEY_DIGITS)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_digits[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_digits[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_digits[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results[KEY_DIGITS] = results_digits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results[KEY_DIGITS]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "store_results(final_results, RESULTS_FILE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fashion MNIST Dataset" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# https://github.com/zalandoresearch/fashion-mnist/blob/master/utils/mnist_reader.py\n", + "KEY_FASHION_MNIST = \"fashion_mnist\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train, train_labels = loaders.load_fashion_mnist('data/fashion', kind='train')\n", + "test, test_labels = loaders.load_fashion_mnist('data/fashion', kind='t10k')\n", + "X = (np.array(np.vstack([train, test]), dtype=np.float64) [:50000]/ 255.0).astype(np.float32)\n", + "y = np.array(np.hstack([train_labels, test_labels]))[:50000].astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_fashion = maybe_get_results(final_results, KEY_FASHION_MNIST)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_fashion[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_fashion[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_fashion[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results[KEY_FASHION_MNIST] = results_fashion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "store_results(final_results, RESULTS_FILE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results[KEY_FASHION_MNIST]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, 
+ "outputs": [], + "source": [ + "classes = [\n", + " 'T-shirt/top',\n", + " 'Trouser',\n", + " 'Pullover',\n", + " 'Dress',\n", + " 'Coat',\n", + " 'Sandal',\n", + " 'Shirt',\n", + " 'Sneaker',\n", + " 'Bag',\n", + " 'Ankle boot']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "draw_chart(UMAP_LEARN(n_neighbors=10, min_dist=0.01), X, y, \"Fashion MNIST\", \"UMAP-learn\", classes)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "draw_chart(UMAP_CUML(n_neighbors=10, min_dist=0.01), X, y, \"Fashion MNIST\", \"cuML UMAP\", classes)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "draw_chart(UMAP_GPUMAP(n_neighbors=10, min_dist=0.01), X, y, \"Fashion MNIST\", \"GPUUMAP\", classes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CIFAR-100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "KEY_CIFAR100 = \"cifar100\"\n", + "\n", + "train, test = loaders.load_cifar100(\"data/cifar100/cifar-100-python\")\n", + "\n", + "train, train_labels = (train[b\"data\"], train[b\"fine_labels\"])\n", + "test, test_labels = (test[b\"data\"], test[b\"fine_labels\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = (np.array(np.vstack([train, test]), dtype=np.float64) [:60000]/ 255.0).astype(np.float32)\n", + "y = np.array(np.hstack([train_labels, test_labels]))[:60000].astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_cifar100 = maybe_get_results(final_results, KEY_CIFAR100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_cifar100[KEY_UMAPLEARN] = 
build_and_train(UMAP_LEARN, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_cifar100[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_cifar100[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results[KEY_CIFAR100] = results_cifar100\n", + "store_results(final_results, RESULTS_FILE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_cifar100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Shuttle Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "KEY_SHUTTLE = \"shuttle\"\n", + "\n", + "X, y = loaders.load_shuttle(\"data/shuttle.mat\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_shuttle = maybe_get_results(final_results, KEY_SHUTTLE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_shuttle[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_shuttle[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_shuttle[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "final_results[KEY_SHUTTLE] = results_shuttle\n", + "store_results(final_results, RESULTS_FILE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_shuttle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## COIL-20 Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "KEY_COIL20 = \"coil20\"\n", + "\n", + "X, y = loaders.load_coil20(\"data/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_coil20 = maybe_get_results(final_results, KEY_COIL20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_coil20[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_coil20[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_coil20[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results[KEY_COIL20] = results_coil20\n", + "store_results(final_results, RESULTS_FILE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results[KEY_COIL20]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MNIST Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "KEY_MNIST = \"mnist\"\n", + "\n", + "X, y = loaders.load_mnist(\"data/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, 
+ "outputs": [], + "source": [ + "results_mnist = maybe_get_results(final_results, KEY_MNIST)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_mnist[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_mnist[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_mnist[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results[KEY_MNIST] = results_mnist\n", + "store_results(final_results, RESULTS_FILE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_mnist" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## scRNA\n", + "\n", + "This benchmark requires a pickle file to be output from the GPU notebook [here](https://github.com/clara-parabricks/rapids-single-cell-examples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "KEY_SCRNA = \"scrna\"\n", + "\n", + "X = pickle.load( open( \"data/scrna.pickle\", \"rb\" ) )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_scrna = maybe_get_results(final_results, KEY_SCRNA)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_scrna[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, None, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "results_scrna[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, None, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_scrna[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, None, {})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_results[KEY_SCRNA] = results_scrna\n", + "store_results(final_results, RESULTS_FILE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_scrna" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scale Benchmark\n", + "\n", + "Test UMAP variants at different `n_samples` and `n_components`. Need to download the \"GoogleNews-vectors-negative300.bin.gz\" dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = load_word2vec(\"data/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scale_results = maybe_load_results(SCALE_RESULTS_FILE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scale_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "perform_n_components_test(UMAP_CUML, X, KEY_UMAPCUML)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "store_results(scale_results, SCALE_RESULTS_FILE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "scale_results[KEY_UMAPCUML] = perform_n_samples_test(UMAP_CUML, X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ 
+ "store_results(scale_results, SCALE_RESULTS_FILE)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/publications/umap_paper_benchmarks/notebooks/umap_mnmg_scaling.ipynb b/publications/umap_paper_benchmarks/notebooks/umap_mnmg_scaling.ipynb new file mode 100644 index 00000000..8e90b76d --- /dev/null +++ b/publications/umap_paper_benchmarks/notebooks/umap_mnmg_scaling.ipynb @@ -0,0 +1,202 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## UMAP MNMG runtime on large dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from cuml.dask.manifold import UMAP as UMAP_MNMG\n", + "from cuml.manifold import UMAP\n", + "from cuml.dask.datasets import make_blobs\n", + "\n", + "from dask_cuda import LocalCUDACluster\n", + "from dask.distributed import Client, wait\n", + "\n", + "import time\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_data(args, client):\n", + " dX, _ = make_blobs(n_samples=args['n_samples'],\n", + " n_features=args['n_features'],\n", + " cluster_std=1.0,\n", + " dtype=\"float32\",\n", + " n_parts=args['n_parts'],\n", + " client=client)\n", + " n_to_sample = int(args['n_samples'] * args['sampling_ratio'])\n", + " dX = client.persist(dX)\n", + " wait(dX)\n", + " lX = dX[:n_to_sample].compute()\n", + " return lX, dX" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def 
benchmark(args):\n", + " # Start Dask-CUDA cluster & Dask client\n", + " cluster = LocalCUDACluster(n_workers=args['n_parts'], threads_per_worker=1)\n", + " client = Client(cluster)\n", + "\n", + " lX, dX = generate_data(args, client)\n", + " \n", + " # Measure runtime across n_iter runs (+1 \"warm-up test\")\n", + " durations = []\n", + " for i in range(args['n_iter'] + 1):\n", + " \n", + " # Train local model\n", + " local_model = UMAP(n_components=args['n_components'], n_neighbors=args['n_neighbors'],\n", + " n_epochs=args['n_epochs'])\n", + " local_model.fit(lX)\n", + " \n", + " # Pass trained model and order distributed inference\n", + " model = UMAP_MNMG(local_model)\n", + " lazy_transformed = model.transform(dX)\n", + " \n", + " # Perform distributed inference and measure time\n", + " start = time.time()\n", + " lazy_transformed.compute()\n", + " durations.append(time.time()-start)\n", + " \n", + " # Remove \"warm-up\" test\n", + " durations = np.array(durations[1:])\n", + " \n", + " # Stop Dask-CUDA cluster & Dask client\n", + " client.close()\n", + " cluster.close()\n", + " \n", + " # Return runtime average\n", + " return durations.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def runtime_barchart(args, mean_durations):\n", + " labels = ['1 GPU', '2 GPUs', '4 GPUs', '8 GPUs']\n", + " runtimes = list(map(lambda x: round(x, 2), mean_durations))\n", + " x = np.arange(len(labels))\n", + " fig, ax = plt.subplots()\n", + " rects = ax.bar(x, runtimes, 0.35)\n", + "\n", + " ax.set_ylabel('Runtime (s)')\n", + " ax.set_title('Scale of random dataset transform: {}x{}'.format(args['n_samples'], args['n_features']))\n", + " ax.set_xticks(x)\n", + " ax.set_xticklabels(labels)\n", + "\n", + "\n", + " def autolabel(rects):\n", + " for rect in rects:\n", + " height = rect.get_height()\n", + " ax.annotate('{}'.format(height),\n", + " xy=(rect.get_x() + rect.get_width() / 2, height),\n", + " 
xytext=(0, 3),\n", + " textcoords=\"offset points\",\n", + " ha='center', va='bottom')\n", + "\n", + " autolabel(rects)\n", + " fig.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of GPUs: 1, mean runtime: 25.30\n", + "Number of GPUs: 2, mean runtime: 13.38\n", + "Number of GPUs: 4, mean runtime: 7.34\n", + "Number of GPUs: 8, mean runtime: 4.50\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "args = {'n_samples': 100000, 'n_features':300,\n", + " 'n_components': 64, 'n_neighbors':15, 'n_epochs':5000,\n", + " 'sampling_ratio': 0.001, 'n_iter': 3}\n", + "\n", + "mean_runtimes = []\n", + "for n_gpus in [1, 2, 4, 8]:\n", + " args['n_parts'] = n_gpus\n", + " mean_runtime = benchmark(args)\n", + " mean_runtimes.append(mean_runtime)\n", + " print(\"Number of GPUs: {}, mean runtime: {:.2f}\".format(n_gpus, mean_runtime))\n", + "runtime_barchart(args, mean_runtimes)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "\n", + "with open('../results/mnmg_scaling.csv','w') as out:\n", + " csv_out=csv.writer(out)\n", + " csv_out.writerow(['number of GPUs', 'transformation runtime (s)'])\n", + " for i, n_gpus in enumerate([1, 2, 4, 8]):\n", + " csv_out.writerow([n_gpus, mean_runtimes[i]])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/publications/umap_paper_benchmarks/notebooks/umap_mnmg_visualization.ipynb b/publications/umap_paper_benchmarks/notebooks/umap_mnmg_visualization.ipynb new file mode 100644 index 00000000..fc0a8ea3 --- /dev/null +++ b/publications/umap_paper_benchmarks/notebooks/umap_mnmg_visualization.ipynb @@ -0,0 +1,190 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## UMAP MNMG Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from cuml.dask.manifold 
import UMAP as UMAP_MNMG\n", + "from cuml.manifold import UMAP\n", + "from cuml.metrics import trustworthiness\n", + "\n", + "from dask_cuda import LocalCUDACluster\n", + "from dask.distributed import Client\n", + "import dask.array as da\n", + "\n", + "import pickle\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def distribute_data(data, n_parts, sampling_ratio):\n", + " n_samples = data.shape[0]\n", + " \n", + " # Number of samples for local train\n", + " n_to_sample = int(n_samples * sampling_ratio)\n", + "\n", + " # Generate local train data\n", + " selection = np.random.choice(n_samples, n_to_sample)\n", + " lX = data[selection]\n", + "\n", + " # Number of samples per partition\n", + " n_samples_per_part = int(n_samples / n_parts)\n", + "\n", + " # Generate partitioning of distributed data for inference\n", + " chunks = [n_samples_per_part] * n_parts\n", + " chunks[-1] += n_samples % n_samples_per_part\n", + " chunks = tuple(chunks)\n", + " dX = da.from_array(data, chunks=(chunks, -1))\n", + " \n", + " return lX, dX" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_umap_mnmg(args, X, y):\n", + " fig, ax = plt.subplots(len(args['sampling_ratio']), len(args['n_parts']),\n", + " sharex='col', sharey='row', figsize=(20,27))\n", + " fig.subplots_adjust(wspace=0.1, hspace=0.15)\n", + " fig.suptitle('Comparison of Sampling Ratios and Partitioning in Distributed UMAP', y=0.9, size=20)\n", + " \n", + " for a, n_parts in zip(ax[0], args['n_parts']):\n", + " a.set_title(\"Partitions: {}\".format(n_parts))\n", + "\n", + " for a, sampling_ratio in zip(ax[:,0], args['sampling_ratio']):\n", + " a.set_ylabel(\"Sampling: {:.2f}%\".format(sampling_ratio * 100), rotation=0, size='large', labelpad=45)\n", + " \n", + " if n_parts > 1:\n", + " cluster = LocalCUDACluster(n_workers=8, 
threads_per_worker=1)\n", + " client = Client(cluster)\n", + " \n", + " for i, n_parts in enumerate(args['n_parts']):\n", + " for j, sampling_ratio in enumerate(args['sampling_ratio']): \n", + " if n_parts == 1: # Local transformation\n", + " local_model = UMAP(n_components=2, n_neighbors=args['n_neighbors'],\n", + " n_epochs=args['n_epochs'], random_state=args['random_state'])\n", + " \n", + " # Generate subsample for training data\n", + " n_samples = X.shape[0]\n", + " n_to_sample = int(n_samples * sampling_ratio)\n", + " selection = np.random.choice(n_samples, n_to_sample)\n", + " train_data = X[selection]\n", + " \n", + " # Run fit with subsample and transform with full data\n", + " local_model.fit(train_data)\n", + " transformed = local_model.transform(X)\n", + " else: # Distributed transformation\n", + " \n", + " # Distribute data\n", + " lX, dX = distribute_data(X, n_parts, sampling_ratio)\n", + "\n", + " # Train local model\n", + " local_model = UMAP(n_components=2, n_neighbors=args['n_neighbors'],\n", + " n_epochs=args['n_epochs'], random_state=args['random_state'])\n", + " local_model.fit(lX)\n", + " \n", + " # Pass trained model and perform distributed inference\n", + " model = UMAP_MNMG(local_model, random_state=args['random_state'])\n", + " transformed = model.transform(dX).compute()\n", + " \n", + " trust_score = trustworthiness(X, transformed, n_neighbors=args['n_neighbors'])\n", + " \n", + " # Plot transformed data\n", + " subplot = ax[j, i]\n", + " subplot.scatter(transformed[:,0], transformed[:,1], c=y,\n", + " s=0.2, alpha=0.3, edgecolors='none')\n", + " trust_score_text = \"Trust: \" + str(round(trust_score, 4))\n", + " subplot.text(0.5,-0.1, trust_score_text, size=14, ha=\"center\", transform=subplot.transAxes)\n", + " \n", + " if n_parts > 1:\n", + " client.close()\n", + " cluster.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocess tasic2018 dataset\n", + "Generate the preprocessed dataset with 
the **tasic2018_dataset_preprocessing.ipynb** notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "X = pickle.load(open(\"tasic2018_X.p\", \"rb\"))\n", + "y = pickle.load(open(\"tasic2018_y.p\", \"rb\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "args = {'n_neighbors': 15, 'n_epochs': 500, 'random_state': 42,\n", + " 'n_parts':[1,4,8,16], 'sampling_ratio': [0.008,0.016,0.032,0.5,1.0]}\n", + "\n", + "plot_umap_mnmg(args, X, y)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/publications/umap_paper_benchmarks/notebooks/umap_reproducibility_benchmark.ipynb b/publications/umap_paper_benchmarks/notebooks/umap_reproducibility_benchmark.ipynb new file mode 100644 index 00000000..41463447 --- /dev/null +++ b/publications/umap_paper_benchmarks/notebooks/umap_reproducibility_benchmark.ipynb @@ -0,0 +1,232 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## UMAP reproducibility benchmark (runtime & trustworthiness)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from umap import UMAP as umap_learn\n", + "from cuml.manifold import UMAP as umap_cuml\n", + "from cuml.metrics import trustworthiness\n", + "\n", + "from sklearn.datasets import make_blobs\n", + "import time\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_data(n_samples):\n", + " X, y = make_blobs(n_samples=n_samples, n_features=args['n_features'],\n", + " centers=int(n_samples/20), cluster_std=8.0)\n", + " return X" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + 
"metadata": {}, + "outputs": [], + "source": [ + "def benchmark_model(model_constr, args, data, n_components):\n", + " durations = []\n", + " trust_scores = []\n", + " for i in range(args['n_iter'] + 1):\n", + " # Instantiate model\n", + " model = model_constr(n_components=n_components, n_neighbors=args['n_neighbors'],\n", + " n_epochs=args['n_epochs'], random_state=args['random_state'])\n", + " \n", + " # Perform transformation and measure time\n", + " start = time.time()\n", + " transformed = model.fit_transform(data)\n", + " durations.append(time.time()-start)\n", + " \n", + " # Compute trustworthiness score\n", + " trust_scores.append(trustworthiness(data, transformed, n_neighbors=args['n_neighbors']))\n", + " \n", + " durations = np.array(durations[1:])\n", + " trust_scores = np.array(trust_scores)\n", + " \n", + " # Compute runtime average and variance as well as trustworthiness score average\n", + " return durations.mean(), durations.var(), trust_scores.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(args):\n", + " for n_samples in args['n_samples']:\n", + " for n_components in args['n_components']:\n", + " # Generate dataset\n", + " X = generate_data(n_samples)\n", + "\n", + " # Benchmarks the two models\n", + " print(\"For dataset of shape ({}, {}) and n_components = {}:\".format(n_samples, args['n_features'], n_components))\n", + "\n", + " n_elements = n_samples * args['n_features']\n", + " if n_elements <= 10000000:\n", + " print(\"\\tUMAP-LEARN:\")\n", + " args['random_state'] = None\n", + " ul_inconsistent = benchmark_model(umap_learn, args, X, n_components)\n", + " args['random_state'] = 42\n", + " ul_consistent = benchmark_model(umap_learn, args, X, n_components)\n", + " print_results(ul_inconsistent, ul_consistent)\n", + "\n", + " print(\"\\tCUML UMAP:\")\n", + " args['random_state'] = None\n", + " cuml_inconsistent = benchmark_model(umap_cuml, args, X, 
n_components)\n", + " args['random_state'] = 42\n", + " cuml_consistent = benchmark_model(umap_cuml, args, X, n_components)\n", + " print_results(cuml_inconsistent, cuml_consistent)\n", + "\n", + " a = cuml_consistent[0]\n", + " b = cuml_inconsistent[0]\n", + " slowdown = ((a - b) / a) * 100\n", + " print('\\t\\tcuML consistent pathway is {:.2f}% slower\\n'.format(slowdown))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def print_results(inconsistent, consistent):\n", + " ic_dur_mean, ic_dur_var, ic_trust = inconsistent\n", + " print(\"\\t\\tWithout random seed: runtime avg - var: {:.2f} - {:.2f}, tustworthiness: {:.2f}\".format(ic_dur_mean, ic_dur_var, ic_trust))\n", + " c_dur_mean, c_dur_var, c_trust = consistent\n", + " print(\"\\t\\tWith random seed: runtime avg - var: {:.2f} - {:.2f}, tustworthiness: {:.2f}\".format(c_dur_mean, c_dur_var, c_trust))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For dataset of shape (1000, 1000) and n_components = 2:\n", + "\tUMAP-LEARN:\n", + "\t\tWithout random seed: runtime avg - var: 2.56 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 2.54 - 0.00, tustworthiness: 1.00\n", + "\tCUML UMAP:\n", + "\t\tWithout random seed: runtime avg - var: 0.24 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 0.24 - 0.00, tustworthiness: 1.00\n", + "\t\tcuML consistent pathway is -3.18% slower\n", + "\n", + "For dataset of shape (1000, 1000) and n_components = 8:\n", + "\tUMAP-LEARN:\n", + "\t\tWithout random seed: runtime avg - var: 2.75 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 2.71 - 0.00, tustworthiness: 1.00\n", + "\tCUML UMAP:\n", + "\t\tWithout random seed: runtime avg - var: 0.25 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 0.26 - 
0.00, tustworthiness: 1.00\n", + "\t\tcuML consistent pathway is 5.35% slower\n", + "\n", + "For dataset of shape (1000, 1000) and n_components = 16:\n", + "\tUMAP-LEARN:\n", + "\t\tWithout random seed: runtime avg - var: 2.85 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 2.88 - 0.00, tustworthiness: 1.00\n", + "\tCUML UMAP:\n", + "\t\tWithout random seed: runtime avg - var: 0.27 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 0.31 - 0.00, tustworthiness: 1.00\n", + "\t\tcuML consistent pathway is 11.07% slower\n", + "\n", + "For dataset of shape (10000, 1000) and n_components = 2:\n", + "\tUMAP-LEARN:\n", + "\t\tWithout random seed: runtime avg - var: 31.17 - 0.10, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 31.08 - 0.02, tustworthiness: 1.00\n", + "\tCUML UMAP:\n", + "\t\tWithout random seed: runtime avg - var: 0.29 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 0.32 - 0.00, tustworthiness: 1.00\n", + "\t\tcuML consistent pathway is 9.54% slower\n", + "\n", + "For dataset of shape (10000, 1000) and n_components = 8:\n", + "\tUMAP-LEARN:\n", + "\t\tWithout random seed: runtime avg - var: 37.21 - 0.37, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 35.90 - 0.03, tustworthiness: 1.00\n", + "\tCUML UMAP:\n", + "\t\tWithout random seed: runtime avg - var: 0.35 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 0.46 - 0.00, tustworthiness: 1.00\n", + "\t\tcuML consistent pathway is 24.26% slower\n", + "\n", + "For dataset of shape (10000, 1000) and n_components = 16:\n", + "\tUMAP-LEARN:\n", + "\t\tWithout random seed: runtime avg - var: 42.85 - 0.02, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 45.47 - 0.01, tustworthiness: 1.00\n", + "\tCUML UMAP:\n", + "\t\tWithout random seed: runtime avg - var: 0.43 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 0.84 
- 0.00, tustworthiness: 1.00\n", + "\t\tcuML consistent pathway is 49.20% slower\n", + "\n", + "For dataset of shape (100000, 1000) and n_components = 2:\n", + "\tCUML UMAP:\n", + "\t\tWithout random seed: runtime avg - var: 1.26 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 1.59 - 0.00, tustworthiness: 1.00\n", + "\t\tcuML consistent pathway is 20.60% slower\n", + "\n", + "For dataset of shape (100000, 1000) and n_components = 8:\n", + "\tCUML UMAP:\n", + "\t\tWithout random seed: runtime avg - var: 1.80 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 3.20 - 0.00, tustworthiness: 0.80\n", + "\t\tcuML consistent pathway is 43.67% slower\n", + "\n", + "For dataset of shape (100000, 1000) and n_components = 16:\n", + "\tCUML UMAP:\n", + "\t\tWithout random seed: runtime avg - var: 2.58 - 0.00, tustworthiness: 1.00\n", + "\t\tWith random seed: runtime avg - var: 7.09 - 0.00, tustworthiness: 0.76\n", + "\t\tcuML consistent pathway is 63.67% slower\n", + "\n" + ] + } + ], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "args = {'n_samples':[1000, 10000, 100000], 'n_features':1000, 'centers':500,\n", + " 'n_components':[2, 8, 16], 'n_neighbors':15, 'n_epochs':500, 'n_iter':3}\n", + "\n", + "benchmark(args)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/publications/umap_paper_benchmarks/results/benchmark_results.csv b/publications/umap_paper_benchmarks/results/benchmark_results.csv new file mode 100644 index 00000000..4e2a9aa9 --- /dev/null +++ 
b/publications/umap_paper_benchmarks/results/benchmark_results.csv @@ -0,0 +1,43 @@ +dataset,impl,bench,mean,var,max_trust +digits,umapcuml,unsupervised,0.35827815532684326,0.011162857575371277,0.9876636433389232 +digits,umapcuml,supervised,0.40629714727401733,0.013507401316314116,0.9879864913274599 +digits,umaplearn,unsupervised,6.328544795513153,2.8975128262992627,0.9879530939808027 +digits,umaplearn,supervised,6.755777180194855,0.1106131444620786,0.9876400330669284 +digits,umapgpumap,unsupervised,2.482882022857666,1.0582028882037378,0.9558078864163978 +digits,umapgpumap,supervised,2.5530967116355896,0.10950217290617204,0.9548070534694238 +fashion_mnist,umapcuml,unsupervised,0.45551514625549316,0.006011408943862762,0.9773246413616931 +fashion_mnist,umapcuml,supervised,1.036961555480957,0.0002095378332569453,0.9762932845375539 +fashion_mnist,umaplearn,unsupervised,45.870298981666565,10.23240442609631,0.9783708581947695 +fashion_mnist,umaplearn,supervised,53.09064221382141,6.182976974198482,0.9781969233047202 +fashion_mnist,umapgpumap,unsupervised,4.158320248126984,1.800035649938632,0.9750927790516971 +fashion_mnist,umapgpumap,supervised,6.476689338684082,0.06324447576008652,0.9696567894031254 +cifar100,umapcuml,unsupervised,1.008723258972168,0.018844815007184934,0.8342801452408608 +cifar100,umapcuml,supervised,1.081564486026764,0.0003376463217925618,0.8382352637585149 +cifar100,umaplearn,unsupervised,105.85025483369827,2.482921346083458,0.8472284890319066 +cifar100,umaplearn,supervised,98.42393732070923,2.272873735988469,0.8490670250262601 +cifar100,umapgpumap,unsupervised,6.186033010482788,1.7708737036427635,0.8401050404506913 +cifar100,umapgpumap,supervised,5.953894853591919,0.023564399280502357,0.8309824029952222 +coil20,umapcuml,unsupervised,0.7574235200881958,0.5752119048886044,0.9927721908570533 +coil20,umapcuml,supervised,0.3695611357688904,0.006603219726240894,0.986986493374108 
+coil20,umaplearn,unsupervised,11.210062146186829,2.571288658299423,0.9936500235238768 +coil20,umaplearn,supervised,12.338415145874023,0.02176407274296821,0.9868479965498315 +coil20,umapgpumap,unsupervised,2.5822073221206665,0.005004079547958895,0.9565478057450535 +coil20,umapgpumap,supervised,8.210693836212158,0.02740621988587577,0.9332845866332105 +shuttle,umapcuml,unsupervised,0.5825626254081726,0.025230869766499353,0.9999991344960172 +shuttle,umapcuml,supervised,0.5559555292129517,0.011077821222144735,0.9999999959879131 +shuttle,umaplearn,unsupervised,38.87588673830032,8.039266967871527,1.0 +shuttle,umaplearn,supervised,50.173097372055054,17.54718326385637,1.0 +shuttle,umapgpumap,unsupervised,9.064357042312622,3.431728740054808,0.9778424900551579 +shuttle,umapgpumap,supervised,17.151020526885986,23.923241572835195,0.9668076059384338 +mnist,umapcuml,unsupervised,0.7078064680099487,0.008856602310757467,0.9574214731202516 +mnist,umapcuml,supervised,0.9175034761428833,0.0029794700764256277,0.9574419861872977 +mnist,umaplearn,unsupervised,52.57512265443802,1.1677051461933843,0.9594030397578331 +mnist,umaplearn,supervised,89.16777920722961,6.465818109702013,0.9584785717302744 +mnist,umapgpumap,unsupervised,10.60714328289032,0.45444580430584836,0.9442834214597077 +mnist,umapgpumap,supervised,23.90549498796463,7.056965841371607,0.9469258037618123 +scrna,umapcuml,unsupervised,4.103261113166809,0.06018657014949724,0.9781377849332045 +scrna,umapcuml,supervised,0,0,0 +scrna,umaplearn,unsupervised,223.92166656255722,9.071952858837903,0.6238778233428524 +scrna,umaplearn,supervised,0,0,0 +scrna,umapgpumap,unsupervised,10.890332162380219,1.604801846075489,0.9434946335833367 +scrna,umapgpumap,supervised,0,0,0 diff --git a/publications/umap_paper_benchmarks/results/googlenews_knn_and_without.csv b/publications/umap_paper_benchmarks/results/googlenews_knn_and_without.csv new file mode 100644 index 00000000..48441815 --- /dev/null +++ 
b/publications/umap_paper_benchmarks/results/googlenews_knn_and_without.csv @@ -0,0 +1,11 @@ +n_samples,with_knn,without_knn +1024,0.25798463821411133,1.2285232543945312 +334243,7.277303218841553,0.767493486404419 +667463,28.77500081062317,1.6629040241241455 +1000682,64.54534125328064,2.7732017040252686 +1333902,114.33410835266113,3.8045833110809326 +1667121,176.93605589866638,5.041834831237793 +2000341,255.18308854103088,5.964860916137695 +2333560,345.6909635066986,7.245739221572876 +2666780,451.7911911010742,8.139975309371948 +3000000,570.7973875999451,9.288048505783081 diff --git a/publications/umap_paper_benchmarks/results/mnmg_scaling.csv b/publications/umap_paper_benchmarks/results/mnmg_scaling.csv new file mode 100644 index 00000000..ac49254b --- /dev/null +++ b/publications/umap_paper_benchmarks/results/mnmg_scaling.csv @@ -0,0 +1,5 @@ +number of GPUs,transformation runtime (s) +1,25.3 +2,13.38 +4,7.34 +8,4.5 diff --git a/publications/umap_paper_benchmarks/results/results.pickle b/publications/umap_paper_benchmarks/results/results.pickle new file mode 100644 index 00000000..b83794cd Binary files /dev/null and b/publications/umap_paper_benchmarks/results/results.pickle differ diff --git a/publications/umap_paper_benchmarks/results/scale_results_precompute_knn.pickle b/publications/umap_paper_benchmarks/results/scale_results_precompute_knn.pickle new file mode 100644 index 00000000..0cc7fa84 Binary files /dev/null and b/publications/umap_paper_benchmarks/results/scale_results_precompute_knn.pickle differ diff --git a/publications/umap_paper_benchmarks/results/scale_results_with_knn.pickle b/publications/umap_paper_benchmarks/results/scale_results_with_knn.pickle new file mode 100644 index 00000000..d31e4fc2 Binary files /dev/null and b/publications/umap_paper_benchmarks/results/scale_results_with_knn.pickle differ diff --git a/publications/umap_paper_benchmarks/umap_bench/__init__.py b/publications/umap_paper_benchmarks/umap_bench/__init__.py new file mode 
100644 index 00000000..e69de29b diff --git a/publications/umap_paper_benchmarks/umap_bench/funcs.py b/publications/umap_paper_benchmarks/umap_bench/funcs.py new file mode 100644 index 00000000..51033ac8 --- /dev/null +++ b/publications/umap_paper_benchmarks/umap_bench/funcs.py @@ -0,0 +1,203 @@ +import matplotlib.pyplot as plt +import numpy as np + +import os + +import time +import numpy as np + +import pickle + +from cuml.metrics import trustworthiness + +TRUST_BATCH_SIZE = 5000 + +def maybe_get_results(results, key): + return results[key] if key in results else {} + +def draw_chart(model, X, y, dataset, model_name, classes=None): + + embedding = model.fit_transform(X, y) + + fig, ax = plt.subplots(1, figsize=(14, 10)) + plt.scatter(embedding[:,1], embedding[:,0], s=0.3, c=y, cmap='Spectral', alpha=1.0) + plt.setp(ax, xticks=[], yticks=[]) + cbar = plt.colorbar(boundaries=np.arange(11)-0.5) + cbar.set_ticks(np.arange(10)) + if classes is not None: + cbar.set_ticklabels(classes) + plt.title("%s Embedded via %s" % (dataset, model_name)); + + +def _run_build_and_train_once(model_class, X, y=None, kwargs={}, knn_graph=None, verbose=False, eval_trust=True): + + results = {} + extra_args = {} + if knn_graph is not None: + extra_args["knn_graph"] = knn_graph + + if verbose: + print("Unsupervised") + model = model_class(**kwargs) + + try: + start = time.time() + embeddings = model.fit_transform(X, **extra_args) + end = time.time() - start + + if verbose: + print("Time: "+ str(end)) + + n_neighbors = model.n_neighbors + del model + + if eval_trust: + if verbose: + print("Done. 
Evaluating trustworthiness") + trust = trustworthiness(X, embeddings, n_neighbors=n_neighbors, batch_size=TRUST_BATCH_SIZE) + else: + trust = None + + if verbose: + print(str(trust)) + results["unsupervised"] = {"time": end, "trust": trust} + except: + import traceback + traceback.print_exc() + + # Supervised + + if y is not None: + if verbose: + print("Supervised") + kwargs["target_metric"] = "categorical" + model = model_class(**kwargs) + + try: + start = time.time() + embeddings = model.fit_transform(X, y, **extra_args) + end = time.time() - start + + + n_neighbors = model.n_neighbors + del model + + if eval_trust: + if verbose: + print("Done. Evaluating trustworthiness") + trust = trustworthiness(X, embeddings, n_neighbors=n_neighbors, batch_size=TRUST_BATCH_SIZE) + else: + trust = None + + if verbose: + print(str(trust)) + print("Time: "+ str(end)) + + results["supervised"] = {"time": end, "trust": trust} + except: + import traceback + traceback.print_exc() + + # Transform + + + if verbose: + print("Transform") + model = model_class(**kwargs) + + try: + + if knn_graph is not None: + model.fit(X) + start = time.time() + embeddings = model.transform(X, **extra_args) + end = time.time() - start + else: + model.fit(X, knn_graph=knn_graph) + start = time.time() + embeddings = model.transform(X, knn_graph=knn_graph) + end = time.time() - start + + + n_neighbors = model.n_neighbors + del model + + if eval_trust: + if verbose: + print("Done. 
Evaluating trustworthiness") + trust = trustworthiness(X, embeddings, n_neighbors=n_neighbors, batch_size=TRUST_BATCH_SIZE) + else: + trust = None + + if verbose: + print(str(trust)) + print("Time: "+ str(end)) + results["xform"] = {"time": end, "trust": trust} + except: + import traceback + traceback.print_exc() + + return results + + +def build_and_train(model_class, X, y=None, kwargs={}, n_trials=4, knn_graph=None, verbose=False, eval_trust=True): + + results = [] + + for trial in range(n_trials): + results.append(_run_build_and_train_once(model_class, X, y=y, kwargs=kwargs, + knn_graph=knn_graph, verbose=verbose, + eval_trust=eval_trust)) + return results + +def store_results(results, filename): + with open(filename, 'wb') as handle: + pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL) + + +def maybe_load_results(filename): + # Load a results file if it exists, otherwise load empty dictionary" + return pickle.load( open(filename, "rb" ) ) if os.path.exists(filename) else {} + + +def perform_n_samples_test(model, X, precompute_knn=True, start_samples=1024, n_indep=10, n_trials=1, n_components=2): + import math + results = {} + s = np.linspace(start_samples, X.shape[0], n_indep) + for n_samples in s: + print("Testing " + str(n_samples)) + samples = np.random.choice(np.arange(X.shape[0]), math.floor(n_samples)) + X_sampled = X[samples] + + if precopute_knn: + from cuml.neighbors import NearestNeighbors + import cupy as cp + d, i = NearestNeighbors(n_neighbors=15).fit(X_sampled).kneighbors(X_sampled) + knn_graph = cp.sparse.coo_matrix((cp.asarray(d.ravel()), (cp.repeat(cp.arange(d.shape[0]), 15), cp.asarray(i.ravel())))) + else: + knn_graph = None + + results[n_samples] = build_and_train(model, + X_sampled, + y=None, + kwargs={"n_components": n_components}, + knn_graph=knn_graph, + verbose=True, + n_trials=n_trials, + eval_trust=False) + + return results + +def perform_n_components_test(model, X, model_name, start_components=2, stop_components=1024): + 
+ import math + + n_components = np.linspace(start_components, stop_components, 3) + + print(n_components) + + for components in n_components: + print("Testing " + str(math.floor(components)) + " components") + scale_results[model_name + "_" + str(math.floor(components)) + "_components"] = \ + perform_n_samples_test(model, X, n_components=math.floor(components)) + store_results(scale_results, "results/scale_results.pickle") \ No newline at end of file diff --git a/publications/umap_paper_benchmarks/umap_bench/loaders.py b/publications/umap_paper_benchmarks/umap_bench/loaders.py new file mode 100644 index 00000000..e2585ac4 --- /dev/null +++ b/publications/umap_paper_benchmarks/umap_bench/loaders.py @@ -0,0 +1,124 @@ +import os +import gzip +import numpy as np +import pickle +import wget + +from datasets.coil20.feed import feed + +import scipy.io + + +def load_digits(): + from sklearn import datasets + data = datasets.load_digits() + + return data.data, data.target + + +def load_fashion_mnist(path, kind='train'): + + """Load MNIST data from `path`""" + labels_path = os.path.join(path, + '%s-labels-idx1-ubyte.gz' + % kind) + images_path = os.path.join(path, + '%s-images-idx3-ubyte.gz' + % kind) + + with gzip.open(labels_path, 'rb') as lbpath: + labels = np.frombuffer(lbpath.read(), dtype=np.uint8, + offset=8) + + with gzip.open(images_path, 'rb') as imgpath: + images = np.frombuffer(imgpath.read(), dtype=np.uint8, + offset=16).reshape(len(labels), 784) + + return images, labels + + +def unpickle_cifar100(file): + import pickle + with open(file, 'rb') as fo: + dict = pickle.load(fo, encoding='bytes') + return dict + +def load_cifar100(path="data/cifar100/cifar-100-python"): + + train_path = os.path.join(path, "train") + test_path = os.path.join(path, "test") + + if not os.path.exists(train_path): + raise ValueError("Path %s not found. Please provide path to " + "untarred cifar100 dataset." 
% train_path) + + if not os.path.exists(test_path): + raise ValueError("Path %s not found. Please provide path to " + "untarred cifar100 dataset." % test_path) + + train = unpickle_cifar100(train_path) + test = unpickle_cifar100(test_path) + + return train, test + + +def load_shuttle(filepath="data/shuttle.mat"): + if not os.path.exists(filepath): + raise ValueError("File shuttle.mat not found. Please download " + "from 'https://www.dropbox.com/s/mk8ozgisimfn3dw/shuttle.mat'") + + mat = scipy.io.loadmat(filepath) + + X = mat["X"].astype(np.float32) + y = mat["y"].astype(np.int32).ravel() + return X, y + + +def load_coil20(path="data/coil20"): + feed(feed_path=path, dataset_type='processed') + + from datasets import pa2np + X, Y = pa2np(os.path.join(path, "X_processed.pa")), pa2np(os.path.join(path, "Y_processed.pa")) + + features = X.shape[2]*X.shape[3] + new_X = np.zeros((X.shape[0], features)) + + from skimage import color + for i in range(X.shape[0]): + img = X[i, :, :, :] + shape = features + gray = color.rgb2gray(np.moveaxis(img, 0, 2)).reshape(shape) + new_X[i] = gray + + X = new_X.astype(np.float32) + y = Y.astype(np.float32) + + return X, y + +def load_mnist(path="data/mnist/"): + from datasets.mnist.feed import feed + feed(feed_path=path) + + from datasets import pa2np + X, Y = pa2np(os.path.join(path, "X.pa")), pa2np(os.path.join(path, "Y.pa")) + + X = X.reshape(X.shape[0], X.shape[1] * X.shape[2]) + y = Y + + return X, y + + +def load_word2vec(path): + + from gensim.models import KeyedVectors + + bin_file = os.path.join(path, "/GoogleNews-vectors-negative300.bin") + + if not os.path.exists(bin_file): + raise ValueError("GoogleNews-vectors-negative300.bin was not found in " + path + + ". You will need to download this file and place in 'path'") + + vecs = KeyedVectors.load_word2vec_format(bin_file, binary=True) + + return vecs.vectors +