diff --git a/tests/tpch/plot_results.ipynb b/tests/tpch/plot_results.ipynb new file mode 100644 index 0000000000..aeb6d762aa --- /dev/null +++ b/tests/tpch/plot_results.ipynb @@ -0,0 +1,345 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0cdaec38-4a9e-4a25-b45e-1188903d219d", + "metadata": {}, + "source": [ + "# Plot TPC-H results" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5e877284-731c-4105-bef4-4409b410dc24", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from plotting import from_db, latest, normalize, plot" + ] + }, + { + "cell_type": "markdown", + "id": "048cfafb-573f-4ad5-b724-fdff3af83adf", + "metadata": {}, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "01d857fc-7956-4592-ba34-6e325cbb217b", + "metadata": {}, + "outputs": [], + "source": [ + "df = from_db(\"/Users/hendrikmakait/Downloads/tpch_1000.db\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5815ac3c-2ad6-4207-a977-31d61039d355", + "metadata": {}, + "outputs": [], + "source": [ + "spark_df = from_db(\"/Users/hendrikmakait/projects/coiled/benchmarks/benchmark.db\")\n", + "spark_df = spark_df[spark_df[\"library\"] == \"pyspark\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1130a571-db77-46e7-990f-b0913a22b5dc", + "metadata": {}, + "outputs": [], + "source": [ + "dask_df = from_db(\"/Users/hendrikmakait/Downloads/tpch_1000.db 3\")\n", + "dask_df = dask_df[dask_df[\"library\"] == \"dask\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c2ad0b96-86f8-40d3-8c83-ca064877c647", + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = pd.concat([df, dask_df, spark_df])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "36617fe5-c676-474a-b4d3-7eb139157d21", + "metadata": {}, + "outputs": [], + "source": [ + "df = latest(combined_df)" + ] + }, + { + "cell_type": "markdown", + "id": 
"ba013c77-5173-42c5-881f-89a4455f01ac", + "metadata": {}, + "source": [ + "## Prepare data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "677b16fc-aae5-4c21-abfd-aeb5505f403c", + "metadata": {}, + "outputs": [], + "source": [ + "df = normalize(df)" + ] + }, + { + "cell_type": "markdown", + "id": "e6eb21e4-9754-48fd-a6b9-4429f4696928", + "metadata": {}, + "source": [ + "## Generate plots" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1c83ccc4-ac11-4edf-bbea-62d613113e61", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "abs_chart = plot(df, [\"dask\", \"duckdb\", \"pyspark\"])\n", + "abs_chart.save(\"tpch-scale-1000-no-polars.svg\")\n", + "abs_chart" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ab5492be-6f97-49cd-b248-c6f5cac71638", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rel_chart = plot(df, [\"dask\", \"duckdb\", \"pyspark\"], column=\"relative_duration\")\n", + "rel_chart.save(\"tpch-scale-1000-no-polars-relative.svg\")\n", + "rel_chart" + ] + }, + { + "cell_type": "markdown", + "id": "611c2644-c7b7-47ac-a2b2-1c47fa22360f", + "metadata": {}, + "source": [ + "## Compute summary statistics" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "34830787-2364-4541-8cf8-8adffbde9148", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "library\n", + "dask 1.000000\n", + "duckdb 1.091302\n", + "pyspark 1.255598\n", + "Name: relative_duration, dtype: float64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby(\"library\").relative_duration.mean()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/tpch/plotting.py b/tests/tpch/plotting.py new file mode 100644 index 0000000000..88503e4fde --- /dev/null +++ b/tests/tpch/plotting.py @@ -0,0 +1,78 @@ +import altair as alt +import pandas as pd + +LIBRARY_COLORS = { + "dask": "#5677a4", + "duckdb": "#e68b39", + "polars": "#d4605b", + "pyspark": "green", +} + + +def from_db(path): + df = pd.read_sql_table(table_name="test_run", con=f"sqlite:///{path}") + + df = df[ + (df.call_outcome == "passed") + & (df.path.str.contains("^tpch/test_(?:dask|duckdb|polars|pyspark)")) + & df.cluster_name + ] + df = df[["path", "name", 
def from_db(path):
    """Load the passed TPC-H runs recorded in a benchmark SQLite database.

    Reads the ``test_run`` table and keeps only rows whose ``call_outcome``
    is ``"passed"``, whose ``path`` starts with ``tpch/test_`` followed by
    one of dask, duckdb, polars or pyspark, and whose ``cluster_name`` is
    set.

    Args:
        path: Location of the SQLite file; it is interpolated into a
            ``sqlite:///`` connection URL.

    Returns:
        A DataFrame with the columns ``name``, ``duration``, ``start``,
        ``library``, ``query`` and ``scale``.
    """
    runs = pd.read_sql_table(table_name="test_run", con=f"sqlite:///{path}")

    passed = runs.call_outcome == "passed"
    is_tpch = runs.path.str.contains(
        r"^tpch/test_(?:dask|duckdb|polars|pyspark)", na=False
    )
    # Spell out "has a cluster" instead of and-ing with the raw string column,
    # which only works through an implicit object -> bool cast.
    has_cluster = runs.cluster_name.notna() & (runs.cluster_name != "")

    # Copy so that the column assignments below never write into a view of
    # the table that was read from the database.
    columns = ["path", "name", "duration", "start", "cluster_name"]
    runs = runs.loc[passed & is_tpch & has_cluster, columns].copy()

    # "<...>_<library>.<ext>" -> "<library>"
    runs["library"] = runs.path.map(
        lambda test_path: test_path.split("_")[-1].split(".")[0]
    )
    # The query number is the trailing "_<n>" of the test name; it has to be
    # read before "name" is overwritten with the cluster-derived name.
    runs["query"] = runs.name.map(lambda test_name: int(test_name.split("_")[-1]))
    # Assumes cluster names look like "<a>-<b>-<scale>-<name>" (the name may
    # itself contain dashes) -- TODO confirm against the benchmark fixtures.
    runs["name"] = runs.cluster_name.map(lambda cluster: cluster.split("-", 3)[-1])
    runs["scale"] = runs.cluster_name.map(lambda cluster: int(cluster.split("-")[2]))
    return runs.drop(columns=["path", "cluster_name"])


def latest(df, n=1):
    """Keep the ``n`` most recent runs of every (library, query) pair.

    Recency is decided by the ``start`` column, which is dropped from the
    result.

    Args:
        df: Runs with at least ``library``, ``query`` and ``start`` columns.
        n: Number of runs to keep per (library, query) pair.

    Returns:
        A DataFrame ordered by library, query and start time, with a fresh
        ``RangeIndex`` and without the ``start`` column.
    """
    ordered = df.sort_values(["library", "query", "start"])
    # ``GroupBy.tail`` filters rows and keeps every column, so the grouping
    # columns survive regardless of how ``GroupBy.apply`` treats them.
    newest = ordered.groupby(["library", "query"], sort=False).tail(n)
    return newest.drop(columns="start").reset_index(drop=True)


def normalize(df):
    """Add ``relative_duration``: each duration divided by dask's duration.

    The baseline for a query is the mean ``duration`` of its dask rows, which
    is exactly the dask duration when there is one dask run per query. Rows
    of a query without any dask run get ``NaN``.

    Args:
        df: Runs with ``library``, ``query`` and ``duration`` columns.

    Returns:
        A copy of ``df`` ordered by query (original order kept within a
        query) with the extra ``relative_duration`` column and a fresh
        ``RangeIndex``.
    """
    dask_runs = df[df["library"] == "dask"]
    baseline = dask_runs.groupby("query")["duration"].mean()

    # A stable sort keeps the rows of one query in their original order.
    ordered = df.sort_values("query", kind="stable")
    relative = ordered["duration"] / ordered["query"].map(baseline)
    return ordered.assign(relative_duration=relative).reset_index(drop=True)


def subplot(df, column, libraries):
    """Build one grouped bar chart of ``column`` per query and library.

    Args:
        df: Rows to draw; needs ``query``, ``library`` and ``column``.
        column: Name of the quantitative column shown on the y axis.
        libraries: Libraries forming the colour domain, in legend order.
            Every entry must be a key of ``LIBRARY_COLORS``.

    Returns:
        The Altair bar chart.
    """
    palette = [LIBRARY_COLORS[library] for library in libraries]
    color = alt.Color("library").scale(domain=libraries, range=palette)
    bars = alt.Chart(df).mark_bar()
    return bars.encode(
        x="query:N",
        y=f"{column}:Q",
        xOffset="library:N",
        color=color,
        tooltip=["library", column],
    )


def plot(df, libraries=None, column="duration"):
    """Draw the TPC-H results as two vertically stacked bar charts.

    Queries below 12 go into the upper chart and the remaining ones into the
    lower chart. The title is built from the ``scale`` and ``name`` of the
    first row of ``df``.

    Args:
        df: Runs with ``query``, ``library``, ``scale``, ``name`` and
            ``column``.
        libraries: Libraries to show; defaults to dask, duckdb, polars and
            pyspark. Rows of other libraries are left out of the charts.
        column: Column plotted on the y axis.

    Returns:
        The combined Altair chart.
    """
    if libraries is None:
        libraries = ["dask", "duckdb", "polars", "pyspark"]

    title = f"TPC-H -- scale:{df.scale.iloc[0]} name:{df.name.iloc[0]}"

    # Rows outside the colour domain would still claim a bar slot, so only
    # pass the requested libraries to the charts.
    shown = df[df["library"].isin(libraries)]
    upper = subplot(shown[shown["query"] < 12], column=column, libraries=libraries)
    lower = subplot(shown[shown["query"] >= 12], column=column, libraries=libraries)
    chart = upper & lower
    return chart.properties(title=title).configure_title(
        fontSize=20,
    )