diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q37.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q37.py new file mode 100644 index 00000000000..58da69806db --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q37.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 37.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 37.""" + return """ + SELECT + i_item_id , + i_item_desc , + i_current_price + FROM item, + inventory, + date_dim, + catalog_sales + WHERE i_current_price BETWEEN 20 AND 20 + 30 + AND inv_item_sk = i_item_sk + AND d_date_sk=inv_date_sk + AND d_date BETWEEN Cast('1999-03-06' AS DATE) AND ( + Cast('1999-03-06' AS DATE) + INTERVAL '60' day) + AND i_manufact_id IN (843,815,850,840) + AND inv_quantity_on_hand BETWEEN 100 AND 500 + AND cs_item_sk = i_item_sk + GROUP BY i_item_id, + i_item_desc, + i_current_price + ORDER BY i_item_id + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 37.""" + # Load tables + item = get_data(run_config.dataset_path, "item", run_config.suffix) + inventory = get_data(run_config.dataset_path, "inventory", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + return ( + item.join(inventory, left_on="i_item_sk", right_on="inv_item_sk") + .join(date_dim, left_on="inv_date_sk", right_on="d_date_sk") + .join(catalog_sales, left_on="i_item_sk", right_on="cs_item_sk") + .filter( + (pl.col("i_current_price").is_between(20, 50)) + & (pl.col("i_manufact_id").is_in([843, 815, 850, 840])) + & (pl.col("inv_quantity_on_hand").is_between(100, 500)) + & (pl.col("d_date").is_between(pl.date(1999, 3, 6), pl.date(1999, 5, 5))) + ) + .group_by(["i_item_id", "i_item_desc", "i_current_price"]) + .agg([]) + .sort(["i_item_id"]) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q38.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q38.py new file mode 100644 index 00000000000..861de156fb3 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q38.py @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 38.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 38.""" + return """ + SELECT Count(*) + FROM (SELECT DISTINCT c_last_name, + c_first_name, + d_date + FROM store_sales, + date_dim, + customer + WHERE store_sales.ss_sold_date_sk = date_dim.d_date_sk + AND store_sales.ss_customer_sk = customer.c_customer_sk + AND d_month_seq BETWEEN 1188 AND 1188 + 11 + INTERSECT + SELECT DISTINCT c_last_name, + c_first_name, + d_date + FROM catalog_sales, + date_dim, + customer + WHERE catalog_sales.cs_sold_date_sk = date_dim.d_date_sk + AND catalog_sales.cs_bill_customer_sk = customer.c_customer_sk + AND d_month_seq BETWEEN 1188 AND 1188 + 11 + INTERSECT + SELECT DISTINCT c_last_name, + c_first_name, + d_date + FROM web_sales, + date_dim, + customer + WHERE web_sales.ws_sold_date_sk = date_dim.d_date_sk + AND web_sales.ws_bill_customer_sk = customer.c_customer_sk + AND d_month_seq BETWEEN 1188 AND 1188 + 11) hot_cust + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 38.""" + # Load tables + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + customer = get_data(run_config.dataset_path, "customer", run_config.suffix) + # Filter date_dim for the specified month sequence range + date_filter = date_dim.filter(pl.col("d_month_seq").is_between(1188, 1188 + 11)) + # Store sales customers with names and dates + store_customers = ( + store_sales.join(date_filter, left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(customer, left_on="ss_customer_sk", right_on="c_customer_sk") + .select(["c_last_name", "c_first_name", "d_date"]) + .unique() + ) + # Catalog sales customers with names and dates + catalog_customers = ( + catalog_sales.join(date_filter, left_on="cs_sold_date_sk", right_on="d_date_sk") + .join(customer, left_on="cs_bill_customer_sk", right_on="c_customer_sk") + .select(["c_last_name", "c_first_name", "d_date"]) + .unique() + ) + # Web sales customers with names and dates + web_customers = ( + web_sales.join(date_filter, left_on="ws_sold_date_sk", right_on="d_date_sk") + .join(customer, left_on="ws_bill_customer_sk", right_on="c_customer_sk") + .select(["c_last_name", "c_first_name", "d_date"]) + .unique() + ) + # Find INTERSECT of all three using a different approach + # Combine all three and find tuples that appear exactly 3 times + all_customers = pl.concat( + [ + store_customers.with_columns(pl.lit("store").alias("source")), + catalog_customers.with_columns(pl.lit("catalog").alias("source")), + web_customers.with_columns(pl.lit("web").alias("source")), + ] + ) + # Find combinations that appear in all three sources + intersect_final = ( + all_customers.group_by(["c_last_name", "c_first_name", "d_date"]) + .agg(pl.col("source").n_unique().alias("source_count")) + .filter(pl.col("source_count") == 3) + .select(["c_last_name", "c_first_name", "d_date"]) + ) + # Count the final result + return ( + intersect_final + # Cast -> Int64 to match DuckDB + .select([pl.len().cast(pl.Int64).alias("count_star()")]).limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q39.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q39.py new file mode 100644 index 00000000000..df307b9a1d1 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q39.py @@ -0,0 +1,176 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 39.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 39.""" + return """ + WITH inv AS + (SELECT w_warehouse_name, + w_warehouse_sk, + i_item_sk, + d_moy, + stdev, + mean, + CASE mean + WHEN 0 THEN NULL + ELSE stdev/mean + END cov + FROM + (SELECT w_warehouse_name, + w_warehouse_sk, + i_item_sk, + d_moy, + stddev_samp(inv_quantity_on_hand)*1.000 stdev, + avg(inv_quantity_on_hand) mean + FROM inventory, + item, + warehouse, + date_dim + WHERE inv_item_sk = i_item_sk + AND inv_warehouse_sk = w_warehouse_sk + AND inv_date_sk = d_date_sk + AND d_year =2001 + GROUP BY w_warehouse_name, + w_warehouse_sk, + i_item_sk, + d_moy) foo + WHERE CASE mean + WHEN 0 THEN 0 + ELSE stdev/mean + END > 1) + SELECT inv1.w_warehouse_sk wsk1, + inv1.i_item_sk isk1, + inv1.d_moy dmoy1, + inv1.mean mean1, + inv1.cov cov1, + inv2.w_warehouse_sk, + inv2.i_item_sk, + inv2.d_moy, + inv2.mean, + inv2.cov + FROM inv inv1, + inv inv2 + WHERE inv1.i_item_sk = inv2.i_item_sk + AND inv1.w_warehouse_sk = inv2.w_warehouse_sk + AND inv1.d_moy=1 + AND inv2.d_moy=1+1 + ORDER BY inv1.w_warehouse_sk NULLS FIRST, + inv1.i_item_sk NULLS FIRST, + inv1.d_moy NULLS FIRST, + inv1.mean NULLS FIRST, + inv1.cov NULLS FIRST, + inv2.d_moy NULLS FIRST, + inv2.mean NULLS FIRST, + inv2.cov NULLS FIRST; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 39.""" + inventory = get_data(run_config.dataset_path, "inventory", run_config.suffix) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + warehouse = get_data(run_config.dataset_path, "warehouse", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + + base_agg = ( + inventory.join(item, left_on="inv_item_sk", right_on="i_item_sk") + .join(warehouse, left_on="inv_warehouse_sk", right_on="w_warehouse_sk") + .join(date_dim, left_on="inv_date_sk", right_on="d_date_sk") + .filter(pl.col("d_year") == 2001) + .group_by(["w_warehouse_name", "inv_warehouse_sk", "inv_item_sk", "d_moy"]) + .agg( + [ + pl.col("inv_quantity_on_hand").std().alias("stdev"), + pl.col("inv_quantity_on_hand").mean().alias("mean"), + ] + ) + ) + + inv_cte = base_agg.with_columns( + pl.when(pl.col("mean") == 0) + .then(None) + .otherwise(pl.col("stdev") / pl.col("mean")) + .alias("cov") + ).filter( + pl.when(pl.col("mean") == 0) + .then(False) # noqa: FBT003 + .otherwise(pl.col("stdev") / pl.col("mean") > 1.0) + ) + + inv1 = inv_cte.filter(pl.col("d_moy") == 1).select( + [ + pl.col("inv_warehouse_sk").alias("w_warehouse_sk"), + pl.col("inv_item_sk").alias("i_item_sk"), + "d_moy", + "mean", + "cov", + ] + ) + + inv2 = ( + inv_cte.filter(pl.col("d_moy") == 2) + .select( + [ + "inv_warehouse_sk", + "inv_item_sk", + pl.col("d_moy").alias("d_moy_2"), + pl.col("mean").alias("mean_2"), + pl.col("cov").alias("cov_2"), + ] + ) + .with_columns( + [ + pl.col("inv_warehouse_sk").alias("w_warehouse_sk_2"), + pl.col("inv_item_sk").alias("i_item_sk_2"), + ] + ) + ) + + return ( + inv1.join( + inv2, + left_on=["w_warehouse_sk", "i_item_sk"], + right_on=["inv_warehouse_sk", "inv_item_sk"], + how="inner", + ) + .select( + [ + "w_warehouse_sk", + "i_item_sk", + "d_moy", + "mean", + "cov", + "w_warehouse_sk_2", + "i_item_sk_2", + "d_moy_2", + "mean_2", + "cov_2", + ] + ) + .sort( + [ + "w_warehouse_sk", + "i_item_sk", + "d_moy", + "mean", + "cov", + "d_moy_2", + "mean_2", + "cov_2", + ] + ) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q40.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q40.py new file mode 100644 index 00000000000..47d3c231619 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q40.py @@ -0,0 +1,113 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 40.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 40.""" + return """ + SELECT + w_state , + i_item_id , + Sum( + CASE + WHEN ( + Cast(d_date AS DATE) < Cast ('2002-06-01' AS DATE)) THEN cs_sales_price - COALESCE(cr_refunded_cash,0) + ELSE 0 + END) AS sales_before , + Sum( + CASE + WHEN ( + Cast(d_date AS DATE) >= Cast ('2002-06-01' AS DATE)) THEN cs_sales_price - COALESCE(cr_refunded_cash,0) + ELSE 0 + END) AS sales_after + FROM catalog_sales + LEFT OUTER JOIN catalog_returns + ON ( + cs_order_number = cr_order_number + AND cs_item_sk = cr_item_sk) , + warehouse , + item , + date_dim + WHERE i_current_price BETWEEN 0.99 AND 1.49 + AND i_item_sk = cs_item_sk + AND cs_warehouse_sk = w_warehouse_sk + AND cs_sold_date_sk = d_date_sk + AND d_date BETWEEN (Cast ('2002-06-01' AS DATE) - INTERVAL '30' day) AND ( + cast ('2002-06-01' AS date) + INTERVAL '30' day) + GROUP BY w_state, + i_item_id + ORDER BY w_state, + i_item_id + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 40.""" + # Load tables + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + catalog_returns = get_data( + run_config.dataset_path, "catalog_returns", run_config.suffix + ) + warehouse = get_data(run_config.dataset_path, "warehouse", run_config.suffix) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + # Define the target date and date range + target_date = pl.date(2002, 6, 1) + start_date = pl.date(2002, 5, 2) # 2002-06-01 - 30 days + end_date = pl.date(2002, 7, 1) # 2002-06-01 + 30 days + return ( + catalog_sales.join( + catalog_returns, + left_on=["cs_order_number", "cs_item_sk"], + right_on=["cr_order_number", "cr_item_sk"], + how="left", + ) # LEFT OUTER JOIN + .join(warehouse, left_on="cs_warehouse_sk", right_on="w_warehouse_sk") + .join(item, left_on="cs_item_sk", right_on="i_item_sk") + .join(date_dim, left_on="cs_sold_date_sk", right_on="d_date_sk") + .filter( + (pl.col("i_current_price").is_between(0.99, 1.49)) + & (pl.col("d_date").is_between(start_date, end_date)) + ) + .with_columns( + [ + pl.when(pl.col("d_date") < target_date) + .then( + pl.col("cs_sales_price") - pl.col("cr_refunded_cash").fill_null(0) + ) + .otherwise(0) + .alias("sales_before_amount"), + pl.when(pl.col("d_date") >= target_date) + .then( + pl.col("cs_sales_price") - pl.col("cr_refunded_cash").fill_null(0) + ) + .otherwise(0) + .alias("sales_after_amount"), + ] + ) + .group_by(["w_state", "i_item_id"]) + .agg( + [ + pl.col("sales_before_amount").sum().alias("sales_before"), + pl.col("sales_after_amount").sum().alias("sales_after"), + ] + ) + .sort(["w_state", "i_item_id"]) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q41.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q41.py new file mode 100644 index 00000000000..3464324c09c --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q41.py @@ -0,0 +1,138 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 41.""" + +from __future__ import annotations + +import operator as op +from functools import reduce +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + +Q_NUM = 41 + +MANUFACTURER_ID_START = 765 + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 41.""" + return f""" + SELECT Distinct(i_product_name) + FROM item i1 + WHERE i_manufact_id BETWEEN {MANUFACTURER_ID_START} AND {MANUFACTURER_ID_START} + 40 + AND (SELECT Count(*) AS item_cnt + FROM item + WHERE ( i_manufact = i1.i_manufact + AND ( ( i_category = 'Women' + AND ( i_color = 'dim' + OR i_color = 'green' ) + AND ( i_units = 'Gross' + OR i_units = 'Dozen' ) + AND ( i_size = 'economy' + OR i_size = 'petite' ) ) + OR ( i_category = 'Women' + AND ( i_color = 'navajo' + OR i_color = 'aquamarine' ) + AND ( i_units = 'Case' + OR i_units = 'Unknown' ) + AND ( i_size = 'large' + OR i_size = 'N/A' ) ) + OR ( i_category = 'Men' + AND ( i_color = 'indian' + OR i_color = 'dark' ) + AND ( i_units = 'Oz' + OR i_units = 'Lb' ) + AND ( i_size = 'extra large' + OR i_size = 'small' ) ) + OR ( i_category = 'Men' + AND ( i_color = 'peach' + OR i_color = 'purple' ) + AND ( i_units = 'Tbl' + OR i_units = 'Bunch' ) + AND ( i_size = 'economy' + OR i_size = 'petite' ) ) ) ) + OR ( i_manufact = i1.i_manufact + AND ( ( i_category = 'Women' + AND ( i_color = 'orchid' + OR i_color = 'peru' ) + AND ( i_units = 'Carton' + OR i_units = 'Cup' ) + AND ( i_size = 'economy' + OR i_size = 'petite' ) ) + OR ( i_category = 'Women' + AND ( i_color = 'violet' + OR i_color = 'papaya' ) + AND ( i_units = 'Ounce' + OR i_units = 'Box' ) + AND ( i_size = 'large' + OR i_size = 'N/A' ) ) + OR ( i_category = 'Men' + AND ( i_color = 'drab' + OR i_color = 'grey' ) + AND ( i_units = 'Each' + OR i_units = 'N/A' ) + AND ( i_size = 'extra large' + OR i_size = 'small' ) ) + OR ( i_category = 'Men' + AND ( i_color = 'chocolate' + OR i_color = 'antique' ) + AND ( i_units = 'Dram' + OR i_units = 'Gram' ) + AND ( i_size = 'economy' + OR i_size = 'petite' ) ) ) )) > 0 + ORDER BY i_product_name + LIMIT 100; + + """ + + +rules: list[tuple[str, list[str], list[str], list[str]]] = [ + ("Women", ["dim", "green"], ["Gross", "Dozen"], ["economy", "petite"]), + ("Women", ["navajo", "aquamarine"], ["Case", "Unknown"], ["large", "N/A"]), + ("Men", ["indian", "dark"], ["Oz", "Lb"], ["extra large", "small"]), + ("Men", ["peach", "purple"], ["Tbl", "Bunch"], ["economy", "petite"]), + ("Women", ["orchid", "peru"], ["Carton", "Cup"], ["economy", "petite"]), + ("Women", ["violet", "papaya"], ["Ounce", "Box"], ["large", "N/A"]), + ("Men", ["drab", "grey"], ["Each", "N/A"], ["extra large", "small"]), + ("Men", ["chocolate", "antique"], ["Dram", "Gram"], ["economy", "petite"]), +] + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 41.""" + item = get_data(run_config.dataset_path, "item", run_config.suffix) + + rule_exprs = [ + ( + (pl.col("i_category") == cat) + & pl.col("i_color").is_in(colors) + & pl.col("i_units").is_in(units) + & pl.col("i_size").is_in(sizes) + ) + for (cat, colors, units, sizes) in rules + ] + subquery_conditions = reduce(op.or_, rule_exprs) + + manufacturers_with_criteria = ( + item.filter(subquery_conditions).select("i_manufact").unique() + ) + + return ( + item.filter( + pl.col("i_manufact_id").is_between( + MANUFACTURER_ID_START, MANUFACTURER_ID_START + 40 + ) + ) + .join(manufacturers_with_criteria, on="i_manufact", how="inner") + .select("i_product_name") + .unique() + .sort("i_product_name") + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q42.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q42.py new file mode 100644 index 00000000000..5e66352c49f --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q42.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 42.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 42.""" + return """ + SELECT dt.d_year, + item.i_category_id, + item.i_category, + Sum(ss_ext_sales_price) + FROM date_dim dt, + store_sales, + item + WHERE dt.d_date_sk = store_sales.ss_sold_date_sk + AND store_sales.ss_item_sk = item.i_item_sk + AND item.i_manager_id = 1 + AND dt.d_moy = 12 + AND dt.d_year = 2000 + GROUP BY dt.d_year, + item.i_category_id, + item.i_category + ORDER BY Sum(ss_ext_sales_price) DESC, + dt.d_year, + item.i_category_id, + item.i_category + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 42.""" + # Load tables + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + return ( + store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(item, left_on="ss_item_sk", right_on="i_item_sk") + .filter( + (pl.col("i_manager_id") == 1) + & (pl.col("d_moy") == 12) + & (pl.col("d_year") == 2000) + ) + .group_by(["d_year", "i_category_id", "i_category"]) + .agg([pl.col("ss_ext_sales_price").sum().alias("sum(ss_ext_sales_price)")]) + .sort( + ["sum(ss_ext_sales_price)", "d_year", "i_category_id", "i_category"], + descending=[True, False, False, False], + ) + .select(["d_year", "i_category_id", "i_category", "sum(ss_ext_sales_price)"]) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q43.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q43.py new file mode 100644 index 00000000000..f6a9b717031 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q43.py @@ -0,0 +1,156 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 43.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 43.""" + return """ + SELECT s_store_name, + s_store_id, + Sum(CASE + WHEN ( d_day_name = 'Sunday' ) THEN ss_sales_price + ELSE NULL + END) sun_sales, + Sum(CASE + WHEN ( d_day_name = 'Monday' ) THEN ss_sales_price + ELSE NULL + END) mon_sales, + Sum(CASE + WHEN ( d_day_name = 'Tuesday' ) THEN ss_sales_price + ELSE NULL + END) tue_sales, + Sum(CASE + WHEN ( d_day_name = 'Wednesday' ) THEN ss_sales_price + ELSE NULL + END) wed_sales, + Sum(CASE + WHEN ( d_day_name = 'Thursday' ) THEN ss_sales_price + ELSE NULL + END) thu_sales, + Sum(CASE + WHEN ( d_day_name = 'Friday' ) THEN ss_sales_price + ELSE NULL + END) fri_sales, + Sum(CASE + WHEN ( d_day_name = 'Saturday' ) THEN ss_sales_price + ELSE NULL + END) sat_sales + FROM date_dim, + store_sales, + store + WHERE d_date_sk = ss_sold_date_sk + AND s_store_sk = ss_store_sk + AND s_gmt_offset = -5 + AND d_year = 2002 + GROUP BY s_store_name, + s_store_id + ORDER BY s_store_name, + s_store_id, + sun_sales, + mon_sales, + tue_sales, + wed_sales, + thu_sales, + fri_sales, + sat_sales + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 43.""" + # Load tables + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + store = get_data(run_config.dataset_path, "store", run_config.suffix) + # Main query with joins and conditional aggregations + return ( + store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(store, left_on="ss_store_sk", right_on="s_store_sk") + .filter((pl.col("s_gmt_offset") == -5) & (pl.col("d_year") == 2002)) + .with_columns( + [ + # Pre-compute conditional sales amounts for each day + pl.when(pl.col("d_day_name") == "Sunday") + .then(pl.col("ss_sales_price")) + .otherwise(0) + .alias("sun_sales_amount"), + pl.when(pl.col("d_day_name") == "Monday") + .then(pl.col("ss_sales_price")) + .otherwise(0) + .alias("mon_sales_amount"), + pl.when(pl.col("d_day_name") == "Tuesday") + .then(pl.col("ss_sales_price")) + .otherwise(0) + .alias("tue_sales_amount"), + pl.when(pl.col("d_day_name") == "Wednesday") + .then(pl.col("ss_sales_price")) + .otherwise(0) + .alias("wed_sales_amount"), + pl.when(pl.col("d_day_name") == "Thursday") + .then(pl.col("ss_sales_price")) + .otherwise(0) + .alias("thu_sales_amount"), + pl.when(pl.col("d_day_name") == "Friday") + .then(pl.col("ss_sales_price")) + .otherwise(0) + .alias("fri_sales_amount"), + pl.when(pl.col("d_day_name") == "Saturday") + .then(pl.col("ss_sales_price")) + .otherwise(0) + .alias("sat_sales_amount"), + ] + ) + .group_by(["s_store_name", "s_store_id"]) + .agg( + [ + pl.col("sun_sales_amount").sum().alias("sun_sales"), + pl.col("mon_sales_amount").sum().alias("mon_sales"), + pl.col("tue_sales_amount").sum().alias("tue_sales"), + pl.col("wed_sales_amount").sum().alias("wed_sales"), + pl.col("thu_sales_amount").sum().alias("thu_sales"), + pl.col("fri_sales_amount").sum().alias("fri_sales"), + pl.col("sat_sales_amount").sum().alias("sat_sales"), + ] + ) + .select( + [ + "s_store_name", + "s_store_id", + "sun_sales", + "mon_sales", + "tue_sales", + "wed_sales", + "thu_sales", + "fri_sales", + "sat_sales", + ] + ) + .sort( + [ + "s_store_name", + "s_store_id", + "sun_sales", + "mon_sales", + "tue_sales", + "wed_sales", + "thu_sales", + "fri_sales", + "sat_sales", + ] + ) + .limit(100) + )