@@ -0,0 +1,68 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 37."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
    from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """Query 37."""
    return """
    SELECT
        i_item_id,
        i_item_desc,
        i_current_price
    FROM item,
         inventory,
         date_dim,
         catalog_sales
    WHERE i_current_price BETWEEN 20 AND 20 + 30
      AND inv_item_sk = i_item_sk
      AND d_date_sk = inv_date_sk
      AND d_date BETWEEN Cast('1999-03-06' AS DATE) AND (
          Cast('1999-03-06' AS DATE) + INTERVAL '60' day)
      AND i_manufact_id IN (843, 815, 850, 840)
      AND inv_quantity_on_hand BETWEEN 100 AND 500
      AND cs_item_sk = i_item_sk
    GROUP BY i_item_id,
             i_item_desc,
             i_current_price
    ORDER BY i_item_id
    LIMIT 100;
    """


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 37."""
    # Load tables
    item = get_data(run_config.dataset_path, "item", run_config.suffix)
    inventory = get_data(run_config.dataset_path, "inventory", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
    catalog_sales = get_data(
        run_config.dataset_path, "catalog_sales", run_config.suffix
    )
    return (
        item.join(inventory, left_on="i_item_sk", right_on="inv_item_sk")
        .join(date_dim, left_on="inv_date_sk", right_on="d_date_sk")
        .join(catalog_sales, left_on="i_item_sk", right_on="cs_item_sk")
        .filter(
            (pl.col("i_current_price").is_between(20, 50))
            & (pl.col("i_manufact_id").is_in([843, 815, 850, 840]))
            & (pl.col("inv_quantity_on_hand").is_between(100, 500))
            & (pl.col("d_date").is_between(pl.date(1999, 3, 6), pl.date(1999, 5, 5)))
        )
        .group_by(["i_item_id", "i_item_desc", "i_current_price"])
        .agg([])
        .sort(["i_item_id"])
        .limit(100)
    )
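A quick standalone check of the hard-coded upper bound in the Polars filter above: the SQL window is Cast('1999-03-06' AS DATE) + INTERVAL '60' day, which is what the pl.date(1999, 5, 5) literal encodes. This is a minimal sketch, not part of the query module; the variable names are illustrative only.

from datetime import date, timedelta

# 1999-03-06 plus 60 days lands on 1999-05-05, the literal used in is_between().
window_start = date(1999, 3, 6)
window_end = window_start + timedelta(days=60)
assert window_end == date(1999, 5, 5)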
@@ -0,0 +1,109 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 38."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
    from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """Query 38."""
    return """
    SELECT Count(*)
    FROM (SELECT DISTINCT c_last_name,
                          c_first_name,
                          d_date
          FROM store_sales,
               date_dim,
               customer
          WHERE store_sales.ss_sold_date_sk = date_dim.d_date_sk
            AND store_sales.ss_customer_sk = customer.c_customer_sk
            AND d_month_seq BETWEEN 1188 AND 1188 + 11
          INTERSECT
          SELECT DISTINCT c_last_name,
                          c_first_name,
                          d_date
          FROM catalog_sales,
               date_dim,
               customer
          WHERE catalog_sales.cs_sold_date_sk = date_dim.d_date_sk
            AND catalog_sales.cs_bill_customer_sk = customer.c_customer_sk
            AND d_month_seq BETWEEN 1188 AND 1188 + 11
          INTERSECT
          SELECT DISTINCT c_last_name,
                          c_first_name,
                          d_date
          FROM web_sales,
               date_dim,
               customer
          WHERE web_sales.ws_sold_date_sk = date_dim.d_date_sk
            AND web_sales.ws_bill_customer_sk = customer.c_customer_sk
            AND d_month_seq BETWEEN 1188 AND 1188 + 11) hot_cust
    LIMIT 100;
    """


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 38."""
    # Load tables
    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
    catalog_sales = get_data(
        run_config.dataset_path, "catalog_sales", run_config.suffix
    )
    web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
    customer = get_data(run_config.dataset_path, "customer", run_config.suffix)
    # Filter date_dim for the specified month sequence range
    date_filter = date_dim.filter(pl.col("d_month_seq").is_between(1188, 1188 + 11))
    # Store sales customers with names and dates
    store_customers = (
        store_sales.join(date_filter, left_on="ss_sold_date_sk", right_on="d_date_sk")
        .join(customer, left_on="ss_customer_sk", right_on="c_customer_sk")
        .select(["c_last_name", "c_first_name", "d_date"])
        .unique()
    )
    # Catalog sales customers with names and dates
    catalog_customers = (
        catalog_sales.join(date_filter, left_on="cs_sold_date_sk", right_on="d_date_sk")
        .join(customer, left_on="cs_bill_customer_sk", right_on="c_customer_sk")
        .select(["c_last_name", "c_first_name", "d_date"])
        .unique()
    )
    # Web sales customers with names and dates
    web_customers = (
        web_sales.join(date_filter, left_on="ws_sold_date_sk", right_on="d_date_sk")
        .join(customer, left_on="ws_bill_customer_sk", right_on="c_customer_sk")
        .select(["c_last_name", "c_first_name", "d_date"])
        .unique()
    )
    # Emulate the three-way INTERSECT: tag each (already de-duplicated) channel
    # with a source label, then keep tuples that appear in all three sources.
    all_customers = pl.concat(
        [
            store_customers.with_columns(pl.lit("store").alias("source")),
            catalog_customers.with_columns(pl.lit("catalog").alias("source")),
            web_customers.with_columns(pl.lit("web").alias("source")),
        ]
    )
    # Find combinations that appear in all three sources
    intersect_final = (
        all_customers.group_by(["c_last_name", "c_first_name", "d_date"])
        .agg(pl.col("source").n_unique().alias("source_count"))
        .filter(pl.col("source_count") == 3)
        .select(["c_last_name", "c_first_name", "d_date"])
    )
    # Count the final result
    return (
        intersect_final
        # Cast -> Int64 to match DuckDB
        .select([pl.len().cast(pl.Int64).alias("count_star()")]).limit(100)
    )
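For reference, the three-way INTERSECT could also be phrased as chained semi-joins over the de-duplicated per-channel frames. A minimal sketch, assuming the store_customers, catalog_customers, and web_customers LazyFrames defined in polars_impl above; the committed version keeps the concat-plus-group_by form.

# Keep only (c_last_name, c_first_name, d_date) tuples present in all three channels.
intersect_alt = (
    store_customers.join(
        catalog_customers,
        on=["c_last_name", "c_first_name", "d_date"],
        how="semi",
    ).join(
        web_customers,
        on=["c_last_name", "c_first_name", "d_date"],
        how="semi",
    )
)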
@@ -0,0 +1,176 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 39."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
    from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """Query 39."""
    return """
    WITH inv AS
      (SELECT w_warehouse_name,
              w_warehouse_sk,
              i_item_sk,
              d_moy,
              stdev,
              mean,
              CASE mean
                WHEN 0 THEN NULL
                ELSE stdev / mean
              END cov
       FROM (SELECT w_warehouse_name,
                    w_warehouse_sk,
                    i_item_sk,
                    d_moy,
                    stddev_samp(inv_quantity_on_hand) * 1.000 stdev,
                    avg(inv_quantity_on_hand) mean
             FROM inventory,
                  item,
                  warehouse,
                  date_dim
             WHERE inv_item_sk = i_item_sk
               AND inv_warehouse_sk = w_warehouse_sk
               AND inv_date_sk = d_date_sk
               AND d_year = 2001
             GROUP BY w_warehouse_name,
                      w_warehouse_sk,
                      i_item_sk,
                      d_moy) foo
       WHERE CASE mean
               WHEN 0 THEN 0
               ELSE stdev / mean
             END > 1)
    SELECT inv1.w_warehouse_sk wsk1,
           inv1.i_item_sk isk1,
           inv1.d_moy dmoy1,
           inv1.mean mean1,
           inv1.cov cov1,
           inv2.w_warehouse_sk,
           inv2.i_item_sk,
           inv2.d_moy,
           inv2.mean,
           inv2.cov
    FROM inv inv1,
         inv inv2
    WHERE inv1.i_item_sk = inv2.i_item_sk
      AND inv1.w_warehouse_sk = inv2.w_warehouse_sk
      AND inv1.d_moy = 1
      AND inv2.d_moy = 1 + 1
    ORDER BY inv1.w_warehouse_sk NULLS FIRST,
             inv1.i_item_sk NULLS FIRST,
             inv1.d_moy NULLS FIRST,
             inv1.mean NULLS FIRST,
             inv1.cov NULLS FIRST,
             inv2.d_moy NULLS FIRST,
             inv2.mean NULLS FIRST,
             inv2.cov NULLS FIRST;
    """


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 39."""
    inventory = get_data(run_config.dataset_path, "inventory", run_config.suffix)
    item = get_data(run_config.dataset_path, "item", run_config.suffix)
    warehouse = get_data(run_config.dataset_path, "warehouse", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)

    base_agg = (
        inventory.join(item, left_on="inv_item_sk", right_on="i_item_sk")
        .join(warehouse, left_on="inv_warehouse_sk", right_on="w_warehouse_sk")
        .join(date_dim, left_on="inv_date_sk", right_on="d_date_sk")
        .filter(pl.col("d_year") == 2001)
        .group_by(["w_warehouse_name", "inv_warehouse_sk", "inv_item_sk", "d_moy"])
        .agg(
            [
                pl.col("inv_quantity_on_hand").std().alias("stdev"),
                pl.col("inv_quantity_on_hand").mean().alias("mean"),
            ]
        )
    )

    inv_cte = base_agg.with_columns(
        pl.when(pl.col("mean") == 0)
        .then(None)
        .otherwise(pl.col("stdev") / pl.col("mean"))
        .alias("cov")
    ).filter(
        pl.when(pl.col("mean") == 0)
        .then(False)  # noqa: FBT003
        .otherwise(pl.col("stdev") / pl.col("mean") > 1.0)
    )

    inv1 = inv_cte.filter(pl.col("d_moy") == 1).select(
        [
            pl.col("inv_warehouse_sk").alias("w_warehouse_sk"),
            pl.col("inv_item_sk").alias("i_item_sk"),
            "d_moy",
            "mean",
            "cov",
        ]
    )

    inv2 = (
        inv_cte.filter(pl.col("d_moy") == 2)
        .select(
            [
                "inv_warehouse_sk",
                "inv_item_sk",
                pl.col("d_moy").alias("d_moy_2"),
                pl.col("mean").alias("mean_2"),
                pl.col("cov").alias("cov_2"),
            ]
        )
        .with_columns(
            [
                pl.col("inv_warehouse_sk").alias("w_warehouse_sk_2"),
                pl.col("inv_item_sk").alias("i_item_sk_2"),
            ]
        )
    )

    return (
        inv1.join(
            inv2,
            left_on=["w_warehouse_sk", "i_item_sk"],
            right_on=["inv_warehouse_sk", "inv_item_sk"],
            how="inner",
        )
        .select(
            [
                "w_warehouse_sk",
                "i_item_sk",
                "d_moy",
                "mean",
                "cov",
                "w_warehouse_sk_2",
                "i_item_sk_2",
                "d_moy_2",
                "mean_2",
                "cov_2",
            ]
        )
        .sort(
            [
                "w_warehouse_sk",
                "i_item_sk",
                "d_moy",
                "mean",
                "cov",
                "d_moy_2",
                "mean_2",
                "cov_2",
            ]
        )
    )
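One standalone note on the aggregation above: Polars std() defaults to ddof=1 (the sample standard deviation), which lines up with the stddev_samp call in the DuckDB query. A minimal illustration, not part of the query module.

import polars as pl

s = pl.Series("inv_quantity_on_hand", [100, 200, 300, 400])
# The default ddof=1 is the sample standard deviation, i.e. SQL stddev_samp.
assert s.std() == s.std(ddof=1)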