@@ -0,0 +1,68 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 37."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
    from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """Query 37."""
    return """
    SELECT
        i_item_id,
        i_item_desc,
        i_current_price
    FROM item,
         inventory,
         date_dim,
         catalog_sales
    WHERE i_current_price BETWEEN 20 AND 20 + 30
      AND inv_item_sk = i_item_sk
      AND d_date_sk = inv_date_sk
      AND d_date BETWEEN Cast('1999-03-06' AS DATE) AND (
          Cast('1999-03-06' AS DATE) + INTERVAL '60' day)
      AND i_manufact_id IN (843, 815, 850, 840)
      AND inv_quantity_on_hand BETWEEN 100 AND 500
      AND cs_item_sk = i_item_sk
    GROUP BY i_item_id,
             i_item_desc,
             i_current_price
    ORDER BY i_item_id
    LIMIT 100;
    """


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 37."""
    # Load tables
    item = get_data(run_config.dataset_path, "item", run_config.suffix)
    inventory = get_data(run_config.dataset_path, "inventory", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
    catalog_sales = get_data(
        run_config.dataset_path, "catalog_sales", run_config.suffix
    )
    return (
        item.join(inventory, left_on="i_item_sk", right_on="inv_item_sk")
        .join(date_dim, left_on="inv_date_sk", right_on="d_date_sk")
        .join(catalog_sales, left_on="i_item_sk", right_on="cs_item_sk")
        .filter(
            (pl.col("i_current_price").is_between(20, 50))
            & (pl.col("i_manufact_id").is_in([843, 815, 850, 840]))
            & (pl.col("inv_quantity_on_hand").is_between(100, 500))
            & (pl.col("d_date").is_between(pl.date(1999, 3, 6), pl.date(1999, 5, 5)))
        )
        .group_by(["i_item_id", "i_item_desc", "i_current_price"])
        .agg([])
        .sort(["i_item_id"])
        .limit(100)
    )
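A quick standalone check of the hard-coded upper bound in the Polars filter above: the SQL window is Cast('1999-03-06' AS DATE) + INTERVAL '60' day, which is what the pl.date(1999, 5, 5) literal encodes. This is a minimal sketch, not part of the query module; the variable names are illustrative only.

from datetime import date, timedelta

# 1999-03-06 plus 60 days lands on 1999-05-05, the literal used in is_between().
window_start = date(1999, 3, 6)
window_end = window_start + timedelta(days=60)
assert window_end == date(1999, 5, 5)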
@@ -0,0 +1,109 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 38."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
    from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """Query 38."""
    return """
    SELECT Count(*)
    FROM (SELECT DISTINCT c_last_name,
                          c_first_name,
                          d_date
          FROM store_sales,
               date_dim,
               customer
          WHERE store_sales.ss_sold_date_sk = date_dim.d_date_sk
            AND store_sales.ss_customer_sk = customer.c_customer_sk
            AND d_month_seq BETWEEN 1188 AND 1188 + 11
          INTERSECT
          SELECT DISTINCT c_last_name,
                          c_first_name,
                          d_date
          FROM catalog_sales,
               date_dim,
               customer
          WHERE catalog_sales.cs_sold_date_sk = date_dim.d_date_sk
            AND catalog_sales.cs_bill_customer_sk = customer.c_customer_sk
            AND d_month_seq BETWEEN 1188 AND 1188 + 11
          INTERSECT
          SELECT DISTINCT c_last_name,
                          c_first_name,
                          d_date
          FROM web_sales,
               date_dim,
               customer
          WHERE web_sales.ws_sold_date_sk = date_dim.d_date_sk
            AND web_sales.ws_bill_customer_sk = customer.c_customer_sk
            AND d_month_seq BETWEEN 1188 AND 1188 + 11) hot_cust
    LIMIT 100;
    """


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 38."""
    # Load tables
    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
    catalog_sales = get_data(
        run_config.dataset_path, "catalog_sales", run_config.suffix
    )
    web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
    customer = get_data(run_config.dataset_path, "customer", run_config.suffix)
    # Filter date_dim for the specified month sequence range
    date_filter = date_dim.filter(pl.col("d_month_seq").is_between(1188, 1188 + 11))
    # Store sales customers with names and dates
    store_customers = (
        store_sales.join(date_filter, left_on="ss_sold_date_sk", right_on="d_date_sk")
        .join(customer, left_on="ss_customer_sk", right_on="c_customer_sk")
        .select(["c_last_name", "c_first_name", "d_date"])
        .unique()
    )
    # Catalog sales customers with names and dates
    catalog_customers = (
        catalog_sales.join(date_filter, left_on="cs_sold_date_sk", right_on="d_date_sk")
        .join(customer, left_on="cs_bill_customer_sk", right_on="c_customer_sk")
        .select(["c_last_name", "c_first_name", "d_date"])
        .unique()
    )
    # Web sales customers with names and dates
    web_customers = (
        web_sales.join(date_filter, left_on="ws_sold_date_sk", right_on="d_date_sk")
        .join(customer, left_on="ws_bill_customer_sk", right_on="c_customer_sk")
        .select(["c_last_name", "c_first_name", "d_date"])
        .unique()
    )
    # Emulate the three-way INTERSECT: tag each (already de-duplicated) channel
    # with a source label, then keep tuples that appear in all three sources.
    all_customers = pl.concat(
        [
            store_customers.with_columns(pl.lit("store").alias("source")),
            catalog_customers.with_columns(pl.lit("catalog").alias("source")),
            web_customers.with_columns(pl.lit("web").alias("source")),
        ]
    )
    # Find combinations that appear in all three sources
    intersect_final = (
        all_customers.group_by(["c_last_name", "c_first_name", "d_date"])
        .agg(pl.col("source").n_unique().alias("source_count"))
        .filter(pl.col("source_count") == 3)
        .select(["c_last_name", "c_first_name", "d_date"])
    )
    # Count the final result
    return (
        intersect_final
        # Cast -> Int64 to match DuckDB
        .select([pl.len().cast(pl.Int64).alias("count_star()")]).limit(100)
    )
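For reference, the three-way INTERSECT could also be phrased as chained semi-joins over the de-duplicated per-channel frames. A minimal sketch, assuming the store_customers, catalog_customers, and web_customers LazyFrames defined in polars_impl above; the committed version keeps the concat-plus-group_by form.

# Keep only (c_last_name, c_first_name, d_date) tuples present in all three channels.
intersect_alt = (
    store_customers.join(
        catalog_customers,
        on=["c_last_name", "c_first_name", "d_date"],
        how="semi",
    ).join(
        web_customers,
        on=["c_last_name", "c_first_name", "d_date"],
        how="semi",
    )
)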
@@ -0,0 +1,176 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 39."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
    from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """Query 39."""
    return """
    WITH inv AS
      (SELECT w_warehouse_name,
              w_warehouse_sk,
              i_item_sk,
              d_moy,
              stdev,
              mean,
              CASE mean
                WHEN 0 THEN NULL
                ELSE stdev / mean
              END cov
       FROM (SELECT w_warehouse_name,
                    w_warehouse_sk,
                    i_item_sk,
                    d_moy,
                    stddev_samp(inv_quantity_on_hand) * 1.000 stdev,
                    avg(inv_quantity_on_hand) mean
             FROM inventory,
                  item,
                  warehouse,
                  date_dim
             WHERE inv_item_sk = i_item_sk
               AND inv_warehouse_sk = w_warehouse_sk
               AND inv_date_sk = d_date_sk
               AND d_year = 2001
             GROUP BY w_warehouse_name,
                      w_warehouse_sk,
                      i_item_sk,
                      d_moy) foo
       WHERE CASE mean
               WHEN 0 THEN 0
               ELSE stdev / mean
             END > 1)
    SELECT inv1.w_warehouse_sk wsk1,
           inv1.i_item_sk isk1,
           inv1.d_moy dmoy1,
           inv1.mean mean1,
           inv1.cov cov1,
           inv2.w_warehouse_sk,
           inv2.i_item_sk,
           inv2.d_moy,
           inv2.mean,
           inv2.cov
    FROM inv inv1,
         inv inv2
    WHERE inv1.i_item_sk = inv2.i_item_sk
      AND inv1.w_warehouse_sk = inv2.w_warehouse_sk
      AND inv1.d_moy = 1
      AND inv2.d_moy = 1 + 1
    ORDER BY inv1.w_warehouse_sk NULLS FIRST,
             inv1.i_item_sk NULLS FIRST,
             inv1.d_moy NULLS FIRST,
             inv1.mean NULLS FIRST,
             inv1.cov NULLS FIRST,
             inv2.d_moy NULLS FIRST,
             inv2.mean NULLS FIRST,
             inv2.cov NULLS FIRST;
    """


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 39."""
    inventory = get_data(run_config.dataset_path, "inventory", run_config.suffix)
    item = get_data(run_config.dataset_path, "item", run_config.suffix)
    warehouse = get_data(run_config.dataset_path, "warehouse", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)

    base_agg = (
        inventory.join(item, left_on="inv_item_sk", right_on="i_item_sk")
        .join(warehouse, left_on="inv_warehouse_sk", right_on="w_warehouse_sk")
        .join(date_dim, left_on="inv_date_sk", right_on="d_date_sk")
        .filter(pl.col("d_year") == 2001)
        .group_by(["w_warehouse_name", "inv_warehouse_sk", "inv_item_sk", "d_moy"])
        .agg(
            [
                pl.col("inv_quantity_on_hand").std().alias("stdev"),
                pl.col("inv_quantity_on_hand").mean().alias("mean"),
            ]
        )
    )

    inv_cte = base_agg.with_columns(
        pl.when(pl.col("mean") == 0)
        .then(None)
        .otherwise(pl.col("stdev") / pl.col("mean"))
        .alias("cov")
    ).filter(
        pl.when(pl.col("mean") == 0)
        .then(False)  # noqa: FBT003
        .otherwise(pl.col("stdev") / pl.col("mean") > 1.0)
    )

    inv1 = inv_cte.filter(pl.col("d_moy") == 1).select(
        [
            pl.col("inv_warehouse_sk").alias("w_warehouse_sk"),
            pl.col("inv_item_sk").alias("i_item_sk"),
            "d_moy",
            "mean",
            "cov",
        ]
    )

    inv2 = (
        inv_cte.filter(pl.col("d_moy") == 2)
        .select(
            [
                "inv_warehouse_sk",
                "inv_item_sk",
                pl.col("d_moy").alias("d_moy_2"),
                pl.col("mean").alias("mean_2"),
                pl.col("cov").alias("cov_2"),
            ]
        )
        .with_columns(
            [
                pl.col("inv_warehouse_sk").alias("w_warehouse_sk_2"),
                pl.col("inv_item_sk").alias("i_item_sk_2"),
            ]
        )
    )

    return (
        inv1.join(
            inv2,
            left_on=["w_warehouse_sk", "i_item_sk"],
            right_on=["inv_warehouse_sk", "inv_item_sk"],
            how="inner",
        )
        .select(
            [
                "w_warehouse_sk",
                "i_item_sk",
                "d_moy",
                "mean",
                "cov",
                "w_warehouse_sk_2",
                "i_item_sk_2",
                "d_moy_2",
                "mean_2",
                "cov_2",
            ]
        )
        .sort(
            [
                "w_warehouse_sk",
                "i_item_sk",
                "d_moy",
                "mean",
                "cov",
                "d_moy_2",
                "mean_2",
                "cov_2",
            ]
        )
    )
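One standalone note on the aggregation above: Polars std() defaults to ddof=1 (the sample standard deviation), which lines up with the stddev_samp call in the DuckDB query. A minimal illustration, not part of the query module.

import polars as pl

s = pl.Series("inv_quantity_on_hand", [100, 200, 300, 400])
# The default ddof=1 is the sample standard deviation, i.e. SQL stddev_samp.
assert s.std() == s.std(ddof=1)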