Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ on:
"targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "--scale-factor 100"
},
{
"id": "fineweb",
"subcommand": "fineweb",
"name": "FineWeb",
"local_dir": "bench-vortex/data/fineweb",
"targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact",
"scale_factor": "--scale-factor 100"
},
]

jobs:
Expand Down
54 changes: 54 additions & 0 deletions bench-vortex/src/bin/query_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use std::path::PathBuf;
use bench_vortex::benchmark_driver::{DriverConfig, run_benchmark};
use bench_vortex::clickbench::{ClickBenchBenchmark, Flavor};
use bench_vortex::display::DisplayFormat;
use bench_vortex::fineweb::Fineweb;
use bench_vortex::statpopgen::StatPopGenBenchmark;
use bench_vortex::tpcds::TpcDsBenchmark;
use bench_vortex::tpch::tpch_benchmark::TpcHBenchmark;
Expand Down Expand Up @@ -37,6 +38,9 @@ enum Commands {
/// Run Statistical & Population Genetics queries
#[command(name = "statpopgen")]
StatPopGen(StatPopGenArgs),

#[command(name = "fineweb")]
Fineweb(FinewebArgs),
}

/// Common arguments shared across benchmarks
Expand Down Expand Up @@ -202,6 +206,24 @@ struct StatPopGenArgs {
scale_factor: u64,
}

/// Arguments for the `fineweb` benchmark subcommand.
#[derive(Parser, Debug)]
struct FinewebArgs {
    // Options shared with the other benchmark subcommands; flattened so they are
    // accepted directly alongside the flags declared here.
    #[command(flatten)]
    common: CommonArgs,

    /// Comma-separated list of targets to benchmark, e.g. `duckdb:parquet,datafusion:vortex`.
    // When the flag is omitted, every target listed in `default_values` below is run:
    // parquet, vortex and vortex-compact for both the duckdb and datafusion prefixes.
    #[arg(long, value_delimiter = ',', value_parser = value_parser!(Target),
        default_values = vec![
            "duckdb:parquet",
            "duckdb:vortex",
            "duckdb:vortex-compact",
            "datafusion:parquet",
            "datafusion:vortex",
            "datafusion:vortex-compact",
        ]
    )]
    targets: Vec<Target>,
}

fn validate_scale_factor(val: &str) -> Result<String, String> {
match val.parse::<f32>() {
Ok(n) if [0.01, 0.1, 1., 10., 100., 1000.].contains(&n) => {
Expand Down Expand Up @@ -230,6 +252,7 @@ fn main() -> anyhow::Result<()> {
Commands::TpcH(tpch_args) => run_tpch(tpch_args),
Commands::TpcDS(tpcds_args) => run_tpcds(tpcds_args),
Commands::StatPopGen(stat_pop_gen_args) => run_statpopgen(stat_pop_gen_args),
Commands::Fineweb(fineweb_args) => run_fineweb(fineweb_args),
}
}

Expand Down Expand Up @@ -367,3 +390,34 @@ fn run_statpopgen(args: StatPopGenArgs) -> anyhow::Result<()> {
// Run benchmark using the trait system
run_benchmark(benchmark, config)
}

fn run_fineweb(args: FinewebArgs) -> anyhow::Result<()> {
setup_logging_and_tracing(args.common.verbose, args.common.tracing)?;

let data_url = Url::from_directory_path("fineweb".to_data_path())
.map_err(|_| anyhow::anyhow!("bad data path"))?;

let benchmark = Fineweb::new(data_url);

let config = DriverConfig {
targets: args.targets,
iterations: args.common.iterations,
threads: args.common.threads,
display_format: args.common.display_format,
disable_datafusion_cache: args.common.disable_datafusion_cache,
delete_duckdb_database: args.common.delete_duckdb_database,
queries: args.common.queries,
exclude_queries: args.common.exclude_queries,
output_path: args.common.output_path,
emit_plan: args.common.emit_plan,
export_spans: args.common.export_spans,
show_metrics: args.common.show_metrics,
hide_progress_bar: args.common.hide_progress_bar,
track_memory: args.common.track_memory,
skip_generate: args.common.skip_generate,
explain: args.common.explain,
explain_analyze: args.common.explain_analyze,
};

run_benchmark(benchmark, config)
}
5 changes: 4 additions & 1 deletion bench-vortex/src/datasets/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ pub async fn register_vortex_files(
dataset: &BenchmarkDataset,
) -> Result<()> {
match dataset {
BenchmarkDataset::TpcH { .. } | BenchmarkDataset::TpcDS { .. } => {
BenchmarkDataset::TpcH { .. }
| BenchmarkDataset::TpcDS { .. }
| BenchmarkDataset::Fineweb => {
info!(
"Registering table from {}, with glob {:?}",
&file_url,
Expand Down Expand Up @@ -156,6 +158,7 @@ pub async fn register_vortex_compact_files(
}
BenchmarkDataset::PublicBi { .. } => todo!(),
BenchmarkDataset::StatPopGen { .. } => todo!(),
BenchmarkDataset::Fineweb => todo!(),
}

Ok(())
Expand Down
10 changes: 9 additions & 1 deletion bench-vortex/src/datasets/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use url::Url;
use vortex::ArrayRef;

use crate::clickbench::Flavor;
use crate::{Format, clickbench, statpopgen};
use crate::{Format, clickbench, fineweb, statpopgen};

pub mod data_downloads;
pub mod file;
Expand All @@ -38,6 +38,8 @@ pub enum BenchmarkDataset {
PublicBi { name: String },
#[serde(rename = "statpopgen")]
StatPopGen { n_rows: u64 },
#[serde(rename = "fineweb")]
Fineweb,
}

impl BenchmarkDataset {
Expand All @@ -48,6 +50,7 @@ impl BenchmarkDataset {
BenchmarkDataset::ClickBench { .. } => "clickbench",
BenchmarkDataset::PublicBi { .. } => "public-bi",
BenchmarkDataset::StatPopGen { .. } => "statpopgen",
BenchmarkDataset::Fineweb => "fineweb",
}
}
}
Expand All @@ -63,6 +66,7 @@ impl Display for BenchmarkDataset {
},
BenchmarkDataset::PublicBi { name } => write!(f, "public-bi({name})"),
BenchmarkDataset::StatPopGen { n_rows } => write!(f, "statpopgen(n_rows={n_rows})"),
BenchmarkDataset::Fineweb => write!(f, "fineweb"),
}
}
}
Expand Down Expand Up @@ -102,6 +106,7 @@ impl BenchmarkDataset {
],
BenchmarkDataset::ClickBench { .. } | BenchmarkDataset::PublicBi { .. } => todo!(),
BenchmarkDataset::StatPopGen { .. } => &["statpopgen"],
BenchmarkDataset::Fineweb => &["fineweb"],
}
}

Expand Down Expand Up @@ -153,6 +158,9 @@ impl BenchmarkDataset {
(BenchmarkDataset::StatPopGen { .. }, format) => {
anyhow::bail!("StatPopGen in {format} unsupported in DataFusion")
}
(BenchmarkDataset::Fineweb, format) => {
fineweb::register_table(session, base_url, format).await?
}
}

Ok(())
Expand Down
14 changes: 12 additions & 2 deletions bench-vortex/src/engines/ddb/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::path::PathBuf;
use std::time::{Duration, Instant};

use anyhow::Result;
use log::trace;
use log::{info, trace};
use url::Url;
use vortex::error::VortexExpect;
use vortex_duckdb::duckdb::{Config, Connection, Database};
Expand Down Expand Up @@ -51,6 +51,7 @@ impl DuckDBCtx {
BenchmarkDataset::StatPopGen { n_rows } => {
format!("statpopgen/{n_rows}/{}", format.name()).to_data_path()
}
BenchmarkDataset::Fineweb => format!("fineweb/{}", format.name()).to_data_path(),
};
std::fs::create_dir_all(&dir)?;
let db_path = dir.join("duckdb.db");
Expand Down Expand Up @@ -163,7 +164,7 @@ impl DuckDBCtx {

// Generate and execute table registration commands
let commands = self.generate_table_commands(&effective_url, extension, dataset, object);
trace!("Executing table registration commands: {commands}");
info!("Executing table registration commands: {commands}");
self.execute_query(&commands)?;

Ok(())
Expand Down Expand Up @@ -199,7 +200,9 @@ impl DuckDBCtx {
) -> String {
// Base path contains trailing /.
let base_dir = base_url.as_str();
info!("base_dir1: {base_dir}");
let base_dir = base_dir.strip_prefix("file://").unwrap_or(base_dir);
info!("base_dir2: {base_dir}");
match dataset {
BenchmarkDataset::TpcH { .. } => {
let mut commands = String::new();
Expand Down Expand Up @@ -244,6 +247,13 @@ impl DuckDBCtx {
duckdb_object.to_str()
)
}
BenchmarkDataset::Fineweb => {
let path = format!("{base_dir}*.{extension}");
format!(
"CREATE {} IF NOT EXISTS fineweb AS SELECT * FROM read_{extension}('{path}');",
duckdb_object.to_str(),
)
}
}
}
}
Loading
Loading