Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ Tobias is your friendly PostgreSQL DBA who is obsessed with optimizing your data

Named after a nerdy but friendly DBA who is obsessed with query performance, Tobias helps you find the optimal `work_mem` setting for your PostgreSQL queries. It runs your queries with various memory settings to determine the minimum `work_mem` needed to keep your queries entirely in memory without creating temporary files.

## Prerequisites

You'll need the Hugging Face CLI. On macOS, you can install it via Homebrew:

```shell
$ brew install huggingface-cli
```

## Installation

```shell
Expand Down
4 changes: 4 additions & 0 deletions lib/tobias.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
end
end

# See: https://github.com/ged/ruby-pg/issues/538#issuecomment-1591629049
ENV["PGGSSENCMODE"] = "disable"

require "bundler/setup"
Bundler.require(:default)

Expand All @@ -21,6 +24,7 @@
require "enumerable-stats"
require "benchmark"
require "parquet"
require "parallel"
require "tty-markdown"
require "tty-table"

Expand Down
2 changes: 1 addition & 1 deletion lib/tobias/cli.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def profile(script)
raise "Script not found at: #{script}"
end

container = Container.new(code)
container = Container.new(code, database)
results = {}

parsed = TTY::Markdown.parse(<<~MARKDOWN)
Expand Down
65 changes: 31 additions & 34 deletions lib/tobias/container.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

module Tobias
class Container
def initialize(code)
def initialize(code, database)
@code = code
@database = database
@queries = Concurrent::Hash.new
@sql = Concurrent::Hash.new
@options = Concurrent::Hash.new
Expand All @@ -15,54 +16,46 @@ def initialize(code)
eval(code, binding, __FILE__, __LINE__)
end

def run_setup(context)
run_action(@setup, context)
end
module DefaultHelpers
def db
@database
end

def run_parallel(list = Etc.nprocessors.times, &block)
db.disconnect

def run_load_data(context)
Etc.nprocessors.times do
fork do
context.disconnect
run_action(@load_data, context)
Parallel.each(list, in_processes: Etc.nprocessors) do |item|
instance_exec(item, &block)
end
end
end

context.disconnect
Process.waitall
def run_setup
@database.run("CREATE EXTENSION IF NOT EXISTS pg_stat_statements")
run_action(@setup)
end

def run_query(query, context)
sql = if query.is_a?(String)
query
else
run_action(query, context).sql
end
def run_query(query)
@database.run(run_action(query).sql)

Copilot AI Aug 18, 2025

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The method calls run_action(query).sql, which assumes the result responds to .sql. That holds when the query block returns a Sequel dataset, but it fails for any other return type, such as a plain String. Consider adding error handling or type checking.

Suggested change
@database.run(run_action(query).sql)
result = run_action(query)
sql = if result.respond_to?(:sql)
result.sql
elsif result.is_a?(String)
result
else
raise TypeError, "Query block must return a Sequel dataset or a String, got #{result.class}"
end
@database.run(sql)

Copilot uses AI. Check for mistakes.
end

context.run(sql)
def run_teardown
run_action(@teardown)
end

def run_teardown(context)
run_action(@teardown, context)
def options
Struct.new(*@options.keys).new(*@options.values)
end

def run_action(action, context)
options = Struct.new(*@options.keys).new(*@options.values)
def run_action(action)
helpers = @helpers

context.class_eval do
class_eval do
include DefaultHelpers
include helpers

def options=(new_options)
@options = new_options
end

def options
@options
end
end

context.options = options
context.instance_eval(&action)
instance_eval(&action)
end

def queries
Expand Down Expand Up @@ -90,7 +83,11 @@ def load_data(&block)
end

def query(name, sql = nil, &block)
@queries[name] = sql || block
if sql.is_a?(String)
@queries[name] = Proc.new { sql }
else
@queries[name] = block || Proc.new { raise "No SQL provided for query '#{name}'" }
end
end
end
end
5 changes: 2 additions & 3 deletions lib/tobias/evaluations/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,15 @@ def initialize(database, container, options)
def run(&block)
results = Concurrent::Array.new

container.run_setup(database)
container.run_load_data(database)
container.run_setup
container.queries.each do |name, query|
result = run_each(name, query)
results << result if result
end

to_markdown(results)
ensure
container.run_teardown(database)
container.run_teardown
end

def run_each(query)
Expand Down
4 changes: 2 additions & 2 deletions lib/tobias/evaluations/work_mem.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def current_work_mem
end

def description
"Optional work_mem settings"
"Optimal work_mem settings"
Comment thread
binarycleric marked this conversation as resolved.
end

def run_each(name, query)
Expand All @@ -22,7 +22,7 @@ def run_each(name, query)
database.transaction do
database.run("SET LOCAL work_mem = '#{value.to_sql}'")
database.select(Sequel.function(:pg_stat_reset)).first
container.run_query(query, database)
container.run_query(query)

stats = database[:pg_stat_database].
where(datname: Sequel.function(:current_database)).
Expand Down
52 changes: 30 additions & 22 deletions scripts/stress.rb
Original file line number Diff line number Diff line change
@@ -1,25 +1,22 @@
# frozen_string_literal: true

option(:total_rows, 10_000_000)
option(:total_rows, 1_000_000)

setup do
create_table? :workmem_stress do
db.create_table? :workmem_stress do
primary_key :id
column :name, String
column :value, Integer
column :payload, String
column :created_at, :timestamp, default: Sequel::CURRENT_TIMESTAMP
end
end

load_data do
loop do
break if from(:workmem_stress).count >= options.total_rows

100.times do
from(:workmem_stress).multi_insert(1000.times.map do
run_parallel do
(options.total_rows / Etc.nprocessors / 1000).times do
db.from(:workmem_stress).multi_insert(1000.times.map do
{
name: "name_#{Random.rand(1..1000)}",
value: Random.rand(1..10_000),
name: "name_#{Random.rand(1..1_000_000)}",
value: Random.rand(1..1_000_000),
payload: SecureRandom.hex(128)
}
end)
Expand All @@ -28,26 +25,37 @@
end

teardown do
drop_table(:workmem_stress)
db.drop_table(:workmem_stress)
end

query(:large_sort) do
from(:workmem_stress)
.select(:id, :name, :value, :payload)
.order(Sequel.desc(:payload))
.limit(10_000)
db.
from(:workmem_stress).
select(:id, :name, :value, :payload).
order(Sequel.desc(:payload)).
limit(10_000)
end

query(:large_sort_created_at) do
db.
from(:workmem_stress).
select(:id, :name, :value, :payload).
order(Sequel.desc(:created_at)).
limit(10_000)
end

query(:hash_aggregation) do
from(:workmem_stress)
.select(:name, Sequel.function(:avg, :value))
.select { count("*") }
.group(:name)
.limit(10_000)
db.
from(:workmem_stress).
select(:name, Sequel.function(:avg, :value)).
select { count("*") }.
group(:name).
limit(10_000)
end

query(:self_join) do
from(Sequel.as(:workmem_stress, :a)).
db.
from(Sequel.as(:workmem_stress, :a)).
join(Sequel.as(:workmem_stress, :b), id: :id).
where { Sequel[:a][:id] < 1000 }.
where { Sequel[:b][:id] < 1000 }.
Expand Down
7 changes: 4 additions & 3 deletions scripts/tpcc.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# frozen_string_literal: true

query(:stock_by_warehouse_and_district) do
warehouse_id = from(:warehouse).
warehouse_id = db.from(:warehouse).
order(Sequel.lit("RANDOM()")).
limit(1).
first[:w_id]
Expand All @@ -12,7 +12,7 @@
limit(1).
first[:d_id]

from(:stock).
db.from(:stock).
join(:order_line, ol_w_id: :s_w_id, ol_i_id: :s_i_id).
where(ol_w_id: warehouse_id, ol_d_id: district_id).
where(Sequel.lit("s_quantity < ?", rand(100..500))).
Expand All @@ -23,7 +23,8 @@
end

query(:most_active_districts) do
from(:district).
db.
from(:district).
join(:order_line, [[:ol_d_id, :d_id], [:ol_w_id, :d_w_id]]).
group(:d_w_id, :d_id, :d_name).
select(
Expand Down
54 changes: 31 additions & 23 deletions scripts/vector.rb
Original file line number Diff line number Diff line change
@@ -1,43 +1,49 @@
# frozen_string_literal: true

option(:total_vectors, 5_000_000)
option(:vector_dimension, 1_536)

helpers do
def random_vector(size: options.vector_dimension)
def random_vector(size: 1_536)
Array.new(size) { rand(-1.0..1.0) }
end

def download_from_hugging_face(repo, local_dir="/tmp/#{repo}")
`hf download #{repo} --repo-type=dataset --local-dir #{local_dir}`
Comment thread
binarycleric marked this conversation as resolved.
Outdated

Copilot AI Aug 18, 2025

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interpolating values directly into a shell command is vulnerable to command injection. Either shell-escape each value, or use a library such as Open3 and pass every argument separately so that no shell parses the command line.

Suggested change
`hf download #{repo} --repo-type=dataset --local-dir #{local_dir}`
stdout, status = Open3.capture2("hf", "download", repo, "--repo-type=dataset", "--local-dir", local_dir)
stdout

Copilot uses AI. Check for mistakes.
end
end

setup do
run("CREATE EXTENSION IF NOT EXISTS vector")
db.run("CREATE EXTENSION IF NOT EXISTS vector")

dimensions = options.vector_dimension
create_table? :items do
db.create_table? :items do
primary_key :id
column :embedding, "vector(#{dimensions})"
column :title, :text
column :text, :text
column :embedding, "vector(#{1_536})"
column :created_at, :timestamp, default: Sequel::CURRENT_TIMESTAMP
end
end

load_data do
loop do
break if from(:items).count >= options.total_vectors

from(:items).multi_insert(1_000.times.map do
{
embedding: ::Pgvector.encode(random_vector)
}
end)
download_from_hugging_face("KShivendu/dbpedia-entities-openai-1M", "/tmp/dbpedia-entities-openai-1M")
run_parallel(Dir.glob("/tmp/dbpedia-entities-openai-1M/data/*.parquet")) do |file|
Parquet.each_row(file, columns: ["title", "text", "openai"]) do |row|
db.from(:items).insert(
title: row["title"],
text: row["text"],
embedding: "[#{row["openai"].join(",")}]"
Comment thread
binarycleric marked this conversation as resolved.
Outdated
)
end
end

db.run("SET maintenance_work_mem = '128MB';")
db.run("CREATE INDEX IF NOT EXISTS items_embedding_idx ON items USING ivfflat (embedding) WITH (lists = 100)")
end

teardown do
drop_table(:items)
run("DROP EXTENSION IF EXISTS vector")
db.drop_table(:items)
db.run("DROP EXTENSION IF EXISTS vector")
end

query(:euclidean_nearest_neighbors) do
from(:items).
db.
from(:items).
nearest_neighbors(
:embedding,
random_vector,
Expand All @@ -47,7 +53,8 @@ def random_vector(size: options.vector_dimension)
end

query(:cosine_nearest_neighbors) do
from(:items).
db.
from(:items).
nearest_neighbors(
:embedding,
random_vector,
Expand All @@ -57,7 +64,8 @@ def random_vector(size: options.vector_dimension)
end

query(:inner_product_nearest_neighbors) do
from(:items).
db.
from(:items).
nearest_neighbors(
:embedding,
random_vector,
Expand Down
1 change: 1 addition & 0 deletions tobias.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Gem::Specification.new do |s|
s.add_dependency "sequel", "~> 5.76", ">= 5.76.0"
s.add_dependency "parquet", "~> 0.7", ">= 0.7.3"
s.add_dependency "pgvector", "~> 0.3", ">= 0.3.0"
s.add_dependency "parallel", "~> 1.20", ">= 1.20.0"
s.add_dependency "thor", "~> 1.3", ">= 1.3.0"
s.add_dependency "tty-markdown", "~> 0.7", ">= 0.7.0"
s.add_dependency "tty-table", "~> 0.12", ">= 0.12.0"
Expand Down
Loading