Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: speed up random access by file name by changing how we index into vnodes #490

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
402 changes: 212 additions & 190 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "Oxen"
version = "0.24.3"
version = "0.25.0"
edition = "2021"
license-file = "LICENSE"
description = "Oxen is a fast, unstructured data version control, to help version large machine learning datasets written in Rust."
Expand Down
29 changes: 24 additions & 5 deletions benchmark/generate_image_repo_parallel.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,39 @@
import os
import numpy as np
from PIL import Image
from PIL import Image, ImageDraw, ImageFont
import argparse
from tqdm import tqdm
import pandas as pd
from multiprocessing import Pool, cpu_count

def generate_single_image(args):
i, num_dirs, images_dir, image_size, output_dir = args
i, num_dirs, images_dir, image_size, output_dir, show_index = args
subdir = os.path.join(images_dir, f"split_{i % num_dirs}")
noise = np.random.randint(0, 256, (image_size[0], image_size[1], 3), dtype=np.uint8)
img = Image.fromarray(noise)

# Add index text if requested
if show_index:
# Make the image a black background
img = Image.new("RGB", image_size, (0, 0, 0))
draw = ImageDraw.Draw(img)
# Use a simple font and size that scales with image
font_size = min(image_size) // 10
try:
font = ImageFont.truetype("arial.ttf", font_size)
except:
# Fallback to default font if arial not available
font = ImageFont.load_default()

text = str(i)
# Draw text in top-left corner with white color
draw.text((10, 10), text, fill=(255, 255, 255), font=font)

path = os.path.join(subdir, f"noise_image_{i}.png")
img.save(path)
return os.path.relpath(path, output_dir)

def generate_noise_images(num_images, output_dir, num_dirs, image_size):
def generate_noise_images(num_images, output_dir, num_dirs, image_size, show_index):
print(f"Generating {num_images} images with {num_dirs} directories in {output_dir}")
os.makedirs(output_dir, exist_ok=True)
images_dir = os.path.join(output_dir, "images")
Expand All @@ -25,7 +43,7 @@ def generate_noise_images(num_images, output_dir, num_dirs, image_size):
os.makedirs(os.path.join(images_dir, f"split_{i}"), exist_ok=True)

# Prepare arguments for parallel processing
args_list = [(i, num_dirs, images_dir, image_size, output_dir) for i in range(num_images)]
args_list = [(i, num_dirs, images_dir, image_size, output_dir, show_index) for i in range(num_images)]

# Use multiprocessing to generate images in parallel
with Pool(processes=cpu_count()) as pool:
Expand All @@ -41,11 +59,12 @@ def generate_noise_images(num_images, output_dir, num_dirs, image_size):
parser.add_argument("--num_dirs", type=int, default=1000)
parser.add_argument("--output_dir", type=str, default="noise_images")
parser.add_argument("--image_size", type=int, nargs=2, default=(128, 128))
parser.add_argument("--show_index", action="store_true", help="Show image index on generated images")
# TODO: Add random sample % as a parameter and use that instead of mod
args = parser.parse_args()


image_paths = generate_noise_images(args.num_images, args.output_dir, args.num_dirs, args.image_size)
image_paths = generate_noise_images(args.num_images, args.output_dir, args.num_dirs, args.image_size, args.show_index)
print("Image generation complete!")

# create random labels for each image of cat or dog
Expand Down
2 changes: 1 addition & 1 deletion src/cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "oxen-cli"
version = "0.24.3"
version = "0.25.0"
edition = "2021"

[dependencies]
Expand Down
16 changes: 13 additions & 3 deletions src/cli/src/cmd/delete_remote.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ impl RunCmd for DeleteRemoteCmd {
.help("The host you want to create the remote repository on. For example: 'hub.oxen.ai'")
.action(clap::ArgAction::Set),
)
.arg(
Arg::new("scheme")
.long("scheme")
.help("The scheme for the url of the remote repository. For example: 'https' or 'http'")
.action(clap::ArgAction::Set),
)
.arg(
Arg::new("yes")
.long("yes")
Expand All @@ -55,10 +61,14 @@ impl RunCmd for DeleteRemoteCmd {
.get_one::<String>("host")
.map(String::from)
.unwrap_or(DEFAULT_HOST.to_string());
// Default scheme
let scheme = args
.get_one::<String>("scheme")
.map(String::from)
.unwrap_or("https".to_string());

let Some(remote_repo) =
api::client::repositories::get_by_name_and_host(namespace_name, host).await?
else {
let url = format!("{}://{host}/{namespace_name}", scheme);
let Some(remote_repo) = api::client::repositories::get_by_url(&url).await? else {
return Err(OxenError::basic_str(format!(
"Remote repository not found: {namespace_name}"
)));
Expand Down
33 changes: 28 additions & 5 deletions src/cli/src/cmd/node.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use async_trait::async_trait;
use clap::{Arg, Command};
use liboxen::core::v0_19_0::index::CommitMerkleTree;
use liboxen::core::v_latest::index::CommitMerkleTree;
use liboxen::error::OxenError;
use liboxen::model::{LocalRepository, MerkleHash};
use liboxen::repositories;

use std::str::FromStr;

Expand All @@ -20,7 +21,6 @@ impl RunCmd for NodeCmd {
// Setups the CLI args for the command
Command::new(NAME)
.about("Inspect an oxen merkle tree node")
.arg(Arg::new("node").required(true).action(clap::ArgAction::Set))
// add --verbose flag
.arg(
Arg::new("verbose")
Expand All @@ -29,13 +29,36 @@ impl RunCmd for NodeCmd {
.help("Verbose output")
.action(clap::ArgAction::SetTrue),
)
// add --node flag
.arg(
Arg::new("node")
.long("node")
.short('n')
.help("Node hash to inspect"),
)
// add --file flag
.arg(
Arg::new("file")
.long("file")
.short('f')
.help("File path to inspect"),
)
}

async fn run(&self, args: &clap::ArgMatches) -> Result<(), OxenError> {
// Parse Args
let node_hash = args.get_one::<String>("node").expect("Must supply node");

// Find the repository
let repository = LocalRepository::from_current_dir()?;

// if the --file flag is set, we need to get the node for the file
if let Some(file) = args.get_one::<String>("file") {
let commit = repositories::commits::head_commit(&repository)?;
let node = repositories::entries::get_file(&repository, &commit, file)?;
println!("{:?}", node);
return Ok(());
}

// otherwise, get the node based on the node hash
let node_hash = args.get_one::<String>("node").expect("Must supply node");
let node_hash = MerkleHash::from_str(node_hash)?;
let node = CommitMerkleTree::read_node(&repository, &node_hash, false)?;

Expand Down
19 changes: 10 additions & 9 deletions src/cli/src/cmd/tree.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use async_trait::async_trait;
use clap::{Arg, Command};
use liboxen::core::v0_19_0::index::CommitMerkleTree;
use liboxen::core::v_latest::index::CommitMerkleTree;
use liboxen::error::OxenError;
use liboxen::model::{Commit, LocalRepository, MerkleHash};
use liboxen::repositories;
Expand Down Expand Up @@ -112,27 +112,28 @@ impl TreeCmd {
depth: i32,
) -> Result<(), OxenError> {
let load_start = Instant::now(); // Start timing
let tree = match (repo.subtree_paths(), repo.depth()) {
match (repo.subtree_paths(), repo.depth()) {
(Some(subtrees), Some(depth)) => {
println!("Working with subtrees: {:?}", subtrees);
println!("Depth: {}", depth);
println!("Loading first tree...");
CommitMerkleTree::from_path_depth(repo, commit, subtrees.first().unwrap(), depth)?
repositories::tree::print_tree_depth_subtree(
repo,
commit,
depth,
subtrees.first().unwrap(),
)?;
}
(_, _) => {
if let Some(path) = path {
CommitMerkleTree::from_path(repo, commit, path, true)?
repositories::tree::print_tree_path(repo, commit, path)?;
} else {
CommitMerkleTree::from_commit(repo, commit)?
repositories::tree::print_tree_depth(repo, commit, depth)?;
}
}
};
let load_duration = load_start.elapsed(); // Calculate duration
let print_start = Instant::now(); // Start timing
tree.print_depth(depth);
let print_duration = print_start.elapsed(); // Calculate duration
println!("Time to load tree: {:?}", load_duration);
println!("Time to print tree: {:?}", print_duration);
Ok(())
}
}
2 changes: 1 addition & 1 deletion src/lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "liboxen"
version = "0.24.3"
version = "0.25.0"
edition = "2021"
license-file = "LICENSE"
description = "Oxen is a fast, unstructured data version control, to help version datasets, written in Rust."
Expand Down
Loading
Loading