Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up random access by file name by changing how we index into vnodes #490

Merged
merged 25 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
8b40663
speed up random access by file name by changing how we index into vno…
gschoeni Dec 28, 2024
c43b815
WIP: Changing to v_latest
gschoeni Dec 31, 2024
696939a
Merge branch 'main' into feat/speedup-random-access-by-path
gschoeni Jan 7, 2025
b16b116
remove redis dependency
gschoeni Jan 7, 2025
7ebf8a3
have different impl's for VNode
gschoeni Jan 8, 2025
183d515
make some older views load with v19
gschoeni Jan 8, 2025
6c729d3
unit tests passing for v0.25.0
gschoeni Jan 8, 2025
7461c1c
unit tests passing after refactoring CommitMerkleTree out
gschoeni Jan 9, 2025
af84b79
fix clippy errors
gschoeni Jan 9, 2025
360c222
fall back to old decoding method if VNode cannot deserialize
gschoeni Jan 9, 2025
0115848
bump Dockerfile builder to rust:1.83.0
gschoeni Jan 9, 2025
aa494b1
refactor out stats into its own module
gschoeni Jan 9, 2025
c363719
wrap CommitNode in an enum and get rid of old v0.10.0 commit logic
gschoeni Jan 9, 2025
be3ee85
wrap DirNode in and enum for forward and backwards compatibility
gschoeni Jan 9, 2025
dde0d27
wrap FileNode in enum
gschoeni Jan 10, 2025
5a6402e
Merge branch 'bug/oxen-add' into feat/speedup-random-access-by-path
gschoeni Jan 11, 2025
8b8bc00
remove v0.10.0 logic
gschoeni Jan 11, 2025
c75c74a
adding traits for the node types in the merkle tree to implement with…
gschoeni Jan 12, 2025
4d93268
adding a field to dir node
gschoeni Jan 13, 2025
7b94160
make sure num_entries gets populated properly on DirNode
gschoeni Jan 14, 2025
f5eb9e8
Merge pull request #504 from Oxen-AI/feat/add-num-children-to-dir-node
gschoeni Jan 14, 2025
cc7da9d
implement migration to v0.25.0 with ability to write tree to disk
gschoeni Jan 16, 2025
245e1ce
better error messaging on migrations, and make sure to bump repo vers…
gschoeni Jan 16, 2025
e866bb3
fix check for migration to not load a commit
gschoeni Jan 16, 2025
b489d18
Merge branch 'main' into feat/speedup-random-access-by-path
gschoeni Jan 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions src/cli/src/cmd/delete_remote.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ impl RunCmd for DeleteRemoteCmd {
.help("The host you want to create the remote repository on. For example: 'hub.oxen.ai'")
.action(clap::ArgAction::Set),
)
.arg(
Arg::new("scheme")
.long("scheme")
.help("The scheme for the url of the remote repository. For example: 'https' or 'http'")
.action(clap::ArgAction::Set),
)
.arg(
Arg::new("yes")
.long("yes")
Expand All @@ -55,10 +61,14 @@ impl RunCmd for DeleteRemoteCmd {
.get_one::<String>("host")
.map(String::from)
.unwrap_or(DEFAULT_HOST.to_string());
// Default scheme
let scheme = args
.get_one::<String>("scheme")
.map(String::from)
.unwrap_or("https".to_string());

let Some(remote_repo) =
api::client::repositories::get_by_name_and_host(namespace_name, host).await?
else {
let url = format!("{}://{host}/{namespace_name}", scheme);
let Some(remote_repo) = api::client::repositories::get_by_url(&url).await? else {
return Err(OxenError::basic_str(format!(
"Remote repository not found: {namespace_name}"
)));
Expand Down
31 changes: 27 additions & 4 deletions src/cli/src/cmd/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use clap::{Arg, Command};
use liboxen::core::v0_19_0::index::CommitMerkleTree;
use liboxen::error::OxenError;
use liboxen::model::{LocalRepository, MerkleHash};
use liboxen::repositories;

use std::str::FromStr;

Expand All @@ -20,7 +21,6 @@ impl RunCmd for NodeCmd {
// Setups the CLI args for the command
Command::new(NAME)
.about("Inspect an oxen merkle tree node")
.arg(Arg::new("node").required(true).action(clap::ArgAction::Set))
// add --verbose flag
.arg(
Arg::new("verbose")
Expand All @@ -29,13 +29,36 @@ impl RunCmd for NodeCmd {
.help("Verbose output")
.action(clap::ArgAction::SetTrue),
)
// add --node flag
.arg(
Arg::new("node")
.long("node")
.short('n')
.help("Node hash to inspect"),
)
// add --file flag
.arg(
Arg::new("file")
.long("file")
.short('f')
.help("File path to inspect"),
)
}

async fn run(&self, args: &clap::ArgMatches) -> Result<(), OxenError> {
// Parse Args
let node_hash = args.get_one::<String>("node").expect("Must supply node");

// Find the repository
let repository = LocalRepository::from_current_dir()?;

// if the --file flag is set, we need to get the node for the file
if let Some(file) = args.get_one::<String>("file") {
let commit = repositories::commits::head_commit(&repository)?;
let node = repositories::entries::get_file(&repository, &commit, file)?;
println!("{:?}", node);
return Ok(());
}

// otherwise, get the node based on the node hash
let node_hash = args.get_one::<String>("node").expect("Must supply node");
let node_hash = MerkleHash::from_str(node_hash)?;
let node = CommitMerkleTree::read_node(&repository, &node_hash, false)?;

Expand Down
14 changes: 13 additions & 1 deletion src/lib/src/api/client/tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ use flate2::Compression;
use futures_util::TryStreamExt;
use std::collections::HashSet;
use std::path::PathBuf;
use std::sync::Arc;
use std::time;

use crate::api::client;
use crate::constants::{NODES_DIR, OXEN_HIDDEN_DIR, TREE_DIR};
use crate::core::v0_19_0::index::merkle_node_db::node_db_path;
use crate::core::v0_19_0::index::CommitMerkleTree;
use crate::core::v0_19_0::structs::PushProgress;
use crate::error::OxenError;
use crate::model::merkle_tree::node::MerkleTreeNode;
use crate::model::{Commit, LocalRepository, MerkleHash, RemoteRepository};
Expand Down Expand Up @@ -52,11 +54,13 @@ pub async fn create_nodes(
local_repo: &LocalRepository,
remote_repo: &RemoteRepository,
nodes: HashSet<MerkleTreeNode>,
progress: &Arc<PushProgress>,
) -> Result<(), OxenError> {
// Compress the node
let enc = GzEncoder::new(Vec::new(), Compression::default());
let mut tar = tar::Builder::new(enc);
for node in nodes {
let mut children_count = 0;
for (i, node) in nodes.iter().enumerate() {
let node_dir = node_db_path(local_repo, &node.hash);
let tree_dir = local_repo
.path
Expand All @@ -69,7 +73,15 @@ pub async fn create_nodes(
sub_dir,
node_dir
);
progress.set_message(format!(
"Packing {}/{} nodes with {} children",
i + 1,
nodes.len(),
children_count
));

tar.append_dir_all(sub_dir, node_dir)?;
children_count += node.children.len();
}

tar.finish()?;
Expand Down
2 changes: 1 addition & 1 deletion src/lib/src/core/refs/ref_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ impl RefReader {
}

pub fn get_branch_by_name(&self, name: &str) -> Result<Option<Branch>, OxenError> {
log::debug!("get_branch_by_name {name}");
// log::debug!("get_branch_by_name {name}");
match self.get_commit_id_for_branch(name) {
Ok(Some(commit_id)) => Ok(Some(Branch {
name: name.to_string(),
Expand Down
6 changes: 4 additions & 2 deletions src/lib/src/core/v0_19_0/branches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -421,8 +421,10 @@ pub fn restore_file(
let version_path = util::fs::version_path_from_hash(repo, file_node.hash.to_string());
if !version_path.exists() {
return Err(OxenError::basic_str(format!(
"Source file not found in versions directory: {:?}",
version_path
"Restoring file, source file {} not found in versions directory: {:?} when restoring to {:?}",
file_node.name,
version_path,
dst_path
)));
}

Expand Down
8 changes: 4 additions & 4 deletions src/lib/src/core/v0_19_0/commits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,10 @@ pub fn get_by_id(
) -> Result<Option<Commit>, OxenError> {
let commit_id_str = commit_id_str.as_ref();
let Ok(commit_id) = MerkleHash::from_str(commit_id_str) else {
log::debug!(
"get_by_id could not create commit_id from [{}]",
commit_id_str
);
// log::debug!(
// "get_by_id could not create commit_id from [{}]",
// commit_id_str
// );
return Ok(None);
};
get_by_hash(repo, &commit_id)
Expand Down
96 changes: 46 additions & 50 deletions src/lib/src/core/v0_19_0/entries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,15 @@ pub fn get_file_merkle_tree_node(
commit: &Commit,
path: impl AsRef<Path>,
) -> Result<Option<MerkleTreeNode>, OxenError> {
let parent = path.as_ref().parent().unwrap_or(Path::new(""));
let parent_node = CommitMerkleTree::dir_with_children(repo, commit, parent)?;
let Some(parent_node) = parent_node else {
log::debug!("path has no parent: {:?}", path.as_ref());
let path = path.as_ref();
let dir_hashes = CommitMerkleTree::dir_hashes(repo, commit)?;
if dir_hashes.contains_key(path) {
// This is a directory, not a file
log::debug!("get_file_merkle_tree_node path is a directory: {:?}", path);
return Ok(None);
};

let Some(file_name) = path.as_ref().file_name() else {
log::debug!("path has no file name: {:?}", path.as_ref());
return Ok(None);
};
}

let file_node = parent_node.get_by_path(file_name)?;
let file_node = CommitMerkleTree::read_file(repo, &dir_hashes, path)?;
Ok(file_node)
}

Expand Down Expand Up @@ -325,20 +321,20 @@ fn p_dir_entries(
) -> Result<(), OxenError> {
let search_directory = search_directory.as_ref();
let current_directory = current_directory.as_ref();
log::debug!(
"p_dir_entries current_directory {:?} search_directory {:?} node {}",
current_directory,
search_directory,
node
);
// log::debug!(
// "p_dir_entries current_directory {:?} search_directory {:?} node {}",
// current_directory,
// search_directory,
// node
// );
for child in &node.children {
match &child.node {
EMerkleTreeNode::VNode(_) => {
log::debug!(
"p_dir_entries got vnode {:?} search_directory {:?}",
current_directory,
search_directory
);
// log::debug!(
// "p_dir_entries got vnode {:?} search_directory {:?}",
// current_directory,
// search_directory
// );
p_dir_entries(
repo,
child,
Expand All @@ -350,25 +346,25 @@ fn p_dir_entries(
)?;
}
EMerkleTreeNode::Directory(child_dir) => {
log::debug!(
"p_dir_entries current_directory {:?} search_directory {:?} child_dir {:?}",
current_directory,
search_directory,
child_dir.name
);
// log::debug!(
// "p_dir_entries current_directory {:?} search_directory {:?} child_dir {:?}",
// current_directory,
// search_directory,
// child_dir.name
// );
if current_directory == search_directory && !child_dir.name.is_empty() {
log::debug!(
"p_dir_entries adding dir entry current_directory {:?}",
current_directory
);
// log::debug!(
// "p_dir_entries adding dir entry current_directory {:?}",
// current_directory
// );
let metadata = dir_node_to_metadata_entry(
repo,
child,
parsed_resource,
found_commits,
true,
)?;
log::debug!("p_dir_entries added dir entry {:?}", metadata);
// log::debug!("p_dir_entries added dir entry {:?}", metadata);
entries.push(metadata.unwrap());
}
let current_directory = current_directory.join(&child_dir.name);
Expand All @@ -382,27 +378,27 @@ fn p_dir_entries(
entries,
)?;
}
EMerkleTreeNode::File(child_file) => {
log::debug!(
"p_dir_entries current_directory {:?} search_directory {:?} child_file {:?}",
current_directory,
search_directory,
child_file.name
);
EMerkleTreeNode::File(_child_file) => {
// log::debug!(
// "p_dir_entries current_directory {:?} search_directory {:?} child_file {:?}",
// current_directory,
// search_directory,
// child_file.name
// );

if current_directory == search_directory {
log::debug!(
"p_dir_entries adding file entry current_directory {:?} file_name {:?}",
current_directory,
child_file.name
);
// log::debug!(
// "p_dir_entries adding file entry current_directory {:?} file_name {:?}",
// current_directory,
// child_file.name
// );
let metadata =
file_node_to_metadata_entry(repo, child, parsed_resource, found_commits)?;
log::debug!(
"p_dir_entries added file entry {:?} file_name {:?}",
metadata,
child_file.name
);
// log::debug!(
// "p_dir_entries added file entry {:?} file_name {:?}",
// metadata,
// child_file.name
// );
entries.push(metadata.unwrap());
}
}
Expand Down
7 changes: 7 additions & 0 deletions src/lib/src/core/v0_19_0/fetch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ pub async fn maybe_fetch_missing_entries(

// TODO: what should we print here? If there is nothing to pull, we
// shouldn't show the PullProgress
log::debug!("Fetching missing entries for commit {}", commit);

// Keep track of how many bytes we have downloaded
let pull_progress = Arc::new(PullProgress::new());
Expand All @@ -392,6 +393,12 @@ async fn r_download_entries(
directory: &Path,
pull_progress: &Arc<PullProgress>,
) -> Result<(), OxenError> {
log::debug!(
"fetch r_download_entries ({}) {:?} {:?}",
node.children.len(),
node.hash,
node.node
);
for child in &node.children {
let mut new_directory = directory.to_path_buf();
if let EMerkleTreeNode::Directory(dir_node) = &child.node {
Expand Down
Loading
Loading