Skip to content

Commit

Permalink
Merge branch 'main' into feat/import-huggingface
Browse files Browse the repository at this point in the history
  • Loading branch information
Lilianxr committed Jan 7, 2025
2 parents 00562d3 + 3433a80 commit e371042
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 58 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "Oxen"
version = "0.24.2"
version = "0.24.3"
edition = "2021"
license-file = "LICENSE"
description = "Oxen is a fast, unstructured data version control, to help version large machine learning datasets written in Rust."
Expand Down
2 changes: 1 addition & 1 deletion src/cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "oxen-cli"
version = "0.24.2"
version = "0.24.3"
edition = "2021"

[dependencies]
Expand Down
16 changes: 8 additions & 8 deletions src/cli/src/cmd/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,16 +234,16 @@ impl DiffCmd {
result: &mut DiffResult,
output: Option<PathBuf>,
) -> Result<(), OxenError> {
match result {
DiffResult::Tabular(result) => {
let mut df = result.contents.clone();
// Save to disk if we have an output
if let Some(file_path) = output {
// Save to disk if we have an output
if let Some(file_path) = output {
match result {
DiffResult::Tabular(result) => {
let mut df = result.contents.clone();
tabular::write_df(&mut df, file_path.clone())?;
}
}
DiffResult::Text(_) => {
println!("Saving to disk not supported for text output");
DiffResult::Text(_) => {
println!("Saving to disk not supported for text output");
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "liboxen"
version = "0.24.2"
version = "0.24.3"
edition = "2021"
license-file = "LICENSE"
description = "Oxen is a fast, unstructured data version control, to help version datasets, written in Rust."
Expand Down
7 changes: 6 additions & 1 deletion src/lib/src/core/df/sql.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::path::{Path, PathBuf};

use crate::core::df::tabular;
use crate::model::LocalRepository;
use crate::opts::DFOpts;
use crate::repositories;
Expand Down Expand Up @@ -30,7 +31,11 @@ pub fn query_df_from_repo(

let db_path = repositories::workspaces::data_frames::duckdb_path(&workspace, path);
let mut conn = df_db::get_connection(db_path)?;
query_df(&mut conn, sql, Some(opts))
let df = query_df(&mut conn, sql, Some(opts))?;

// If we are doing this from the CLI, we don't want to export the hidden Oxen columns
let df = tabular::strip_excluded_cols(df)?;
Ok(df)
}

pub fn query_df(
Expand Down
46 changes: 24 additions & 22 deletions src/lib/src/core/v0_19_0/download.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,11 @@ async fn r_download_entries(
) -> Result<(), OxenError> {
log::debug!("downloading entries for {:?}", directory);

let mut entries: Vec<Entry> = vec![];
if let EMerkleTreeNode::VNode(_) = &node.node {
let mut entries: Vec<Entry> = vec![];

for child in &node.children {
match &child.node {
EMerkleTreeNode::File(file_node) => {
for child in &node.children {
if let EMerkleTreeNode::File(file_node) = &child.node {
entries.push(Entry::CommitEntry(CommitEntry {
commit_id: file_node.last_commit_id.to_string(),
path: directory.join(&file_node.name),
Expand All @@ -75,26 +75,8 @@ async fn r_download_entries(
last_modified_nanoseconds: file_node.last_modified_nanoseconds,
}));
}
EMerkleTreeNode::Directory(dir_node) => {
let mut new_directory = directory.to_path_buf();
new_directory.push(&dir_node.name);

if child.has_children() {
Box::pin(r_download_entries(
remote_repo,
local_repo_path,
child,
&new_directory,
pull_progress,
))
.await?;
}
}
_ => {}
}
}

if !entries.is_empty() {
log::debug!("downloading {} entries to working dir", entries.len());
core::v0_10_0::index::puller::pull_entries_to_working_dir(
remote_repo,
Expand All @@ -105,5 +87,25 @@ async fn r_download_entries(
.await?;
}

for child in &node.children {
log::debug!("downloading entry {:?}", child.hash);

let mut new_directory = directory.to_path_buf();
if let EMerkleTreeNode::Directory(dir_node) = &child.node {
new_directory.push(&dir_node.name);
}

if child.has_children() {
Box::pin(r_download_entries(
remote_repo,
local_repo_path,
child,
&new_directory,
pull_progress,
))
.await?;
}
}

Ok(())
}
91 changes: 72 additions & 19 deletions src/lib/src/repositories/diffs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,18 +125,27 @@ pub fn diff(
(cpath_1, cpath_2)
} else {
// If no file2, compare with file1 at head.
let commit = Some(repositories::commits::head_commit(&repository)?);
let Some(commit) = repositories::commits::head_commit_maybe(&repository)? else {
let error = "Error: head commit not found".to_string();
return Err(OxenError::basic_str(error));
};

(
CommitPath {
commit,
path: path_1.as_ref().to_path_buf(),
},
CommitPath {
commit: None,
path: path_1.as_ref().to_path_buf(),
},
)
let (file_node,) = (
repositories::entries::get_file(&repository, &commit, &path_1)?.ok_or_else(|| {
OxenError::ResourceNotFound(
format!("Error: file {} not committed", path_1.as_ref().display()).into(),
)
})?,
);

let hash = file_node.hash.to_string();

let committed_file = util::fs::version_path_from_node(&repository, &hash, &path_1);
// log::debug!("committed file: {:?}", committed_file);
let result =
repositories::diffs::diff_files(path_1, committed_file, keys, targets, vec![])?;

return Ok(result);
};

let result = diff_commits(&repository, cpath_1, cpath_2, keys, targets, vec![])?;
Expand Down Expand Up @@ -174,6 +183,7 @@ pub fn diff_commits(

if let Some(mut commit_2) = cpath_2.commit {
// if there are merge conflicts, compare against the conflict commit instead

let merger = EntryMergeConflictReader::new(repo)?;

if merger.has_conflicts()? {
Expand Down Expand Up @@ -966,7 +976,6 @@ fn read_dupes(repo: &LocalRepository, compare_id: &str) -> Result<TabularDiffDup

Ok(dupes)
}

// Abbreviated version for when summary is known (e.g. computing top-level self node of an already-calculated dir diff)
pub fn get_dir_diff_entry_with_summary(
repo: &LocalRepository,
Expand All @@ -975,21 +984,65 @@ pub fn get_dir_diff_entry_with_summary(
head_commit: &Commit,
summary: GenericDiffSummary,
) -> Result<Option<DiffEntry>, OxenError> {
match repo.min_version() {
MinOxenVersion::V0_19_0 => core::v0_19_0::diff::get_dir_diff_entry_with_summary(
// Dir hashes db is cheaper to open than objects reader
let base_dir_hashes_db_path = ObjectDBReader::dir_hashes_db_dir(&repo.path, &base_commit.id);
let head_dir_hashes_db_path = ObjectDBReader::dir_hashes_db_dir(&repo.path, &head_commit.id);

let base_dir_hashes_db: DBWithThreadMode<MultiThreaded> = DBWithThreadMode::open_for_read_only(
&db::key_val::opts::default(),
dunce::simplified(&base_dir_hashes_db_path),
false,
)?;

let head_dir_hashes_db: DBWithThreadMode<MultiThreaded> = DBWithThreadMode::open_for_read_only(
&db::key_val::opts::default(),
dunce::simplified(&head_dir_hashes_db_path),
false,
)?;

let maybe_base_dir_hash: Option<String> = path_db::get_entry(&base_dir_hashes_db, &dir)?;
let maybe_head_dir_hash: Option<String> = path_db::get_entry(&head_dir_hashes_db, &dir)?;

match (maybe_base_dir_hash, maybe_head_dir_hash) {
(Some(base_dir_hash), Some(head_dir_hash)) => {
let base_dir_hash = base_dir_hash.to_string();
let head_dir_hash = head_dir_hash.to_string();

if base_dir_hash == head_dir_hash {
Ok(None)
} else {
Ok(Some(DiffEntry::from_dir_with_summary(
repo,
Some(&dir),
base_commit,
Some(&dir),
head_commit,
summary,
DiffEntryStatus::Modified,
)?))
}
}
(None, Some(_)) => Ok(Some(DiffEntry::from_dir_with_summary(
repo,
dir,
None,
base_commit,
Some(&dir),
head_commit,
summary,
),
MinOxenVersion::V0_10_0 => core::v0_10_0::diff::get_dir_diff_entry_with_summary(
DiffEntryStatus::Added,
)?)),
(Some(_), None) => Ok(Some(DiffEntry::from_dir_with_summary(
repo,
dir,
Some(&dir),
base_commit,
None,
head_commit,
summary,
),
DiffEntryStatus::Removed,
)?)),
(None, None) => Err(OxenError::basic_str(
"Could not calculate dir diff tree: dir does not exist in either commit.",
)),
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/server/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "oxen-server"
version = "0.24.2"
version = "0.24.3"
edition = "2021"

[dependencies]
Expand Down

0 comments on commit e371042

Please sign in to comment.