diff --git a/oxen/Cargo.lock b/oxen/Cargo.lock index e51864b..0ab5db4 100644 --- a/oxen/Cargo.lock +++ b/oxen/Cargo.lock @@ -3065,9 +3065,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "liboxen" -version = "0.24.1" +version = "0.24.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff3a79a9597b91659b8f54b40449abb45394d997bfaf9b8bb8c8966e81295ad8" +checksum = "6549e194ea7e6360b9264e47d1375491c54e090a9914d8b8a805990c52472537" dependencies = [ "actix-files", "actix-web", diff --git a/oxen/Cargo.toml b/oxen/Cargo.toml index 56bcfce..92579f6 100644 --- a/oxen/Cargo.toml +++ b/oxen/Cargo.toml @@ -28,7 +28,7 @@ polars = { version = "0.44.0", features = [ ] } serde_json = "1.0.106" uuid = { version = "1.4.1", features = ["serde", "v4"] } -liboxen = "0.24.1" +liboxen = "0.24.4" # liboxen = { path = "../../Oxen/src/lib" } [build-dependencies] diff --git a/oxen/pyproject.toml b/oxen/pyproject.toml index d788baa..e344ca0 100644 --- a/oxen/pyproject.toml +++ b/oxen/pyproject.toml @@ -12,16 +12,12 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "numpy", - "opencv-python-headless", "pandas", "polars", "pyarrow", "requests", "toml", - "tqdm", - "torch", - "tensorflow", + "tqdm" ] diff --git a/oxen/python/oxen/__init__.py b/oxen/python/oxen/__init__.py index 7903c7d..7b3c33c 100644 --- a/oxen/python/oxen/__init__.py +++ b/oxen/python/oxen/__init__.py @@ -17,11 +17,8 @@ from oxen.remote_repo import RemoteRepo from oxen.workspace import Workspace from oxen.data_frame import DataFrame -from oxen.dag import DAG -from oxen.op import Op from oxen import auth from oxen import datasets -from oxen import loaders from oxen.clone import clone from oxen.diff.diff import diff from oxen.init import init @@ -30,7 +27,6 @@ # Names of public modules we want to expose __all__ = [ "Dataset", - "DAG", "PyCommit", "PyDataset", "PyWorkspace", @@ -38,7 +34,6 @@ "PyRemoteRepo", "PyRepo", "PyStagedData", - "Op", "clone", "init", "is_configured", @@ -48,7 +43,6 @@ "Repo", "auth", "datasets", - "loaders", "util", "diff", ] diff --git a/oxen/python/oxen/dag.py b/oxen/python/oxen/dag.py deleted file mode 100644 index 54be36d..0000000 --- a/oxen/python/oxen/dag.py +++ /dev/null @@ -1,26 +0,0 @@ -class DAG: - def __init__(self, outputs=None): - self.outputs = outputs or [] - - def evaluate(self): - visited = set() - outputs = [] - - def _dfs(node): - if node not in visited: - visited.add(node) - for parent in node.parents: - _dfs(parent) - outputs.append(node) - - for output_node in self.outputs: - _dfs(output_node) - - desired_output_ids = [node.id for node in self.outputs] - results = [None] * len(self.outputs) - for node in outputs: - result = node.run() - if node.id in desired_output_ids: - results[desired_output_ids.index(node.id)] = result - - return results diff --git a/oxen/python/oxen/loaders/__init__.py b/oxen/python/oxen/loaders/__init__.py deleted file mode 100644 index 2641a7f..0000000 --- a/oxen/python/oxen/loaders/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Loaders -from .chat import ChatLoader -from .regression import RegressionLoader -from .image_classification import ImageClassificationLoader - -# Names of public modules we want to expose -__all__ = ["ChatLoader", "RegressionLoader", "ImageClassificationLoader"] diff --git a/oxen/python/oxen/loaders/chat.py b/oxen/python/oxen/loaders/chat.py deleted file mode 100644 index d2f0b11..0000000 --- a/oxen/python/oxen/loaders/chat.py +++ /dev/null @@ -1,58 +0,0 @@ -from oxen.dag import DAG - -from oxen.ops import ( - Identity, - ReadText, - ReadDF, - ExtractCol, - StrColTemplate, - ConcatSeries, -) - - -class ChatLoader: - """ - Formats / templatizes data from an Oxen repository for use in chatbot training. - """ - - def __init__(self, prompt_file, data_file): - """ - Creates a new ChatLoader. - - Parameters - ---------- - prompt_file : str - Path to a text file containing a prompt template for the chatbot. - data_file: str - Path to a tabular file containing the chatbot training data, - with "prompt" and "response" columns - """ - # Define input nodes - prompt_name = Identity(input="prompt") - column_name = Identity(input="response") - prompt = ReadText(input=prompt_file) - data_frame = ReadDF(input=data_file) - - # Define intermediate nodes - extract_prompt = ExtractCol(name="extract_prompt")(data_frame, prompt_name) - extract_response = ExtractCol(name="extract_response")(data_frame, column_name) - templatize = StrColTemplate(name="templatize")(prompt, extract_prompt) - output = ConcatSeries(name="concat_output")(templatize, extract_response) - - # Create and compile the graph - self.graph = DAG(outputs=[output]) - - def run(self): - """ - Returns - -------- - outputs[0] : pl.DataFrame - DataFrame with columns containing the templatized prompt ("prompt") - and response ("response") - """ - # Run the graph to get the outputs - result = self.graph.evaluate() - - print("\n\nResult:") - print(result) - return result diff --git a/oxen/python/oxen/loaders/image_classification.py b/oxen/python/oxen/loaders/image_classification.py deleted file mode 100644 index a84f139..0000000 --- a/oxen/python/oxen/loaders/image_classification.py +++ /dev/null @@ -1,98 +0,0 @@ -from oxen.dag import DAG - -from oxen.ops import ( - Identity, - ReadDF, - ExtractCol, - CreateLabelMap, - EncodeLabels, - ReadImageDir, - ReadText, - ResizeImages, -) - - -class ImageClassificationLoader: - """ - Prepares data from an Oxen repository for use - in supervised image classification tasks. - """ - - def __init__( - self, - imagery_root_dir, - label_file, - df_file, - path_name="path", - label_name="label", - resize_to=None, - resize_method="crop", - ): - """ - Creates a new ImageClassificationLoader. - - Parameters - ---------- - imagery_root_dir : str - Directory relative to which the image paths - in the DataFrame file are specified. - label_file: str - Path to a text file containing a line-separated - list of canonical labels for the dataset. - df_file : str - Path to a tabular file containing the image paths - and associate labels (and any additional metadata). - path_name : str - Column name in df_file containing the image paths - label_name : str - Column name in df_file containing the image labels - resize_to : int | None - Size to which images should be resized (square, in pixels) - resize_method : str - Method to use for resizing images. One of "crop", "pad", or "squash". - crop : resize (preserving aspect) such - that smaller size = target size, then center crop - pad: resize (prserving aspect) such that larger size = target size, - then pad with zeros equally on all sides - squash: resize (not presercing aspect) - """ - # Define input nodes - data_frame = ReadDF(input=df_file) - label_list = ReadText(input=label_file) - path_name = Identity(input=path_name) - label_name = Identity(input=label_name) - imagery_root_dir = Identity(input=imagery_root_dir) - resize_to = Identity(input=resize_to) - resize_method = Identity(input=resize_method) - - # Define intermediate nodes - paths = ExtractCol()(data_frame, path_name) - label_text = ExtractCol()(data_frame, label_name) - image_list = ReadImageDir()(imagery_root_dir, paths) - - # Define output nodes - images = ResizeImages()(image_list, resize_to, resize_method) - - label_map = CreateLabelMap()(label_list, label_text) - labels = EncodeLabels()(label_text, label_map) - - # Create and compile the graph - self.graph = DAG(outputs=[images, labels, label_map]) - - def run(self): - """ - Returns - ------- - outputs[0] (images) : np.ndarray - All images found in the dataset, as a numpy array of shape (n, h, w, c) - outputs[1] (labels) : np.nadarray - Encoded labels for training, index-matched to the images array - outputs[2] (mapper) : dict - A dictionary mapping the encoded labels to their canonical names - """ - # Run the graph to get the outputs - result = self.graph.evaluate() - - print("\n\nResult:") - print(result) - return result diff --git a/oxen/python/oxen/loaders/regression.py b/oxen/python/oxen/loaders/regression.py deleted file mode 100644 index 1ef420d..0000000 --- a/oxen/python/oxen/loaders/regression.py +++ /dev/null @@ -1,48 +0,0 @@ -from oxen.dag import DAG - -from oxen.ops import Identity, ReadDF, ExtractCol, ConcatSeries - - -class RegressionLoader: - def __init__(self, data_file, pred_name, f_names): - """ - Extracts and formats relevant features and labels - from a tabular dataset for use in regression tasks. - - Parameters - ---------- - data_file : str - Path to a tabular file containing the input features - and prediction target for a regression task - pred_nam : str - Column name in data_file containing the prediction target - f_names : list - List of column names in data_file containing the input features - """ - # Define input nodes - pred_name = Identity(input=pred_name) - data_frame = ReadDF(input=data_file) - - # Define intermediate nodes - prediction = ExtractCol()(data_frame, pred_name) - extracts = [ExtractCol()(data_frame, Identity(input=col)) for col in f_names] - features = ConcatSeries(name="concat")(*extracts) - - # Create and compile the graph - self.graph = DAG(outputs=[features, prediction]) - - def run(self): - # Run the graph to get the outputs - """ - Returns - --------- - outputs[0] (features) : pl.DataFrame - DataFrame containing only the specified input features - outputs[1] (prediction) : pl.Series - Series containing the prediction target - """ - result = self.graph.evaluate() - - print("\n\nResult:") - print(result) - return result diff --git a/oxen/python/oxen/op.py b/oxen/python/oxen/op.py deleted file mode 100644 index 1c27e54..0000000 --- a/oxen/python/oxen/op.py +++ /dev/null @@ -1,55 +0,0 @@ -import uuid - - -class Op: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - self.id = uuid.uuid4() - self.input = None - self.name = self.__class__.__name__ - - if "name" in kwargs: - self.name = kwargs["name"] - - if "input" in kwargs: - self.input = kwargs["input"] - - # print(f"Creating op {self.name}(args={self.args}, kwargs={self.kwargs})") - - self.parents = [] - - def __repr__(self): - return f"{self.name}({self.args})" - - # Links to the parent Operations that need to run first - def __call__(self, *args): - for arg in args: - # print(f" {self} --parent--> {arg}") - self.parents.append(arg) - return self - - # For the child op to implement - def call(self, _args): - raise NotImplementedError() - - # Combines the data, the args, and the parent inputs, and computes the output - def run(self): - # print("=" * 5) - # print(f"Running {self}") - # print(f"parents {self.parents}") - - # these will be the inputs to the subsequent node call - inputs = [] - if self.input: - inputs.append(self.input) - if self.args: - inputs.append(self.args) - if self.parents: - inputs.extend([node.input for node in self.parents]) - - # transform the inputs into the output - self.input = self.call(inputs) - # print(f"output {self.input}") - # print("=" * 5) - return self.input diff --git a/oxen/python/oxen/ops/__init__.py b/oxen/python/oxen/ops/__init__.py deleted file mode 100644 index 59279c0..0000000 --- a/oxen/python/oxen/ops/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Operations -from .concat_series import ConcatSeries -from .extract_col import ExtractCol -from .identity import Identity -from .read_df import ReadDF -from .read_text import ReadText -from .str_col_template import StrColTemplate -from .read_image_dir import ReadImageDir -from .encode_labels import EncodeLabels -from .create_label_map import CreateLabelMap -from .resize_images import ResizeImages - -# Names of public modules we want to expose -__all__ = [ - "ConcatSeries", - "ExtractCol", - "Identity", - "ReadDF", - "ReadText", - "StrColTemplate", - "ReadImageDir", - "EncodeLabels", - "CreateLabelMap", - "ResizeImages", -] diff --git a/oxen/python/oxen/ops/concat_series.py b/oxen/python/oxen/ops/concat_series.py deleted file mode 100644 index 2dc271e..0000000 --- a/oxen/python/oxen/ops/concat_series.py +++ /dev/null @@ -1,10 +0,0 @@ -import oxen -import polars as pl - - -class ConcatSeries(oxen.Op): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def call(self, args): - return pl.DataFrame(args) diff --git a/oxen/python/oxen/ops/create_label_map.py b/oxen/python/oxen/ops/create_label_map.py deleted file mode 100644 index 0ef5266..0000000 --- a/oxen/python/oxen/ops/create_label_map.py +++ /dev/null @@ -1,28 +0,0 @@ -import oxen - - -class CreateLabelMap(oxen.Op): - """ - Creates a dictionary mapping string labels to integers, - based on a canonical line-separated text file of labels. - - Args: - args[0]: str - String of line-separated labels - args[1]: List[str] - Iterable of labels in data, as a check against unexpected data values - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def call(self, args): - lines = args[0].split("\n") - canonical_labels = set(lines) - data_labels = set(args[1]) - missing_labels = data_labels.difference(canonical_labels) - if len(missing_labels) != 0: - raise ValueError( - f"Some label(s) in data missing from labels file: {missing_labels}" - ) - return {line: i for i, line in enumerate(lines)} diff --git a/oxen/python/oxen/ops/encode_labels.py b/oxen/python/oxen/ops/encode_labels.py deleted file mode 100644 index 5f86194..0000000 --- a/oxen/python/oxen/ops/encode_labels.py +++ /dev/null @@ -1,20 +0,0 @@ -import oxen -import numpy as np - - -class EncodeLabels(oxen.Op): - """ - Maps a column of labels to a column of integers according to a dictionary. - - Args: - args[0] : polars.Series - Polars series of labels to be mapped - args[1] : Dict[str, int] - Dictionary mapping labels to integers - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def call(self, args): - return np.array(args[0].replace(args[1])) diff --git a/oxen/python/oxen/ops/extract_col.py b/oxen/python/oxen/ops/extract_col.py deleted file mode 100644 index 9ec2b25..0000000 --- a/oxen/python/oxen/ops/extract_col.py +++ /dev/null @@ -1,19 +0,0 @@ -import oxen - - -class ExtractCol(oxen.Op): - def __init__(self, *args, **kwargs): - """ - Extracts a column from a DataFrame. - - Args: - args[0]: polars.DataFrame - DataFrame to extract column from - args[1]: str - Name of column to extract - """ - super().__init__(*args, **kwargs) - - def call(self, args): - df, column = args - return df[column] diff --git a/oxen/python/oxen/ops/identity.py b/oxen/python/oxen/ops/identity.py deleted file mode 100644 index 00b3acb..0000000 --- a/oxen/python/oxen/ops/identity.py +++ /dev/null @@ -1,10 +0,0 @@ -import oxen - - -class Identity(oxen.Op): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def call(self, args): - # print(f"identity_fn {args}") - return self.input diff --git a/oxen/python/oxen/ops/read_df.py b/oxen/python/oxen/ops/read_df.py deleted file mode 100644 index 25d6f3c..0000000 --- a/oxen/python/oxen/ops/read_df.py +++ /dev/null @@ -1,18 +0,0 @@ -import oxen - - -class ReadDF(oxen.Op): - """ - Reads a polars DataFrame from a file. - - Args: - args[0] : str - File path to read DataFrame from - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def call(self, args): - # print(f"read_df {args}") - return oxen.util.read_df(args[0]) diff --git a/oxen/python/oxen/ops/read_image_dir.py b/oxen/python/oxen/ops/read_image_dir.py deleted file mode 100644 index 2bd6fe9..0000000 --- a/oxen/python/oxen/ops/read_image_dir.py +++ /dev/null @@ -1,28 +0,0 @@ -import oxen -from tqdm import tqdm -import cv2 - - -class ReadImageDir(oxen.Op): - """ - Reads in imagery as specified by a DataFrame column of paths. - - Args: - args[0] : str - Root imagery directory, is prefixed to DataFrame paths - args[1] : List[str] - Column of paths to imagery (relative to root directory specified in args[0]) - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def call(self, args): - image_data = [] - prefix = args[0] - print("Reading images...") - for path in tqdm(args[1]): - img = cv2.imread(f"{prefix}/{path}", cv2.IMREAD_UNCHANGED) - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - image_data.append(img) - return image_data diff --git a/oxen/python/oxen/ops/read_text.py b/oxen/python/oxen/ops/read_text.py deleted file mode 100644 index 3481469..0000000 --- a/oxen/python/oxen/ops/read_text.py +++ /dev/null @@ -1,19 +0,0 @@ -import oxen -from pathlib import Path - - -class ReadText(oxen.Op): - """ - Reads a text file - - Args: - args[0] : str - File path to read text from - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def call(self, args): - file = args[0] - return Path(file).read_text() diff --git a/oxen/python/oxen/ops/resize_images.py b/oxen/python/oxen/ops/resize_images.py deleted file mode 100644 index 73d455f..0000000 --- a/oxen/python/oxen/ops/resize_images.py +++ /dev/null @@ -1,114 +0,0 @@ -import oxen -import numpy as np -import cv2 -from tqdm import tqdm - - -class ResizeImages(oxen.Op): - """ - Resizes a list of images to a common size for use in computer vision tasks. - - Args: - args[0]: List[np.ndarray] - List of images to resize (height, width, channels) - args[1]: int | None - Height and width dimension for cropping square images - args[2]: str - Method for resizing images to square size. Options are - "crop", "pad", and "squeeze". - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def crop(self, image, size): - # Resize such that shortest side is target size - if image.shape[0] < image.shape[1]: - resized = self._resize_same_aspect(image, height=size) - else: - resized = self._resize_same_aspect(image, width=size) - result = self._center_crop(resized, size, size) - return result.astype(image.dtype) - - def pad(self, image, size, inter=cv2.INTER_LINEAR): - # Resize largest dimension to target size - if image.shape[0] < image.shape[1]: - resized = self._resize_same_aspect(image, width=size, inter=inter) - else: - resized = self._resize_same_aspect(image, height=size, inter=inter) - - color = (0, 0, 0) - result = np.full((size, size, image.shape[2]), color) - - old_height, old_width = resized.shape[:2] - x_center = (size - old_width) // 2 - y_center = (size - old_height) // 2 - - # copy img image into center of result image - result[y_center : y_center + old_height, x_center : x_center + old_width] = ( - resized - ) - - return result.astype(image.dtype) - - def squeeze(self, image, size, inter=cv2.INTER_LINEAR): - result = cv2.resize(image, (size, size), interpolation=inter) - return result.astype(image.dtype) - - def _resize_same_aspect( - self, image, height=None, width=None, inter=cv2.INTER_LINEAR - ): - dim = None - (h, w) = image.shape[:2] - - if width is None and height is None: - return image - - if width is None: - r = height / float(h) - dim = (int(w * r), height) - else: - r = width / float(w) - dim = (width, int(h * r)) - - result = cv2.resize(image, dim, interpolation=inter) - return result - - def _center_crop(self, image, out_height, out_width): - height, width = image.shape[:2] - startx = width // 2 - out_width // 2 - starty = height // 2 - out_height // 2 - - if len(image.shape) > 2: - return image[starty : starty + out_height, startx : startx + out_width, :] - else: - return image[starty : starty + out_height, startx : startx + out_width] - - def call(self, args): - if args[1] is None: - return np.array(args[0]) - - n_channels = args[0][0].shape[2] - out_dtype = args[0][0].dtype - result = np.zeros( - ( - len(args[0]), - args[1], - args[1], - n_channels, - ), - dtype=out_dtype, - ) - - print("Resizing images...") - for i in tqdm(range(len(args[0]))): - if args[2] == "crop": - modified = self.crop(args[0][i], args[1]) - elif args[2] == "pad": - modified = self.pad(args[0][i], args[1]) - elif args[2] == "squeeze": - modified = self.squeeze(args[0][i], args[1]) - else: - raise ValueError(f"Invalid argument {args[2]} for resize_method") - result[i] = modified - return result diff --git a/oxen/python/oxen/ops/str_col_template.py b/oxen/python/oxen/ops/str_col_template.py deleted file mode 100644 index 2cd7608..0000000 --- a/oxen/python/oxen/ops/str_col_template.py +++ /dev/null @@ -1,12 +0,0 @@ -import oxen - - -class StrColTemplate(oxen.Op): - def __init__(self, *args, **kwargs): - self.search = kwargs["search"] if "search" in kwargs else "{prompt}" - super().__init__(*args, **kwargs) - - def call(self, args): - value, column = args - result = column.map_elements(lambda x: value.replace(self.search, x)) - return result diff --git a/oxen/python/oxen/repo.py b/oxen/python/oxen/repo.py index 344f8b9..a6e8d45 100644 --- a/oxen/python/oxen/repo.py +++ b/oxen/python/oxen/repo.py @@ -213,3 +213,9 @@ def current_branch(self): Returns the current branch. """ return self._repo.current_branch() + + def merge(self, branch: str): + """ + Merge a branch into the current branch. + """ + return self._repo.merge(branch) diff --git a/oxen/requirements.txt b/oxen/requirements.txt index 1cc3549..6089845 100644 --- a/oxen/requirements.txt +++ b/oxen/requirements.txt @@ -1,14 +1,10 @@ maturin -numpy>=2.0.0 -opencv-python-headless==4.10.0.84 pandas>=2.0.1 -polars>=1.15.0 +polars>=1.20.0 pyarrow>=18.0.0 pytest-datadir==1.4.1 pytest==8.3.4 requests>=2.32.3 ruff>=0.5.0 -tensorflow>=2.16.1 toml==0.10.2 -torch>=2.2.0 tqdm>=4.67.1 diff --git a/oxen/src/py_repo.rs b/oxen/src/py_repo.rs index f445a05..c7df01a 100644 --- a/oxen/src/py_repo.rs +++ b/oxen/src/py_repo.rs @@ -202,13 +202,21 @@ impl PyRepo { branch: branch.to_string(), subtree_paths: None, depth: None, - all + all, }; repositories::pull_remote_branch(&repo, &fetch_opts).await })?; Ok(()) } + pub fn merge(&self, branch: &str) -> Result, PyOxenError> { + let repo = LocalRepository::from_dir(&self.path)?; + match repositories::merge::merge(&repo, branch)? { + Some(commit) => Ok(Some(PyCommit { commit })), + None => Ok(None), + } + } + // pub fn diff(&self, path: &str) -> Result { // let repo = LocalRepository::from_dir(&self.path)?; // let diff = diff --git a/oxen/tests/conftest.py b/oxen/tests/conftest.py index a4d932b..d302d08 100644 --- a/oxen/tests/conftest.py +++ b/oxen/tests/conftest.py @@ -114,6 +114,16 @@ def house_prices_local_repo_no_commits(shared_datadir): yield repo +@pytest.fixture +def house_prices_local_repo_fully_committed(house_prices_local_repo_no_commits): + repo = house_prices_local_repo_no_commits + + repo.add(os.path.join(repo.path, "prices.csv")) + repo.commit("Add prices.csv") + + yield repo + + @pytest.fixture def empty_remote_repo(): repo_name = f"py-ox/test_repo_{str(uuid.uuid4())}" diff --git a/oxen/tests/test_chat_loader.py b/oxen/tests/test_chat_loader.py deleted file mode 100644 index 9f6b925..0000000 --- a/oxen/tests/test_chat_loader.py +++ /dev/null @@ -1,11 +0,0 @@ -from oxen.loaders import ChatLoader -import os - - -def test_chat_loader(chat_bot_local_repo_no_commits): - repo = chat_bot_local_repo_no_commits - prompt_file = os.path.join(repo.path, "prompt.txt") - data_file = os.path.join(repo.path, "examples.tsv") - loader = ChatLoader(prompt_file, data_file) - result = loader.run()[0] - assert result.height == 6 diff --git a/oxen/tests/test_dataloader_pytorch.py b/oxen/tests/test_dataloader_pytorch.py deleted file mode 100644 index 78b25cc..0000000 --- a/oxen/tests/test_dataloader_pytorch.py +++ /dev/null @@ -1,78 +0,0 @@ -# from torch.utils.data import DataLoader - -# from oxen import Dataset, DataEntry, Features - - -# def test_dataset_load_celeba_train_download( -# celeba_remote_repo_fully_pushed, empty_local_dir -# ): -# _local_repo is the original local repo -# remote_repo is the remote repo we pushed to -# _local_repo, remote_repo = celeba_remote_repo_fully_pushed - -# download the remote dataframe, and load the data into a dataloader -# cache_dir = empty_local_dir - -# "images", "annotations/train.csv" "labels.txt" -# | / | -# | / | -# - -# Image Classification Loader Graph - -# csv_input = oxen.DataFrameLoader("annotations/train.csv") -# images_input = oxen.DirLoader("images") -# labels_input = oxen.FileLoader("labels.txt") - -# image_column = oxen.ColumnExtractor(["image"])(csv_input) -# label_column = oxen.ColumnExtractor(["label"])(csv_input) - -# line_to_idx = oxen.LineToIdx()(label_column, labels_input) - -# image_output = oxen.ImageTensor()(image_column, images_input) -# label_output = oxen.LabelTensor()(label_column, line_to_idx) - -# lag = oxen.LoaderGraph( -# inputs=[csv_input, images_input, labels_input], -# outputs=[image_output, label_output], -# ) - -# LLM Loader Graph - -# csv_input = oxen.DataFrameLoader("train.csv") -# prompt_input = oxen.FileLoader("prompt.txt") - -# prompt_column = oxen.ColumnExtractor(["prompt"])(csv_input) -# response_column = oxen.ColumnExtractor(["response"])(csv_input) - -# prompt_column = oxen.PromptTemplate()(prompt_input, prompt_column) - -# prompt_output = oxen.TextTokenizer()(prompt_column) -# response_output = oxen.TextTokenizer()(response_column) - -# lag = oxen.LoaderGraph( -# inputs=[csv_input, prompt_input], -# outputs=[prompt_output, response_output], -# ) - -# House Price Regression - -# csv_input = oxen.DataFrameLoader("housing.csv") -# price = oxen.ColumnExtractor(["price"])(csv_input) -# features = oxen.ColumnExtractor(["sqft", "num_bed", "num_bath"])(csv_input) - -# lag = oxen.LoaderGraph( -# inputs=[csv_input], -# outputs=[price, features], -# ) - -# dataset = Dataset(remote_repo, lag, cache_dir=cache_dir, download=True) - -# # train_files = ["annotations/train.csv", "images"] -# # dataset.load(train_files, download=True) - -# dataloader = DataLoader(dataset, batch_size=4, shuffle=False) - -# for i, data in enumerate(dataloader, 0): -# # get the inputs; data is a list of [inputs, labels] -# inputs, labels = data diff --git a/oxen/tests/test_image_classification_loader.py b/oxen/tests/test_image_classification_loader.py deleted file mode 100644 index 78b7a1f..0000000 --- a/oxen/tests/test_image_classification_loader.py +++ /dev/null @@ -1,152 +0,0 @@ -import os -import pytest -import tensorflow as tf -import numpy as np -from oxen.loaders import ImageClassificationLoader - - -def test_image_classification_dataloader_local( - celeba_local_repo_fully_committed, empty_local_dir -): - repo = celeba_local_repo_fully_committed - - train_file = os.path.join(repo.path, "annotations", "train.csv") - label_file = os.path.join(repo.path, "annotations", "labels.txt") - - loader = ImageClassificationLoader( - imagery_root_dir=repo.path, - label_file=label_file, - df_file=train_file, - path_name="file", - label_name="hair_color", - ) - data, labels, mapper = loader.run() - - assert data.dtype == np.uint8 - - assert data.shape == (5, 218, 178, 3), "Data not returned in expected shape" - assert labels.shape == (5,) - assert len(mapper.items()) == 3 - - # Test ease of use with pytorch - # torch_data = TensorDataset(torch.from_numpy(data), torch.from_numpy(labels)) - # torch_dl = DataLoader(torch_data, batch_size=1) - # assert len(torch_dl) == 5 - - # Test ease of use with tensorflow - dataset = tf.data.Dataset.from_tensor_slices((data, labels)) - - dataset = dataset.shuffle(buffer_size=len(data)) - dataset = dataset.batch(1) - - assert len(dataset) == 5 - - -def test_image_loader_missing_unique_label( - celeba_local_repo_fully_committed, empty_local_dir -): - repo = celeba_local_repo_fully_committed - train_file = os.path.join(repo.path, "annotations", "test.csv") - label_file = os.path.join(repo.path, "annotations", "labels.txt") - - loader = ImageClassificationLoader( - imagery_root_dir=repo.path, - label_file=label_file, - df_file=train_file, - path_name="file", - label_name="hair_color", - ) - with pytest.raises(ValueError) as e: - data, labels, mapper = loader.run() - - assert "label(s) in data missing" in str(e.value) - - -def test_image_loader_resize_crop( - celeba_local_repo_fully_committed, empty_local_dir, tmp_path -): - repo = celeba_local_repo_fully_committed - train_file = os.path.join(repo.path, "annotations", "train.csv") - label_file = os.path.join(repo.path, "annotations", "labels.txt") - - loader = ImageClassificationLoader( - imagery_root_dir=repo.path, - label_file=label_file, - df_file=train_file, - path_name="file", - label_name="hair_color", - resize_to=512, - resize_method="crop", - ) - - data, labels, mapper = loader.run() - - assert data.dtype == np.uint8 - - np.save(tmp_path / "imgs.npy", data) - assert data.shape == (5, 512, 512, 3), "Data not returned in expected shape" - - -def test_image_loader_resize_squeeze( - celeba_local_repo_fully_committed, empty_local_dir, tmp_path -): - repo = celeba_local_repo_fully_committed - train_file = os.path.join(repo.path, "annotations", "train.csv") - label_file = os.path.join(repo.path, "annotations", "labels.txt") - - loader = ImageClassificationLoader( - imagery_root_dir=repo.path, - label_file=label_file, - df_file=train_file, - path_name="file", - label_name="hair_color", - resize_to=252, - resize_method="squeeze", - ) - data, labels, mapper = loader.run() - np.save(tmp_path / "imgs.npy", data) - print("this this dtype is", data.dtype) - assert data.shape == (5, 252, 252, 3), "Data not returned in expected shape" - - -def test_image_loader_resize_pad( - celeba_local_repo_fully_committed, empty_local_dir, tmp_path -): - repo = celeba_local_repo_fully_committed - train_file = os.path.join(repo.path, "annotations", "train.csv") - label_file = os.path.join(repo.path, "annotations", "labels.txt") - - loader = ImageClassificationLoader( - imagery_root_dir=repo.path, - label_file=label_file, - df_file=train_file, - path_name="file", - label_name="hair_color", - resize_to=111, - resize_method="pad", - ) - data, labels, mapper = loader.run() - np.save(tmp_path / "imgs.npy", data) - print("this dtype is", data.dtype) - assert data.shape == (5, 111, 111, 3), "Data not returned in expected shape" - - -def test_bad_resize_method( - celeba_local_repo_fully_committed, empty_local_dir, tmp_path -): - repo = celeba_local_repo_fully_committed - train_file = os.path.join(repo.path, "annotations", "train.csv") - label_file = os.path.join(repo.path, "annotations", "labels.txt") - - with pytest.raises(ValueError) as e: - print(e) - loader = ImageClassificationLoader( - imagery_root_dir=repo.path, - label_file=label_file, - df_file=train_file, - path_name="file", - label_name="hair_color", - resize_to=111, - resize_method="bad_method", - ) - data, labels, mapper = loader.run() diff --git a/oxen/tests/test_merge.py b/oxen/tests/test_merge.py new file mode 100644 index 0000000..6ce90ab --- /dev/null +++ b/oxen/tests/test_merge.py @@ -0,0 +1,45 @@ +import os + + +def test_merge(house_prices_local_repo_fully_committed): + repo = house_prices_local_repo_fully_committed + + prices_file = "prices.csv" + full_path = os.path.join(repo.path, prices_file) + initial_branch = repo.current_branch.name + + # read initial prices.csv contents + with open(full_path, "r") as f: + initial_contents = f.read() + + # oxen checkout -b new_branch + new_branch = "new_branch" + repo.checkout(new_branch, create=True) + assert repo.current_branch.name == "new_branch" + + # update prices.csv + new_line = "6000000,6000,7,7,2015" + with open(full_path, "a") as f: + f.write(new_line) + + # oxen add prices.csv + repo.add(full_path) + + # oxen commit + repo.commit("Add new price") + with open(full_path, "r") as f: + updated_contents = f.read() + + # oxen checkout main + repo.checkout(initial_branch) + assert repo.current_branch.name == initial_branch + with open(full_path, "r") as f: + contents = f.read() + assert contents == initial_contents + + # oxen merge new_branch + repo.merge(new_branch) + assert repo.current_branch.name == initial_branch + with open(full_path, "r") as f: + contents = f.read() + assert contents == updated_contents diff --git a/oxen/tests/test_regression_loader.py b/oxen/tests/test_regression_loader.py deleted file mode 100644 index 70c1bbf..0000000 --- a/oxen/tests/test_regression_loader.py +++ /dev/null @@ -1,21 +0,0 @@ -from oxen.loaders import RegressionLoader -import os - - -def test_regression_loader(house_prices_local_repo_no_commits): - repo = house_prices_local_repo_no_commits - data_file = os.path.join(repo.path, "prices.csv") - - loader = RegressionLoader(data_file, "price", ["sqft", "num_bed", "num_bath"]) - result = loader.run() - assert len(result) == 2 - - # is a dataframe - assert result[0].height == 5 - assert result[0].width == 3 - - # is a series - assert result[1].len() == 5 - - print(result[0]) - print(result[1]) diff --git a/oxen/tests/test_workspace_commit.py b/oxen/tests/test_workspace_commit.py index a01e8d9..9f37488 100644 --- a/oxen/tests/test_workspace_commit.py +++ b/oxen/tests/test_workspace_commit.py @@ -1,4 +1,5 @@ import os +import pytest from oxen import RemoteRepo, Workspace @@ -18,5 +19,11 @@ def test_commit_one_file( def test_commit_empty(celeba_remote_repo_one_image_pushed: RemoteRepo, shared_datadir): _, remote_repo = celeba_remote_repo_one_image_pushed workspace = Workspace(remote_repo, "main") - workspace.commit("a commit message") - assert len(remote_repo.log()) == 2 + + with pytest.raises(ValueError) as e: + # empty commits in workspace should raise an error + workspace.commit("a commit message") + assert "No changes to commit" in str(e) + + # should still be 1 commit + assert len(remote_repo.log()) == 1