-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #237 from treeverse/lakefs-enterprise-demo-v2.0
Mount demo along with Hugging Face integration
- Loading branch information
Showing
9 changed files
with
861 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
FROM jupyter/scipy-notebook:notebook-7.0.6

# Root is needed to install OS packages below.
USER root

# Install FUSE. Use apt-get throughout (the `apt` front end does not offer a
# stable command-line interface for scripts) and drop the package lists in the
# same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get install -y fuse \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies, installed in the same order as before but in a single
# layer and without keeping pip's download cache. The "vision" extra is pinned
# to the same datasets release so both requirements always agree.
RUN pip install --no-cache-dir lakefs==0.8.0 \
    && pip install --no-cache-dir lakefs-spec==0.11.1 \
    && pip install --no-cache-dir 'datasets[vision]==3.2.0' \
    && pip install --no-cache-dir torchvision==0.17.2

# Disable the "Would you like to receive official Jupyter news?" popup
RUN jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
"""Save the first N rows of a dataset read from a lakeFS Mount back to lakeFS.

The script loads a dataset previously saved with ``save_to_disk`` from
``<mount_location>/<dataset_name>``, keeps the first ``number_of_images`` rows,
writes the result to
``lakefs://<repo_name>/<branch_name>/datasets/<dataset_name>_subset/`` and
commits the branch.
"""
import argparse

import lakefs
from datasets import load_from_disk

# Initialize parser. Every option is required: without this a missing option
# would only surface later as an unrelated TypeError or a bogus "None" path.
parser = argparse.ArgumentParser()
parser.add_argument("--repo_name", required=True, help="lakeFS repository name")
parser.add_argument("--branch_name", required=True, help="lakeFS branch name")
parser.add_argument("--mount_location", required=True, help="Location for lakeFS Mount")
parser.add_argument("--dataset_name", required=True, help="Hugging Face dataset name")
# type=int lets argparse report a non-numeric value as a usage error.
parser.add_argument("--number_of_images", required=True, type=int, help="Number of Images")
args = parser.parse_args()

repo_name = args.repo_name
experimentBranch = args.branch_name
mount_location = args.mount_location
hugging_face_dataset_name = args.dataset_name
number_of_images = args.number_of_images

repo = lakefs.Repository(repo_name)
branchExperiment = repo.branch(experimentBranch)

# Load the dataset from lakeFS Mount location
dataset = load_from_disk(f'{mount_location}/{hugging_face_dataset_name}')

# Select number of images (the first `number_of_images` rows)
dataset = dataset.select(range(number_of_images))
print(dataset)

# Save dataset to lakeFS repo.
# NOTE(review): writing to a lakefs:// URI presumably relies on the fsspec
# filesystem registered by the lakefs-spec package — confirm it is installed.
dataset.save_to_disk(f'lakefs://{repo_name}/{experimentBranch}/datasets/{hugging_face_dataset_name}_subset/')
branchExperiment.commit(message='Uploaded transformed images!', metadata={'using': 'python_sdk'})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Fast Data Loading and Reproducibility for Deep Learning Workloads with lakeFS Mount | ||
|
||
Start by ⭐️ starring the [lakeFS open source](https://go.lakefs.io/oreilly-course) project.
|
||
This repository includes a Jupyter Notebook which you can run on your local machine. | ||
|
||
## Prerequisites | ||
* Docker installed on your local machine | ||
* This demo requires connecting to a lakeFS Server. You can either install lakeFS Server locally (https://docs.lakefs.io/quickstart.html), or spin one up for free on lakeFS Cloud (https://lakefs.cloud).
* Watch [this video](https://www.youtube.com/watch?v=BgKuoa8LAaU) to understand the use case as well as the demo. | ||
* [Contact lakeFS](https://lakefs.io/contact-sales/) to get the lakeFS Everest binary for Linux x86_64 OS. Download and save the binary on your laptop. | ||
|
||
## Setup | ||
|
||
1. Start by cloning this repository: | ||
|
||
```bash | ||
git clone https://github.com/treeverse/lakeFS-samples && cd lakeFS-samples/01_standalone_examples/lakefs-mount-demo | ||
``` | ||
|
||
2. Run the following commands to build and run the Docker container, which includes Python, the Hugging Face datasets library, PyTorch, Jupyter Notebook, and the lakeFS Python client (the Docker image size is around 10GB):
|
||
```bash | ||
docker build -t lakefs-mount-demo . | ||
|
||
docker run -d -p 8892:8888 --privileged --user root -e GRANT_SUDO=yes -v $PWD:/home/jovyan -v $PWD/jupyter_notebook_config.py:/home/jovyan/.jupyter/jupyter_notebook_config.py --name lakefs-mount-demo lakefs-mount-demo | ||
|
||
``` | ||
|
||
3. Copy the Everest binary for Linux x86_64 from your laptop into the "lakeFS-samples/01_standalone_examples/lakefs-mount-demo" folder.
|
||
4. Open JupyterLab UI [http://127.0.0.1:8892/](http://127.0.0.1:8892/) in your web browser. | ||
|
||
## Demo Instructions | ||
|
||
1. Once you have successfully completed the setup, open the "lakeFS Mount Demo" notebook from the JupyterLab UI and follow the instructions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
"""Load a dataset from a lakeFS Mount location and print its summary.

The dataset is read with ``load_from_disk`` from
``<mount_location>/<dataset_name>``.
"""
import argparse

from datasets import load_from_disk

# Initialize parser. Both options are required: without this a missing option
# would silently produce a path such as "None/None".
parser = argparse.ArgumentParser()
parser.add_argument("--mount_location", required=True, help="Location for lakeFS Mount")
parser.add_argument("--dataset_name", required=True, help="Hugging Face dataset name")
args = parser.parse_args()

mount_location = args.mount_location
hugging_face_dataset_name = args.dataset_name

# Load the dataset from lakeFS Mount location
dataset = load_from_disk(f'{mount_location}/{hugging_face_dataset_name}')
print(dataset)
41 changes: 41 additions & 0 deletions
41
01_standalone_examples/lakefs-mount-demo/assets/lakefs_demo.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import os | ||
|
||
def print_diff(diff):
    """Print the entries of ``diff`` as a plain-text table.

    Args:
        diff: Iterable of objects exposing ``path``, ``path_type``,
            ``size_bytes`` and ``type`` attributes; one table row is
            printed per entry.
    """
    # One row per entry, columns in the same order as the headers below.
    rows = (
        [entry.path, entry.path_type, entry.size_bytes, entry.type]
        for entry in diff
    )

    from tabulate import tabulate

    column_titles = ['Path', 'Path Type', 'Size(Bytes)', 'Type']
    print(tabulate(rows, headers=column_titles))
|
||
def print_commit(log):
    """Print the fields of a commit record in a human-readable form.

    Args:
        log: Object exposing ``message``, ``id``, ``committer``,
            ``creation_date``, ``parents`` and ``metadata`` attributes.
            ``creation_date`` is interpreted as a Unix timestamp and
            printed in UTC.
    """
    from datetime import datetime, timezone
    from pprint import pprint

    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware conversion below yields the same UTC wall-clock time.
    created_at = datetime.fromtimestamp(log.creation_date, tz=timezone.utc)

    print('Message:', log.message)
    print('ID:', log.id)
    print('Committer:', log.committer)
    print('Creation Date:', created_at.strftime('%Y-%m-%d %H:%M:%S'))
    print('Parents:', log.parents)
    print('Metadata:')
    pprint(log.metadata)
|
||
def lakefs_ui_endpoint(lakefsEndPoint):
    """Return the endpoint URL with a container-only host mapped to 127.0.0.1.

    When the endpoint is an ``http://`` URL whose host is exactly
    ``host.docker.internal`` or ``lakefs``, that host is replaced with
    ``127.0.0.1``. Any other endpoint is returned unchanged.

    Only the host part is rewritten: other occurrences of ``lakefs`` in the
    URL (for example in the path) and hosts that merely start with ``lakefs``
    (for example ``lakefs.example.com``) are left alone.

    Args:
        lakefsEndPoint: Endpoint URL as a string.

    Returns:
        The endpoint URL, possibly with its host replaced.
    """
    scheme = 'http://'
    for host in ('host.docker.internal', 'lakefs'):
        prefix = scheme + host
        if not lakefsEndPoint.startswith(prefix):
            continue
        remainder = lakefsEndPoint[len(prefix):]
        # The host name must end here: either the URL ends, or a port, path,
        # query or fragment follows. Otherwise it is a different, longer host.
        if remainder[:1] in ('', ':', '/', '?', '#'):
            return scheme + '127.0.0.1' + remainder

    return lakefsEndPoint
|
||
def upload_objects(branch, local_path):
    """Upload every ``.jpg`` file found under ``~/<local_path>`` to ``branch``.

    The directory tree is walked recursively. Each file is uploaded to the
    object path ``<local_path>/<containing directory name>/<file name>``;
    only the last component of the containing directory is used, so deeper
    nesting is not reflected in the object path. The value returned by each
    upload call is printed.

    Args:
        branch: Object providing ``object(path)`` whose result has an
            ``upload(data=..., mode=..., pre_sign=...)`` method.
        local_path: Directory to scan, relative to the user's home directory;
            also used as the prefix of the uploaded object paths.
    """
    root_dir = os.path.expanduser('~') + '/' + local_path
    for path, subdirs, files in os.walk(root_dir):
        folder = os.path.basename(path)
        for file in files:
            if not file.endswith(".jpg"):
                continue
            # Read the whole file, closing the handle before uploading.
            with open(os.path.join(path, file), 'rb') as source:
                contentToUpload = source.read()
            destination = local_path + '/' + folder + '/' + file
            print(branch.object(destination).upload(data=contentToUpload, mode='wb', pre_sign=False))
4 changes: 4 additions & 0 deletions
4
01_standalone_examples/lakefs-mount-demo/jupyter_notebook_config.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Configuration file for jupyter-notebook.
#
# `c` is not defined in this file.
# NOTE(review): presumably it is the configuration object Jupyter provides
# when it loads this file — confirm.
#
# The settings below set an empty password, an empty token, and '*' as the
# listen address.
# NOTE(review): this looks like it removes authentication and exposes the
# server on every interface, which is only acceptable for a local demo
# container — confirm before reusing elsewhere.
c.NotebookApp.password = ''
c.NotebookApp.token = ''
c.NotebookApp.ip='*'
Oops, something went wrong.