Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

upgrade base container to nemo:23.10 #42

Merged
merged 10 commits into from
Jan 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/gpu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
gpu-ci:
runs-on: linux-amd64-gpu-p100-latest-1
container:
image: nvcr.io/nvidian/crossfit-ci:23.09
image: nvcr.io/nvidian/crossfit-ci:23.10
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
options: --shm-size=1G
Expand All @@ -32,7 +32,7 @@ jobs:
benchmark:
runs-on: linux-amd64-gpu-p100-latest-1
container:
image: nvcr.io/nvidian/crossfit-ci:23.09
image: nvcr.io/nvidian/crossfit-ci:23.10
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
options: --shm-size=1G
Expand Down
2 changes: 2 additions & 0 deletions crossfit/backend/torch/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def call_on_worker(self, worker, *args, **kwargs):
return worker.torch_model(*args, **kwargs)

def get_model(self, worker):
if not hasattr(worker, "torch_model"):
self.load_on_worker(worker)
return worker.torch_model

def estimate_memory(self, max_num_tokens: int, batch_size: int) -> int:
Expand Down
5 changes: 1 addition & 4 deletions crossfit/backend/torch/op/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,6 @@ def __init__(
self.model_output_col = model_output_col
self.pred_output_col = pred_output_col

def setup(self):
self.model.load_on_worker(self)

@torch.no_grad()
def call(self, data, partition_info=None):
index = data.index
Expand All @@ -72,7 +69,7 @@ def call(self, data, partition_info=None):
)

all_outputs_ls = []
for output in loader.map(self.model.get_model(self)):
for output in loader.map(self.model.get_model(self.get_worker())):
if isinstance(output, dict):
if self.model_output_col not in output:
raise ValueError(f"Column '{self.model_outupt_col}' not found in model output.")
Expand Down
30 changes: 3 additions & 27 deletions crossfit/op/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# limitations under the License.

import inspect
import uuid

import dask.dataframe as dd
from dask.distributed import get_worker, wait
Expand All @@ -27,7 +26,7 @@ def __init__(self, pre=None, cols=False, keep_cols=None):
self.pre = pre
self.cols = cols
self.keep_cols = keep_cols or []
self.id = str(uuid.uuid4())
self.worker_name = getattr(self.get_worker(), "name", 0)

def setup(self):
pass
Expand All @@ -46,29 +45,6 @@ def get_worker(self):

return worker

def _get_init_name(self):
init_name = f"setup_done_{self.id}"
return init_name

def setup_worker(self):
worker = self.get_worker()

self.worker_name = getattr(worker, "name", 0)
init_name = self._get_init_name()

if not hasattr(worker, init_name):
self.setup()
setattr(worker, init_name, True)

def teardown_worker(self):
worker = self.get_worker()

init_name = self._get_init_name()

if hasattr(worker, init_name):
delattr(worker, init_name)
self.teardown()

def call_dask(self, data: dd.DataFrame):
output = data.map_partitions(self, meta=self._build_dask_meta(data))

Expand Down Expand Up @@ -101,10 +77,10 @@ def add_keep_cols(self, data, output):
def __call__(self, data, *args, partition_info=None, **kwargs):
if isinstance(data, dd.DataFrame):
output = self.call_dask(data, *args, **kwargs)
self.teardown_worker()
self.teardown()
return output

self.setup_worker()
self.setup()

if self.pre is not None:
params = inspect.signature(self.pre).parameters
Expand Down
21 changes: 15 additions & 6 deletions crossfit/op/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,18 @@ def __init__(
self.model = model
self.max_length = max_length or model.max_seq_length()

# Make sure we download the tokenizer just once
GPUTokenizer.from_pretrained(self.model)

def setup(self):
self.tokenizer = GPUTokenizer.from_pretrained(self.model)
self.setup()

def tokenize_strings(self, sentences, max_length=None):
return self.tokenizer(
worker = self.get_worker()

if hasattr(worker, "tokenizer"):
tokenizer = worker.tokenizer
else:
tokenizer = GPUTokenizer.from_pretrained(self.model)
worker.tokenizer = tokenizer

return worker.tokenizer(
sentences,
max_length=max_length or self.max_length,
max_num_rows=len(sentences),
Expand All @@ -56,6 +60,11 @@ def tokenize_strings(self, sentences, max_length=None):
add_special_tokens=True,
)

def teardown(self):
worker = self.get_worker()
if hasattr(worker, "tokenizer"):
delattr(worker, "tokenizer")

def call_column(self, data):
if isinstance(data, cudf.DataFrame):
raise ValueError(
Expand Down
2 changes: 0 additions & 2 deletions crossfit/report/beir/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ def embed(
else:
return EmbeddingDatataset.from_dir(emb_dir, data=dataset)

dfs = []
for dtype in ["query", "item"]:
if os.path.exists(os.path.join(emb_dir, dtype)):
continue
Expand All @@ -76,7 +75,6 @@ def embed(
embeddings = pipe(df)

embeddings.to_parquet(os.path.join(emb_dir, dtype))
dfs.append(df)

output: EmbeddingDatataset = EmbeddingDatataset.from_dir(emb_dir, data=dataset)
pred_path = os.path.join(emb_dir, "predictions")
Expand Down
5 changes: 3 additions & 2 deletions docker/ci/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
FROM nvcr.io/nvidia/pytorch:23.09-py3
FROM nvcr.io/nvidia/nemo:23.10

COPY . /tmp/crossfit/
RUN cd /tmp/crossfit && \
pip install .[pytorch-dev] && \
python3 -m pip install .[pytorch-dev] && \
python3 -m pip install beir && \
rm -r /tmp/crossfit

ENV CF_HOME /root/.cf
Expand Down
2 changes: 1 addition & 1 deletion docker/ci/build_and_push.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
set -e

IMAGE_NAME=nvcr.io/nvidian/crossfit-ci
IMAGE_TAG=23.09
IMAGE_TAG=23.10

docker build -t ${IMAGE_NAME}:${IMAGE_TAG} -f docker/ci/Dockerfile .

Expand Down
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ typing_extensions
typing_utils
tqdm
rich
pynvml>=11.0.0,<11.5
pynvml>=11.0.0,<11.5
1 change: 1 addition & 0 deletions requirements/pytorch.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ torch>=1.0
transformers
curated-transformers
bitsandbytes
sentence-transformers
Loading