diff --git a/fastsafetensors/__init__.py b/fastsafetensors/__init__.py index f005e5e..5fcf6c8 100644 --- a/fastsafetensors/__init__.py +++ b/fastsafetensors/__init__.py @@ -1,6 +1,10 @@ # Copyright 2024 IBM Inc. All rights reserved # SPDX-License-Identifier: Apache-2.0 +from importlib.metadata import version + +__version__ = version(__name__) + from .common import SafeTensorsMetadata, SingleGroup, TensorFrame, get_device_numa_node from .file_buffer import FilesBufferOnDevice from .loader import SafeTensorsFileLoader, fastsafe_open diff --git a/fastsafetensors/common.py b/fastsafetensors/common.py index 1855b1b..d369975 100644 --- a/fastsafetensors/common.py +++ b/fastsafetensors/common.py @@ -3,7 +3,7 @@ import json import os -import platform +import sys from collections import OrderedDict from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -15,7 +15,7 @@ def get_device_numa_node(device: Optional[int]) -> Optional[int]: - if device is None or platform.system() != "Linux": + if device is None or not sys.platform.startswith("linux"): return None pci_addr = fstcpp.get_device_pci_bus(device) if pci_addr == "": diff --git a/fastsafetensors/cpp.pyi b/fastsafetensors/cpp.pyi index 6222b6c..7e83691 100644 --- a/fastsafetensors/cpp.pyi +++ b/fastsafetensors/cpp.pyi @@ -9,6 +9,7 @@ class gds_device_buffer: self, dst_off: int, src_off: int, tmp: "gds_device_buffer", length: int ) -> int: ... def get_base_address(self) -> int: ... + def get_length(self) -> int: ... class nogds_file_reader: def __init__( @@ -35,6 +36,11 @@ class gds_file_reader: ) -> int: ... def wait_read(self, id: int) -> int: ... +class cpp_metrics: + bounce_buffer_bytes: int + + def __init__(self) -> None: ... + def is_cuda_found() -> bool: ... def is_cufile_found() -> bool: ... def cufile_version() -> int: ... @@ -50,3 +56,4 @@ def cpu_free(addr: int) -> None: ... def gpu_malloc(length: int) -> int: ... def gpu_free(addr: int) -> None: ... def load_nvidia_functions() -> None: ... +def get_cpp_metrics() -> cpp_metrics: ... diff --git a/fastsafetensors/cpp/ext.cpp b/fastsafetensors/cpp/ext.cpp index bf47410..4f08894 100644 --- a/fastsafetensors/cpp/ext.cpp +++ b/fastsafetensors/cpp/ext.cpp @@ -16,6 +16,8 @@ static bool debug_log = false; +static cpp_metrics_t mc = {.bounce_buffer_bytes = 0}; + /* cpu_mode functions: for tests and debugs */ static CUfileError_t cpu_cuFileDriverOpen() { return CUfileError_t{.err = CU_FILE_SUCCESS}; } @@ -255,7 +257,7 @@ int init_gds() } if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] init_gds: cuFileDriverOpen=%lld us\n", + std::printf("[DEBUG] init_gds: cuFileDriverOpen=%" PRId64 " us\n", std::chrono::duration_cast(end - begin).count()); } return 0; @@ -275,7 +277,7 @@ int close_gds() } if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] close_gds: cuFileDriverClose, elapsed=%lld us\n", + std::printf("[DEBUG] close_gds: cuFileDriverClose, elapsed=%" PRId64 " us\n", std::chrono::duration_cast(end - begin).count()); } return 0; @@ -352,7 +354,7 @@ const int gds_device_buffer::cufile_register(uint64_t offset, uint64_t length) { } if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] gds_device_buffer.cufile_register: addr=%p, offset=%" PRIu64 ", length=%" PRIu64 ", register=%lld us\n", dst, offset, length, + std::printf("[DEBUG] gds_device_buffer.cufile_register: addr=%p, offset=%" PRIu64 ", length=%" PRIu64 ", register=%" PRId64 " us\n", dst, offset, length, std::chrono::duration_cast(end - begin_register).count()); } return 0; @@ -369,7 +371,7 @@ const int gds_device_buffer::cufile_deregister(uint64_t offset) { } if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] gds_device_buffer.cufile_deregister: addr=%p, offset=%" PRIu64 ", elapsed=%lld us\n", dst, offset, + std::printf("[DEBUG] gds_device_buffer.cufile_deregister: addr=%p, offset=%" PRIu64 ", elapsed=%" PRId64 " us\n", dst, offset, std::chrono::duration_cast(end - begin).count()); } return 0; @@ -410,7 +412,7 @@ const int gds_device_buffer::memmove(uint64_t _dst_off, uint64_t _src_off, const } if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] gds_device_buffer.memmove: dst=%p, src=%p, tmp=%p, length=%" PRIu64 ", elapsed=%lld us\n", dst, src, tmp, length, + std::printf("[DEBUG] gds_device_buffer.memmove: dst=%p, src=%p, tmp=%p, length=%" PRIu64 ", elapsed=%" PRId64 " us\n", dst, src, tmp, length, std::chrono::duration_cast(end - begin).count()); } return 0; @@ -434,7 +436,7 @@ void nogds_file_reader::_thread(const int thread_id, ext_funcs_t *fns, const int } if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] nogds_file_reader._thread: mmap, fd=%d, offset=%" PRIu64 ", length=%" PRIu64 ", elapsed=%lld us\n", + std::printf("[DEBUG] nogds_file_reader._thread: mmap, fd=%d, offset=%" PRIu64 ", length=%" PRIu64 ", elapsed=%" PRId64 " us\n", fd, offset, length, std::chrono::duration_cast(end - begin).count()); } } @@ -469,7 +471,7 @@ void nogds_file_reader::_thread(const int thread_id, ext_funcs_t *fns, const int count += c; if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] nogds_file_reader._thread: read (mmap=%d), fd=%d, offset=%" PRIu64 ", count=%" PRIi64 ", c=%" PRIi64 ", copy=%lld us, cuda_copy=%lld us\n", + std::printf("[DEBUG] nogds_file_reader._thread: read (mmap=%d), fd=%d, offset=%" PRIu64 ", count=%" PRIi64 ", c=%" PRIi64 ", copy=%" PRId64 " us, cuda_copy=%" PRId64 " us\n", s->_use_mmap, fd, offset, count, c, std::chrono::duration_cast(memcpy_begin - begin).count(), std::chrono::duration_cast(end - memcpy_begin).count()); } } @@ -500,15 +502,18 @@ const int nogds_file_reader::submit_read(const int fd, const gds_device_buffer& if (this->_s._read_buffer == nullptr) { cudaError_t err; std::chrono::steady_clock::time_point alloc_begin = std::chrono::steady_clock::now(); - err = _fns->cudaHostAlloc(&this->_s._read_buffer, this->_s._bbuf_size_kb * 1024 * this->_s._max_threads, 0); + auto buf_len = this->_s._bbuf_size_kb * 1024 * this->_s._max_threads; + err = _fns->cudaHostAlloc(&this->_s._read_buffer, buf_len, 0); if (err != cudaSuccess) { - std::printf("nogds_file_reader.submit_read: cudaHostAlloc(%" PRIi64 ") failed\n", this->_s._bbuf_size_kb * 1024 * this->_s._max_threads); + std::printf("nogds_file_reader.submit_read: cudaHostAlloc(%" PRIi64 ") failed\n", buf_len); return -1; } + mc.bounce_buffer_bytes += buf_len; if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] nogds_file_reader.submit_read: cudaHostAlloc, size=%" PRIi64 ", elapsed=%lld us\n", - this->_s._bbuf_size_kb * 1024, std::chrono::duration_cast(end - alloc_begin).count()); + std::printf("[DEBUG] nogds_file_reader.submit_read: cudaHostAlloc, addr=%p, size=%" PRIi64 ", elapsed=%" PRId64 " us\n", + reinterpret_cast(this->_s._read_buffer), + buf_len, std::chrono::duration_cast(end - alloc_begin).count()); } } std::thread *t = this->_threads[thread_id % this->_s._max_threads]; @@ -540,8 +545,14 @@ const uintptr_t nogds_file_reader::wait_read(const int thread_id) { nogds_file_reader::~nogds_file_reader() { std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); if (this->_s._read_buffer != nullptr) { + auto buf_len = this->_s._bbuf_size_kb * 1024 * this->_s._max_threads; _fns->cudaFreeHost(this->_s._read_buffer); + if (debug_log) { + std::printf("[DEBUG] cudaFreeHost, addr=%p, size=%" PRIi64 "\n", + reinterpret_cast(this->_s._read_buffer), buf_len); + } this->_s._read_buffer = nullptr; + mc.bounce_buffer_bytes -= buf_len; } if (this->_threads != nullptr) { for (uint64_t i = 0; i < this->_s._max_threads; ++i) { @@ -556,7 +567,7 @@ nogds_file_reader::~nogds_file_reader() { } if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] ~nogds_file_reader: elapsed=%lld us\n", + std::printf("[DEBUG] ~nogds_file_reader: elapsed=%" PRId64 " us\n", std::chrono::duration_cast(end - begin).count()); } } @@ -595,7 +606,7 @@ raw_gds_file_handle::raw_gds_file_handle(std::string filename, bool o_direct, bo } if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] raw_gds_file_handle: fd=%d, cf_handle=%p, elapsed=%lld us\n", fd, cf_handle, + std::printf("[DEBUG] raw_gds_file_handle: fd=%d, cf_handle=%p, elapsed=%" PRId64 " us\n", fd, cf_handle, std::chrono::duration_cast(end - begin).count()); } this->_cf_handle = cf_handle; @@ -650,7 +661,7 @@ void gds_file_reader::_thread(const int thread_id, ext_funcs_t *fns, const gds_f } if (debug_log) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::printf("[DEBUG] gds_file_reader._thread: fh=%p, offset=%" PRIu64 ", length=%" PRIu64 ", count=%zd, read=%lld us, notify=%lld us\n", + std::printf("[DEBUG] gds_file_reader._thread: fh=%p, offset=%" PRIu64 ", length=%" PRIu64 ", count=%zd, read=%" PRId64" us, notify=%" PRId64 " us\n", fh._get_cf_handle(), offset, length, count, std::chrono::duration_cast(begin_notify - begin).count(), std::chrono::duration_cast(end - begin_notify).count()); @@ -699,6 +710,10 @@ const ssize_t gds_file_reader::wait_read(const int id) { return ret; } +cpp_metrics_t get_cpp_metrics() { + return mc; +} + // Bindings PYBIND11_MODULE(__MOD_NAME__, m) @@ -718,13 +733,15 @@ PYBIND11_MODULE(__MOD_NAME__, m) m.def("gpu_malloc", &gpu_malloc); m.def("gpu_free", &gpu_free); m.def("load_nvidia_functions", &load_nvidia_functions); + m.def("get_cpp_metrics", &get_cpp_metrics); pybind11::class_(m, "gds_device_buffer") .def(pybind11::init()) .def("cufile_register", &gds_device_buffer::cufile_register) .def("cufile_deregister", &gds_device_buffer::cufile_deregister) .def("memmove", &gds_device_buffer::memmove) - .def("get_base_address", &gds_device_buffer::get_base_address); + .def("get_base_address", &gds_device_buffer::get_base_address) + .def("get_length", &gds_device_buffer::get_length); pybind11::class_(m, "nogds_file_reader") .def(pybind11::init()) @@ -738,4 +755,8 @@ PYBIND11_MODULE(__MOD_NAME__, m) .def(pybind11::init()) .def("submit_read", &gds_file_reader::submit_read) .def("wait_read", &gds_file_reader::wait_read); + + pybind11::class_(m, "cpp_metrics") + .def(pybind11::init<>()) + .def_readwrite("bounce_buffer_bytes", &cpp_metrics_t::bounce_buffer_bytes); } diff --git a/fastsafetensors/cpp/ext.hpp b/fastsafetensors/cpp/ext.hpp index 465d4f7..eafd24c 100644 --- a/fastsafetensors/cpp/ext.hpp +++ b/fastsafetensors/cpp/ext.hpp @@ -104,6 +104,9 @@ class gds_device_buffer { const uintptr_t get_base_address() const { return this->_devPtr_base->get_uintptr(); } + const uint64_t get_length() const { + return _length; + } }; class nogds_file_reader { @@ -198,4 +201,8 @@ typedef struct ext_funcs { int (*numa_run_on_node)(int); } ext_funcs_t; +typedef struct cpp_metrics { + size_t bounce_buffer_bytes; +} cpp_metrics_t; + #endif //__EXT_HPP__ \ No newline at end of file diff --git a/fastsafetensors/frameworks/__init__.py b/fastsafetensors/frameworks/__init__.py index de13790..8c6b258 100644 --- a/fastsafetensors/frameworks/__init__.py +++ b/fastsafetensors/frameworks/__init__.py @@ -163,15 +163,16 @@ def randn(self, s: tuple, device: Device, dtype: DType) -> T: def support_fp8(self) -> bool: pass + @abstractmethod + def get_mem_used(self) -> int: + pass + def get_framework_op(name: str) -> FrameworkOpBase: if name == "pt" or name == "pytorch" or name == "torch": - from ._torch import TorchOp - - return TorchOp() + from ._torch import get_framework_op as op elif name == "paddle": - from ._paddle import PaddleOp - - return PaddleOp() + from ._paddle import get_framework_op as op else: raise Exception(f"Unknown framework name: {name}") + return op() diff --git a/fastsafetensors/frameworks/_paddle.py b/fastsafetensors/frameworks/_paddle.py index 0293d6d..13f5eec 100644 --- a/fastsafetensors/frameworks/_paddle.py +++ b/fastsafetensors/frameworks/_paddle.py @@ -140,6 +140,9 @@ def recv( class PaddleOp(FrameworkOpBase[PaddleTensor, PaddleProcessGroup]): + def __init__(self) -> None: + self.mem_used = 0 + def get_name(self) -> str: return "paddle" @@ -173,13 +176,16 @@ def alloc_tensor_memory(self, length: int, dev: Device) -> gds_device_buffer: rbuf = gpu_malloc(length) else: rbuf = cpu_malloc(length) + self.mem_used += length return gds_device_buffer(rbuf, length, dev.type == DeviceType.GPU) def free_tensor_memory(self, gbuf: gds_device_buffer, dev: Device) -> None: + length = gbuf.get_length() if dev.type == DeviceType.GPU: gpu_free(gbuf.get_base_address()) else: cpu_free(gbuf.get_base_address()) + self.mem_used -= length def get_empty_tensor( self, shape: List[int], dtype: DType, device: Device @@ -248,3 +254,16 @@ def randn(self, s: tuple, device: Device, dtype: DType) -> PaddleTensor: def support_fp8(self) -> bool: return DType.F8_E5M2 in dtype_convert + + def get_mem_used(self) -> int: + return self.mem_used + + +_op: Optional[PaddleOp] = None + + +def get_framework_op() -> FrameworkOpBase: + global _op + if _op is None: + _op = PaddleOp() + return _op diff --git a/fastsafetensors/frameworks/_torch.py b/fastsafetensors/frameworks/_torch.py index 1b77987..1487153 100644 --- a/fastsafetensors/frameworks/_torch.py +++ b/fastsafetensors/frameworks/_torch.py @@ -133,6 +133,9 @@ def recv( class TorchOp(FrameworkOpBase[TorchTensor, TorchProcessGroup]): + def __init__(self) -> None: + self.mem_used = 0 + def get_name(self) -> str: return "pytorch" @@ -149,13 +152,16 @@ def alloc_tensor_memory(self, length: int, dev: Device) -> gds_device_buffer: rbuf = torch.cuda.caching_allocator_alloc(length) else: rbuf = cpu_malloc(length) + self.mem_used += length return gds_device_buffer(rbuf, length, dev.type == DeviceType.CUDA) def free_tensor_memory(self, gbuf: gds_device_buffer, dev: Device): + length = gbuf.get_length() if dev.type == DeviceType.CUDA: torch.cuda.caching_allocator_delete(gbuf.get_base_address()) else: cpu_free(gbuf.get_base_address()) + self.mem_used -= length def get_empty_tensor( self, shape: List[int], dtype: DType, device: Device @@ -218,3 +224,16 @@ def randn(self, s: tuple, device: Device, dtype: DType) -> TorchTensor: def support_fp8(self) -> bool: return DType.F8_E5M2 in dtype_convert + + def get_mem_used(self): + return self.mem_used + + +_op: Optional[TorchOp] = None + + +def get_framework_op() -> FrameworkOpBase: + global _op + if _op is None: + _op = TorchOp() + return _op diff --git a/fastsafetensors/loader.py b/fastsafetensors/loader.py index 8e4b9b7..60959fa 100644 --- a/fastsafetensors/loader.py +++ b/fastsafetensors/loader.py @@ -91,6 +91,7 @@ def reset(self): def close(self): self.reset() + del self.reader def get_keys(self) -> List[str]: return list(self.frames.keys()) diff --git a/fastsafetensors/tensor_factory.py b/fastsafetensors/tensor_factory.py index 43b7403..839d188 100644 --- a/fastsafetensors/tensor_factory.py +++ b/fastsafetensors/tensor_factory.py @@ -50,6 +50,8 @@ def __init__( def submit_io(self, use_buf_register: bool, max_copy_block_size: int): if self.copier is not None: self.gbuf = self.copier.submit_io(use_buf_register, max_copy_block_size) + if self.gbuf and self.debug_log: + print(f"submit_io: new buf, addr={self.gbuf.get_base_address():#x}") def wait_io(self, dtype: DType = DType.AUTO, noalign: bool = False): if self.copier is not None and self.gbuf is not None: @@ -218,4 +220,8 @@ def free_dev_ptrs(self): self.tensors = {} if self.gbuf is not None: self.framework.free_tensor_memory(self.gbuf, self.device) + if self.debug_log: + print( + f"free_dev_ptrs: delete buf, addr={self.gbuf.get_base_address():#x}" + ) self.gbuf = None diff --git a/pyproject.toml b/pyproject.toml index dd3be8c..bca282e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "fastsafetensors" -version = "0.1.14" +version = "0.1.15" description = "High-performance safetensors model loader" authors = [{name = "Takeshi Yoshimura", email = "tyos@jp.ibm.com"}] maintainers = [{name = "Takeshi Yoshimura", email = "tyos@jp.ibm.com"}] diff --git a/tests/conftest.py b/tests/conftest.py index 5018399..96d2f95 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,7 @@ from fastsafetensors import SingleGroup from fastsafetensors import cpp as fstcpp +from fastsafetensors.cpp import load_nvidia_functions from fastsafetensors.frameworks import FrameworkOpBase, get_framework_op from fastsafetensors.st_types import Device @@ -16,6 +17,7 @@ os.makedirs(TF_DIR, 0o777, True) os.makedirs(TMP_DIR, 0o777, True) +load_nvidia_functions() FRAMEWORK = get_framework_op(os.getenv("TEST_FASTSAFETENSORS_FRAMEWORK", "please set")) diff --git a/tests/test_fastsafetensors.py b/tests/test_fastsafetensors.py index 8ae32a1..dcbdccd 100644 --- a/tests/test_fastsafetensors.py +++ b/tests/test_fastsafetensors.py @@ -86,7 +86,6 @@ def run_nogds_file_read( def test_device(fstcpp_log) -> None: - print("test_device") with pytest.raises(ValueError, match="Unknown device type: aaaa"): Device.from_str("aaaa:0") with pytest.raises(ValueError, match="Invalid index: -xxx"): @@ -101,7 +100,6 @@ def test_device(fstcpp_log) -> None: def test_framework(fstcpp_log, framework) -> None: - print("test_framework") t = framework.get_empty_tensor([1], DType.F16, Device.from_str("cpu")) with pytest.raises(Exception): framework.is_equal(t, [float(0.0)]) @@ -121,6 +119,13 @@ def test_framework(fstcpp_log, framework) -> None: assert framework.get_cuda_ver() == cuda_ver +def test_get_framework_fail(fstcpp_log) -> None: + from fastsafetensors.frameworks import get_framework_op + + with pytest.raises(Exception, match="Unknown framework name"): + get_framework_op("aaaaa") + + def make_header_bytes(s: str): header = s.encode("utf-8") n = len(header) @@ -199,6 +204,9 @@ def test_load_metadata_and_dlpack(fstcpp_log, input_files, framework) -> None: if not printed: print(actual_meta.__repr__()) printed = True + framework.free_tensor_memory(gbuf, device) + assert framework.get_mem_used() == 0 + assert fstcpp.get_cpp_metrics().bounce_buffer_bytes == 0 def test_set_debug_log() -> None: @@ -237,6 +245,7 @@ def test_alloc_gds_buffer(fstcpp_log, framework) -> None: gbuf = framework.alloc_tensor_memory(1024, device) addr = gbuf.get_base_address() assert addr != 0 + framework.free_tensor_memory(gbuf, device) def test_cufile_register_deregister(fstcpp_log, framework) -> None: @@ -247,6 +256,7 @@ def test_cufile_register_deregister(fstcpp_log, framework) -> None: assert gbuf.cufile_register(256, 1024 - 256) == 0 assert gbuf.cufile_deregister(0) == 0 assert gbuf.cufile_deregister(256) == 0 + framework.free_tensor_memory(gbuf, device) def test_memmove(fstcpp_log, framework) -> None: @@ -261,6 +271,9 @@ def test_memmove(fstcpp_log, framework) -> None: # However, this piece of memory itself is only 1024. # After offsetting by 12, there is no 1024 left in the remaining memory. # This part really puzzles me. So I change the moving size to 256*3 (<1024) + framework.free_tensor_memory(gbuf, device) + framework.free_tensor_memory(tmp, device) + assert framework.get_mem_used() == 0 def test_nogds_file_reader(fstcpp_log, input_files, framework) -> None: @@ -287,6 +300,10 @@ def test_nogds_file_reader(fstcpp_log, input_files, framework) -> None: for req in reqs: assert reader.wait_read(req) > 0 os.close(fd) + framework.free_tensor_memory(gbuf, device) + del reader + assert framework.get_mem_used() == 0 + assert fstcpp.get_cpp_metrics().bounce_buffer_bytes == 0 def test_NoGdsFileCopier(fstcpp_log, input_files, framework) -> None: @@ -301,6 +318,10 @@ def test_NoGdsFileCopier(fstcpp_log, input_files, framework) -> None: actual = tensors[key] assert framework.is_equal(actual, exp) framework.free_tensor_memory(gbuf, device) + del copier + del reader + assert framework.get_mem_used() == 0 + assert fstcpp.get_cpp_metrics().bounce_buffer_bytes == 0 def test_GdsFileCopier(fstcpp_log, input_files, framework) -> None: @@ -315,6 +336,9 @@ def test_GdsFileCopier(fstcpp_log, input_files, framework) -> None: actual = tensors[key] assert framework.is_equal(actual, exp) framework.free_tensor_memory(gbuf, device) + del reader + assert framework.get_mem_used() == 0 + assert fstcpp.get_cpp_metrics().bounce_buffer_bytes == 0 def test_SafeTensorsFileLoader(fstcpp_log, input_files, framework) -> None: @@ -360,6 +384,8 @@ def test_SafeTensorsFileLoader(fstcpp_log, input_files, framework) -> None: assert bufs.get_filename("aaaaaaaaaaaaa") == "" bufs.close() loader.close() + assert framework.get_mem_used() == 0 + assert fstcpp.get_cpp_metrics().bounce_buffer_bytes == 0 def test_SafeTensorsFileLoaderNoGds(fstcpp_log, input_files, framework) -> None: @@ -380,6 +406,8 @@ def test_SafeTensorsFileLoaderNoGds(fstcpp_log, input_files, framework) -> None: assert framework.is_equal(actual, exp) bufs.close() loader.close() + assert framework.get_mem_used() == 0 + assert fstcpp.get_cpp_metrics().bounce_buffer_bytes == 0 def test_fastsafe_open(fstcpp_log, input_files, framework) -> None: @@ -427,6 +455,8 @@ def weight_iterator(): assert isinstance(t, paddle.Tensor) break + assert framework.get_mem_used() == 0 + assert fstcpp.get_cpp_metrics().bounce_buffer_bytes == 0 def _test_type( @@ -452,6 +482,8 @@ def _test_type( for key in f.keys(): t1 = f.get_tensor_wrapped(key).clone().detach() assert framework.is_equal(t1, t2[key]) + assert framework.get_mem_used() == 0 + assert fstcpp.get_cpp_metrics().bounce_buffer_bytes == 0 def test_int8(fstcpp_log, tmp_dir, framework) -> None: @@ -484,3 +516,36 @@ def test_float8_e4m3fn_to_int8(fstcpp_log, tmp_dir, framework) -> None: return device, _ = get_and_check_device(framework) _test_type(tmp_dir, DType.F8_E4M3, device, framework, DType.I8) + + +def test_cpp_metrics(fstcpp_log, framework) -> None: + device, _ = get_and_check_device(framework) + exp_length = 0 + assert framework.get_mem_used() == exp_length + + gbuf = framework.alloc_tensor_memory(128, device) + exp_length += 128 + assert framework.get_mem_used() == exp_length + + framework.free_tensor_memory(gbuf, device) + exp_length -= 128 + assert framework.get_mem_used() == exp_length + + gbuf2 = framework.alloc_tensor_memory(1024, device) + exp_length += 1024 + assert framework.get_mem_used() == exp_length + + gbuf3 = framework.alloc_tensor_memory(128, device) + exp_length += 128 + assert framework.get_mem_used() == exp_length + + framework.free_tensor_memory(gbuf2, device) + exp_length -= 1024 + assert framework.get_mem_used() == exp_length + + framework.free_tensor_memory(gbuf3, device) + exp_length -= 128 + assert framework.get_mem_used() == exp_length + + assert exp_length == 0 + assert framework.get_mem_used() == 0 diff --git a/tests/test_multi.py b/tests/test_multi.py index aaf304d..b80b2f9 100644 --- a/tests/test_multi.py +++ b/tests/test_multi.py @@ -112,6 +112,8 @@ def test_shuffle(fstcpp_log, input_files, pg, framework): bufs.close() loader.close() + assert framework.get_mem_used() == 0 + assert fstcpp.get_cpp_metrics().bounce_buffer_bytes == 0 if __name__ == "__main__":