Rename package to algoperf #833

Open · wants to merge 14 commits into `dev`
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
@@ -199,7 +199,7 @@ jobs:
pip install .[pytorch_cpu]
- name: Run pytest tests
run: |
pytest -vx tests/version_test.py
pytest -vx tests/test_version.py
pytest -vx tests/test_num_params.py
pytest -vx tests/test_param_shapes.py
pytest -vx tests/test_param_types.py
4 changes: 2 additions & 2 deletions .github/workflows/linting.yml
@@ -17,7 +17,7 @@ jobs:
pip install pylint==2.16.1
- name: Run pylint
run: |
pylint algorithmic_efficiency
pylint algoperf
pylint reference_algorithms
pylint prize_qualification_baselines
pylint submission_runner.py
@@ -50,7 +50,7 @@
- name: Install yapf
run: |
python -m pip install --upgrade pip
pip install yapf==0.32
pip install yapf==0.32 toml
- name: Run yapf
run: |
yapf . --diff --recursive
2 changes: 1 addition & 1 deletion .github/workflows/regression_tests_variants.yml
@@ -72,7 +72,7 @@
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}
docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
criteo_resnet_pytorch:
criteo_embed_init_pytorch:
runs-on: self-hosted
needs: build_and_push_pytorch_docker_image
steps:
8 changes: 5 additions & 3 deletions .gitignore
@@ -12,8 +12,8 @@ makefile
*.swp
*/data/
*events.out.tfevents*
algorithmic_efficiency/workloads/librispeech_conformer/data_dir
algorithmic_efficiency/workloads/librispeech_conformer/work_dir
algoperf/workloads/librispeech_conformer/data_dir
algoperf/workloads/librispeech_conformer/work_dir
*.flac
*.npy
*.csv
@@ -23,4 +23,6 @@ wandb/
scoring/plots/

!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_0/eval_measurements.csv
!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv
!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv

algoperf/_version.py
20 changes: 13 additions & 7 deletions CHANGELOG.md
@@ -4,34 +4,39 @@

- Finalized variant workload targets.
- Fix in random_utils helper function.
- For conformer PyTorch Dropout layers set `inplace=True`.
- For conformer PyTorch Dropout layers set `inplace=True`.
- Clear CUDA cache at the beginning of each trial for PyTorch.

## algoperf-benchmark-0.1.4 (2024-03-26)

Upgrade CUDA version to CUDA 12.1:

- Upgrade CUDA version in Dockerfiles that will be used for scoring.
- Update Jax and PyTorch package version tags to use local CUDA installation.

Add flag for completely disabling checkpointing.
Add flag for completely disabling checkpointing.

- Note that we will run with checkpointing off at scoring time.

Update Deepspeech and Conformer variant target setting configurations.
- Note that variant targets are not final.
Update Deepspeech and Conformer variant target setting configurations.

- Note that variant targets are not final.

Fixed bug in scoring code to take best trial in a study for external-tuning ruleset.

Added instructions for submission.
Added instructions for submission.

Changed default number of workers for PyTorch data loaders to 0. Running with >0 may lead to incorrect eval results see https://github.com/mlcommons/algorithmic-efficiency/issues/732.
Changed default number of workers for PyTorch data loaders to 0. Running with >0 may lead to incorrect eval results see <https://github.com/mlcommons/algorithmic-efficiency/issues/732>.

## algoperf-benchmark-0.1.2 (2024-03-04)

Workload variant additions and fixes:

- Add Deepspeech workload variant
- Fix bugs in Imagenet ResNet, WMT and Criteo1tb variants

Add prize qualification logs for external tuning ruleset.
Note: FastMRI trials with dropout are not yet added due to https://github.com/mlcommons/algorithmic-efficiency/issues/664.
Note: FastMRI trials with dropout are not yet added due to <https://github.com/mlcommons/algorithmic-efficiency/issues/664>.

Add missing functionality to Docker startup script for self_tuning ruleset.
Add self_tuning ruleset option to script that runs all workloads for scoring.
@@ -41,6 +46,7 @@ Datasetup fixes.
Fix tests that check training differences in PyTorch and JAX on GPU.

## algoperf-benchmark-0.1.1 (2024-01-19)

Bug fixes to FastMRI metric calculation and targets.

Added workload variants and targets for ogbg, fastmri, librispeech_conformer, imagenet_resnet, imagenet_vit, criteo1tb to be used as held-out workloads.
19 changes: 16 additions & 3 deletions CONTRIBUTING.md
@@ -22,6 +22,7 @@
- [Style Testing](#style-testing)
- [Unit and Integration Tests](#unit-and-integration-tests)
- [Regression Tests](#regression-tests)
- [Versioning](#versioning)

## Contributing to MLCommons

@@ -204,7 +205,7 @@ docker run -t -d \
-v $HOME/data/:/data/ \
-v $HOME/experiment_runs/:/experiment_runs \
-v $HOME/experiment_runs/logs:/logs \
-v $HOME/algorithmic-efficiency:/algorithmic-efficiency \
-v $HOME/algorithmic-efficiency:/algoperf \
--gpus all \
--ipc=host \
<image_path> \
@@ -228,7 +229,7 @@ To run the below commands, use the versions installed via `pip install -e '.[dev
To automatically fix formatting errors, run the following (*WARNING:* this will edit your code, so it is suggested to make a git commit first!):

```bash
yapf -i -r -vv -p algorithmic_efficiency datasets prize_qualification_baselines reference_algorithms tests *.py
yapf -i -r -vv -p algoperf datasets prize_qualification_baselines reference_algorithms tests *.py
```

To sort all import orderings, run the following:
@@ -246,7 +247,7 @@ isort . --check --diff
To print out all offending pylint issues, run the following:

```bash
pylint algorithmic_efficiency
pylint algoperf
pylint datasets
pylint prize_qualification_baselines
pylint reference_algorithms
@@ -276,3 +277,15 @@ To run a regression test:
2. Turn on the self-hosted runner.
3. Run the self-hosted runner application for the runner to accept jobs.
4. Open a pull request into main to trigger the workflow.

### Versioning

The package version is automatically determined by the `setuptools_scm` package based on the last git tag.
It follows the structure `major.minor.patch` + `devN` where `N` is the number of commits since the last tag.
It automatically increments the patch version (i.e. it guesses the next version) if there are commits after the last tag.
Additionally, if there are uncommitted changes, the version will include a suffix, separated by a `+` character, containing the last commit hash plus the date of the dirty working directory (see [setuptools_scm's documentation](https://setuptools-scm.readthedocs.io/en/latest/extending/#setuptools_scmlocal_scheme) on the default version and local scheme).
You can check what version `setuptools_scm` is creating by running `python -m setuptools_scm`.

To create a new version, create a new release (and tag) in the GitHub UI.
The package version is automatically updated to the new version.
Once the package is installed, the version can be accessed as the package attribute `algoperf.__version__`, i.e. via `python -c "import algoperf; print(algoperf.__version__)"`.
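
An illustrative way to inspect the generated version at runtime is sketched below. This is only a sketch: it assumes the package has been installed (e.g. via `pip install -e .`) so that `algoperf/_version.py` has been generated, and the exact version string depends on your git state.

```python
# Minimal sketch: read the setuptools_scm-generated version at runtime.
import algoperf

version = algoperf.__version__
print(f"Installed algoperf version: {version}")

# A tagged release typically looks like "0.1.5", while a development build
# typically looks like "0.1.5.dev7+g<hash>" (7 commits after the last tag),
# with a date suffix appended if the working directory was dirty.
if ".dev" in version or "+" in version:
  print("This is a development build, not a tagged release.")
```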
5 changes: 2 additions & 3 deletions DOCUMENTATION.md
@@ -222,7 +222,6 @@ def update_params(
- Cannot replace the model parameters with pre-trained ones.
- Batch norm should work here because the `model_fn` will return updated batch norm moving averages when it is told to with `update_batch_norm`.


###### Prepare for evaluation function

```python
@@ -278,7 +277,7 @@ def data_selection(

In general, with noisy, non-deterministic training, evaluation frequency can affect training time measurements as more "bites of the apple" potentially allows the training code to exploit instability. We also want to discourage submissions from complicated and unrealistic logic that attempts to guess when training is close to complete and increases the evaluation rate, while not producing a well-sampled training curve at the start of training. Simply allowing submissions complete freedom over evaluation frequency encourages competitors to work to minimize the number of evaluations, which distracts from the primary goal of finding better training algorithms.

Submissions are eligible for an untimed eval every `eval_period` seconds. Before proceeding to evaluation, the submission can prepare the model through a call to `prepare_for_eval`, effectively modifying the model parameters and state as well as the optimizer state. Any additional evaluations performed by the submission code count against the runtime for scoring.
Submissions are eligible for an untimed eval every `eval_period` seconds. Before proceeding to evaluation, the submission can prepare the model through a call to `prepare_for_eval`, effectively modifying the model parameters and state as well as the optimizer state. Any additional evaluations performed by the submission code count against the runtime for scoring.
The harness that runs the submission code will attempt to eval every `eval_period` seconds by checking between each submission step (call of `update_params`) whether it has been at least `eval_period` seconds since the last eval; if so, the submission is given the possibility to prepare for evaluation (through a timed call to `prepare_for_eval`). If the accumulated runtime does not exceed the maximum allowed runtime after the preparation step, the clock is paused, and the submission is evaluated. This means that if calls to `update_params` typically take a lot more than `eval_period` seconds, such submissions will not receive as many untimed evals as a submission whose `update_params` function took less time. However, for appropriate settings of `eval_period`, we expect this to be quite rare. Submissions are always free to restructure their `update_params` code to split work into two subsequent steps to regain the potential benefits of these untimed model evaluations. For each workload, the `eval_period` will be set such that the total evaluation time is roughly between 10% and 20% of the total training time for the target-setting runs.
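
To make this bookkeeping concrete, the following is a rough, non-authoritative sketch of the timing logic described above. The submission-facing calls (`update_params`, `prepare_for_eval`) follow the API described in this document, but the harness-side objects and helper names (`workload`, `submission`, `evaluate`, `max_runtime`) are hypothetical placeholders, not the actual harness code.

```python
# Hedged sketch of the untimed-eval bookkeeping; harness-side names are
# hypothetical placeholders rather than the real implementation.
import time


def run_training(workload, submission, eval_period: float, max_runtime: float):
  accumulated_runtime = 0.0
  time_of_last_eval = 0.0

  while not workload.training_complete():
    step_start = time.monotonic()
    submission.update_params(workload)  # timed submission step
    accumulated_runtime += time.monotonic() - step_start

    # Between steps, check whether enough (timed) seconds have passed
    # since the last eval to offer another untimed eval.
    if accumulated_runtime - time_of_last_eval >= eval_period:
      prep_start = time.monotonic()
      submission.prepare_for_eval(workload)  # timed preparation call
      accumulated_runtime += time.monotonic() - prep_start

      if accumulated_runtime <= max_runtime:
        workload.evaluate(submission)  # clock paused: untimed eval
      time_of_last_eval = accumulated_runtime
```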

#### Valid submissions
@@ -642,4 +641,4 @@ That said, while submitting Adam with some novel heuristic to set various hyperp
The JAX and PyTorch versions of the Criteo, FastMRI, Librispeech, OGBG, and WMT workloads use the same TensorFlow input pipelines. Due to differences in how JAX and PyTorch distribute computations across devices, the PyTorch workloads have an additional overhead for these workloads.

Since we use PyTorch's [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel) implementation, there is one Python process for each device. Depending on the hardware and the settings of the cluster, running a TensorFlow input pipeline in each Python process can lead to errors, since too many threads are created in each process. See [this PR thread](https://github.com/mlcommons/algorithmic-efficiency/pull/85) for more details.
While this issue might not affect all setups, we currently implement a different strategy: we only run the TensorFlow input pipeline in one Python process (with `rank == 0`), and [broadcast](https://pytorch.org/docs/stable/distributed.html#torch.distributed.broadcast) the batches to all other devices. This introduces an additional communication overhead for each batch. See the [implementation for the WMT workload](https://github.com/mlcommons/algorithmic-efficiency/blob/main/algorithmic_efficiency/workloads/wmt/wmt_pytorch/workload.py#L215-L288) as an example.
While this issue might not affect all setups, we currently implement a different strategy: we only run the TensorFlow input pipeline in one Python process (with `rank == 0`), and [broadcast](https://pytorch.org/docs/stable/distributed.html#torch.distributed.broadcast) the batches to all other devices. This introduces an additional communication overhead for each batch. See the [implementation for the WMT workload](https://github.com/mlcommons/algorithmic-efficiency/blob/main/algoperf/workloads/wmt/wmt_pytorch/workload.py#L215-L288) as an example.
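
For intuition, here is a simplified sketch of that rank-0 broadcast pattern. It is not the repository's exact implementation: it assumes the default process group is already initialized, that every rank agrees on the batch shape and dtype in advance, and that the pipeline repeats indefinitely; the helper name `broadcast_batches` is made up for illustration.

```python
# Simplified sketch: only rank 0 reads the input pipeline; every other rank
# receives each batch via torch.distributed.broadcast.
import torch
import torch.distributed as dist


def broadcast_batches(data_iter, batch_shape, device, rank):
  """Yields identical batches on every rank; only rank 0 touches the pipeline."""
  while True:
    if rank == 0:
      batch = torch.as_tensor(next(data_iter)).to(device)
    else:
      # Non-zero ranks allocate an empty buffer with the agreed-upon shape.
      batch = torch.empty(batch_shape, device=device)
    dist.broadcast(batch, src=0)  # per-batch communication overhead
    yield batch
```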
17 changes: 10 additions & 7 deletions GETTING_STARTED.md
@@ -18,6 +18,8 @@
- [Docker Tips](#docker-tips)
- [Score your Submission](#score-your-submission)
- [Running workloads](#running-workloads)
- [Package your Submission code](#package-your-submission-code)
- [Package Logs for Self-Reporting Submissions](#package-logs-for-self-reporting-submissions)

## Set Up and Installation

@@ -56,7 +58,7 @@ To set up a virtual enviornment and install this repository
cd algorithmic-efficiency
```

3. Run the following pip3 install commands based on your chosen framework to install `algorithmic_efficiency` and its dependencies.
3. Run the following pip3 install commands based on your chosen framework to install `algoperf` and its dependencies.

For **JAX**:

@@ -80,7 +82,6 @@ To set up a virtual enviornment and install this repository
pip3 install -e '.[full]'
```


<details>
<summary>
Per workload installations
@@ -414,22 +415,24 @@ submission_folder/
```

Specifically we require that:

1. There exist subdirectories in the submission folder named after the ruleset: `external_tuning` or `self_tuning`.
2. The ruleset subdirectories contain directories named according to
some identifier of the algorithm.
3. Each algorithm subdirectory contains a `submission.py` module. Additional helper modules are allowed if you prefer to organize your code into multiple files. If there are additional Python packages that have to be installed for the algorithm, also include a `requirements.txt` with package names and versions in the algorithm subdirectory.
2. The ruleset subdirectories contain directories named according to
some identifier of the algorithm.
3. Each algorithm subdirectory contains a `submission.py` module. Additional helper modules are allowed if you prefer to organize your code into multiple files. If there are additional Python packages that have to be installed for the algorithm, also include a `requirements.txt` with package names and versions in the algorithm subdirectory.
4. For `external_tuning` algorithms the algorithm subdirectory
should contain a `tuning_search_space.json`.

To check that your submission folder meets the above requirements you can run the `submissions/repo_checker.py` script.

## Package Logs for Self-Reporting Submissions

To prepare your submission for self reporting run:

```
```bash
python3 package_logs.py --experiment_dir <experiment_dir> --destination_dir <destination_dir>
```

The destination directory will contain the logs packed in studies and trials required for self-reporting.
The destination directory will contain the logs packed in studies and trials required for self-reporting.

**Good Luck!**
5 changes: 5 additions & 0 deletions algoperf/__init__.py
@@ -0,0 +1,5 @@
"""Algorithmic Efficiency."""

from ._version import version as __version__

__all__ = ["__version__"]
@@ -16,8 +16,8 @@
from tensorflow.io import gfile # pytype: disable=import-error
import torch

from algorithmic_efficiency import spec
from algorithmic_efficiency.pytorch_utils import pytorch_setup
from algoperf import spec
from algoperf.pytorch_utils import pytorch_setup

_, _, DEVICE, _ = pytorch_setup()
CheckpointReturn = Tuple[spec.OptimizerState,
@@ -11,7 +11,7 @@
from torch.utils.data import DistributedSampler
from torch.utils.data import Sampler

from algorithmic_efficiency import spec
from algoperf import spec


def shard_and_maybe_pad_np(
File renamed without changes.
File renamed without changes.
@@ -1,7 +1,7 @@
import jax.dlpack
import torch

from algorithmic_efficiency import spec
from algoperf import spec


def jax_to_pytorch(x: spec.Tensor, take_ownership: bool = False) -> spec.Tensor:
@@ -18,8 +18,8 @@
import psutil
import torch.distributed as dist

from algorithmic_efficiency import spec
from algorithmic_efficiency.pytorch_utils import pytorch_setup
from algoperf import spec
from algoperf.pytorch_utils import pytorch_setup

USE_PYTORCH_DDP, RANK, DEVICE, _ = pytorch_setup()

@@ -6,7 +6,7 @@
import jax
from torch import nn

from algorithmic_efficiency import spec
from algoperf import spec


def pytorch_param_shapes(model: nn.Module) -> Dict[str, spec.ShapeTuple]:
File renamed without changes.
@@ -7,11 +7,11 @@
import torch
import torch.distributed as dist

from algorithmic_efficiency import spec
from algorithmic_efficiency.profiler import Profiler
from algorithmic_efficiency.workloads.librispeech_conformer.librispeech_pytorch.models import \
from algoperf import spec
from algoperf.profiler import Profiler
from algoperf.workloads.librispeech_conformer.librispeech_pytorch.models import \
BatchNorm as ConformerBatchNorm
from algorithmic_efficiency.workloads.librispeech_deepspeech.librispeech_pytorch.models import \
from algoperf.workloads.librispeech_deepspeech.librispeech_pytorch.models import \
BatchNorm as DeepspeechBatchNorm


File renamed without changes.
File renamed without changes.
@@ -13,8 +13,8 @@
import tensorflow as tf
import tensorflow_datasets as tfds

from algorithmic_efficiency import spec
from algorithmic_efficiency.data_utils import shard_and_maybe_pad_np
from algoperf import spec
from algoperf.data_utils import shard_and_maybe_pad_np


def preprocess_for_train(image: spec.Tensor,
@@ -10,9 +10,8 @@
from flax import linen as nn
import jax.numpy as jnp

from algorithmic_efficiency import spec
from algorithmic_efficiency.workloads.imagenet_resnet.imagenet_jax.models import \
ResNetBlock
from algoperf import spec
from algoperf.workloads.imagenet_resnet.imagenet_jax.models import ResNetBlock

ModuleDef = nn.Module

@@ -11,12 +11,11 @@
import optax
import tensorflow_datasets as tfds

from algorithmic_efficiency import param_utils
from algorithmic_efficiency import spec
from algorithmic_efficiency.workloads.cifar.cifar_jax import models
from algorithmic_efficiency.workloads.cifar.cifar_jax.input_pipeline import \
create_input_iter
from algorithmic_efficiency.workloads.cifar.workload import BaseCifarWorkload
from algoperf import param_utils
from algoperf import spec
from algoperf.workloads.cifar.cifar_jax import models
from algoperf.workloads.cifar.cifar_jax.input_pipeline import create_input_iter
from algoperf.workloads.cifar.workload import BaseCifarWorkload


class CifarWorkload(BaseCifarWorkload):
@@ -10,14 +10,13 @@
import torch
from torch import nn

from algorithmic_efficiency import spec
from algorithmic_efficiency.init_utils import pytorch_default_init
from algorithmic_efficiency.workloads.imagenet_resnet.imagenet_pytorch.models import \
from algoperf import spec
from algoperf.init_utils import pytorch_default_init
from algoperf.workloads.imagenet_resnet.imagenet_pytorch.models import \
BasicBlock
from algorithmic_efficiency.workloads.imagenet_resnet.imagenet_pytorch.models import \
from algoperf.workloads.imagenet_resnet.imagenet_pytorch.models import \
Bottleneck
from algorithmic_efficiency.workloads.imagenet_resnet.imagenet_pytorch.models import \
conv1x1
from algoperf.workloads.imagenet_resnet.imagenet_pytorch.models import conv1x1


class ResNet(nn.Module):
@@ -12,13 +12,12 @@
from torchvision import transforms
from torchvision.datasets import CIFAR10

from algorithmic_efficiency import data_utils
from algorithmic_efficiency import param_utils
from algorithmic_efficiency import pytorch_utils
from algorithmic_efficiency import spec
from algorithmic_efficiency.workloads.cifar.cifar_pytorch.models import \
resnet18
from algorithmic_efficiency.workloads.cifar.workload import BaseCifarWorkload
from algoperf import data_utils
from algoperf import param_utils
from algoperf import pytorch_utils
from algoperf import spec
from algoperf.workloads.cifar.cifar_pytorch.models import resnet18
from algoperf.workloads.cifar.workload import BaseCifarWorkload

USE_PYTORCH_DDP, RANK, DEVICE, N_GPUS = pytorch_utils.pytorch_setup()
