Automatic Model Parallelism Through FX #1933
This file seems more related to the parallel layers. Hopefully at some point we could use existing backends instead, like nanotron or megatron, etc.
That would be great! Maybe even the torch-native parallelism layers.
I have not tried it, but maybe https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_gather_into_tensor is more efficient, with a single `empty` call, like https://github.com/huggingface/text-generation-inference/blob/d0225b10156320f294647ac676c130d03626473d/server/text_generation_server/layers/tensor_parallel.py#L98
yes
why contiguous?
Tensors after `split` may not be contiguous; I think it's better to make them contiguous.
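To illustrate the point (a minimal example, not code from this PR): splitting along a non-leading dimension returns strided views of the original storage, so a `.contiguous()` call is needed before ops that require dense memory.

```python
import torch

# Splitting along dim=1 returns views that stride over the original
# 4x6 storage, so they are not contiguous in memory.
x = torch.arange(24).reshape(4, 6)
left, right = x.split(3, dim=1)

print(left.is_contiguous())    # False: a 4x3 view with stride (6, 1)

# .contiguous() copies the view into its own dense buffer, which
# .view() and many collective ops require.
left_c = left.contiguous()
print(left_c.is_contiguous())  # True
```

Splitting along dim 0 of a contiguous tensor would keep the chunks contiguous; it is the non-leading-dim case (as here) that produces strided views.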