Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions verl/utils/net_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import ipaddress
import random
import socket


Expand Down Expand Up @@ -70,15 +71,26 @@ def is_valid_ipv6_address(address: str) -> bool:
return False


def get_free_port(address: str, seed: int | None = None) -> tuple[int, socket.socket]:
    """Bind a TCP socket on ``address`` and return the port together with the socket.

    The socket is returned still bound and open; the caller is responsible
    for closing it.

    Args:
        address: Local IP address to bind to. An IPv6 socket is created when
            ``is_valid_ipv6_address(address)`` is true, otherwise IPv4.
        seed: Optional seed for a private ``random.Random``. When given, up to
            10 pseudo-random ports in ``[20000, 60000]`` are tried in the
            order that seed determines before falling back to an OS-assigned
            port. Distinct seeds give concurrent callers different candidate
            sequences, which reduces the chance of port conflicts.

    Returns:
        A ``(port, sock)`` tuple where ``port`` is the port ``sock`` is bound to.

    Raises:
        OSError: If the socket options cannot be set or no port can be bound.
            The socket is closed before the error propagates.
    """
    family = socket.AF_INET6 if is_valid_ipv6_address(address) else socket.AF_INET

    sock = socket.socket(family=family, type=socket.SOCK_STREAM)
    try:
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        # NOTE(review): with SO_REUSEPORT set before bind, binding a port that
        # another SO_REUSEPORT socket of the same user already holds may
        # succeed instead of raising, so the retry loop below might not detect
        # every conflict -- confirm against the target platform.
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)

        # When a seed is provided, use it to deterministically pick ports from
        # a wide range. This reduces port conflicts when multiple
        # get_free_port calls are running concurrently.
        if seed is not None:
            rng = random.Random(seed)
            for _ in range(10):
                candidate = rng.randint(20000, 60000)
                try:
                    sock.bind((address, candidate))
                except OSError:
                    # Port unavailable; try the next candidate.
                    continue
                return sock.getsockname()[1], sock

        # No seed, or every seeded candidate failed: let the OS choose a port.
        sock.bind((address, 0))
        return sock.getsockname()[1], sock
    except BaseException:
        # Do not leak the file descriptor when setup or binding fails.
        sock.close()
        raise
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,10 @@ def __init__(
# used for NCCL process group
if self.node_rank == 0:
self._master_address = self._server_address
self._master_port, self._master_sock = get_free_port(self._server_address)
# Seed with replica_rank + pid to avoid port conflicts across replicas and restarts
self._master_port, self._master_sock = get_free_port(
self._server_address, seed=self.replica_rank + os.getpid()
)
logger.info(
f"SGLangHttpServer, replica_rank: {self.replica_rank}, "
f"master address: {self._master_address}, port: {self._master_port}"
Expand Down
Loading