Skip to content

Commit

Permalink
Merge pull request #34 from sb-ai-lab/feature/prerequisites-launch-1
Browse files Browse the repository at this point in the history
Updated networking with localhost for MlFlow
  • Loading branch information
zakharova-anastasiia authored May 20, 2024
2 parents 29c2875 + 3388c39 commit b41c662
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 16 deletions.
8 changes: 8 additions & 0 deletions prerequisites/docker-compose-mlflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ services:
- mlruns:/var/lib/mlruns
ports:
- ${MLFLOW_PORT}:5000
networks:
- vfl-network

postgres-vfl:
image: postgres:15.3
Expand All @@ -25,6 +27,12 @@ services:
- PGDATA=/var/lib/postgresql/data/pgdata
volumes:
- postgres_data:/var/lib/postgresql/data
networks:
- vfl-network

networks:
vfl-network:
external: true

volumes:
postgres_data:
Expand Down
3 changes: 2 additions & 1 deletion prerequisites/docker-compose-monitoring.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ services:
ports:
- "${PROMETHEUS_PORT}:9090"
volumes:
- "${DOCKER_COMPOSE_PATH}/configs/prometheus.yml:/etc/prometheus/prometheus.yml"
- "${DOCKER_COMPOSE_PATH}/configs/prometheus.yml:/etc/prometheus/prometheus.yml"
- prometheus_storage:/prometheus
networks:
- vfl-network
Expand All @@ -30,6 +30,7 @@ services:

networks:
vfl-network:
external: true

volumes:
prometheus_storage:
Expand Down
6 changes: 4 additions & 2 deletions stalactite/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,10 @@ def log_timing(name: str, log_func: Callable = print):
@contextmanager
def reporting(config: VFLConfig):
if config.master.run_mlflow:
mlflow_host = os.environ.get('STALACTITE_MLFLOW_HOST', config.prerequisites.mlflow_host)
mlflow.set_tracking_uri(f"http://{mlflow_host}:{config.prerequisites.mlflow_port}")
mlflow_host = os.environ.get(
'STALACTITE_MLFLOW_URI', f"http://{config.prerequisites.mlflow_host}:{config.prerequisites.mlflow_port}"
)
mlflow.set_tracking_uri(mlflow_host)
mlflow.set_experiment(config.common.experiment_label)
mlflow.start_run()

Expand Down
7 changes: 6 additions & 1 deletion stalactite/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@
get_status,
is_test_environment,
run_subprocess_command,
stop_containers, start_distributed_agent, start_multiprocess_agents, run_local_experiment,
stop_containers,
start_distributed_agent,
start_multiprocess_agents,
run_local_experiment,
create_external_network,
)

logging.getLogger('git').setLevel(logging.ERROR)
Expand Down Expand Up @@ -85,6 +89,7 @@ def start(config_path, detached, group):
logger.info(f"Starting prerequisites containers ({group})")
config = VFLConfig.load_and_validate(config_path)
env_vars = get_env_vars(config)
create_external_network()
for group_name in group:
command = f"{config.docker.docker_compose_command} -f docker-compose-{group_name}.yml"
run_subprocess_command(
Expand Down
33 changes: 21 additions & 12 deletions stalactite/utils_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,20 @@
BASE_IMAGE_FILE = "grpc-base.dockerfile"
BASE_IMAGE_FILE_CPU = "grpc-base-cpu.dockerfile"
BASE_IMAGE_TAG = "grpc-base:latest"
PREREQUISITES_NETWORK = "monitoring_vfl-network" # Do not change this value
MLFLOW_NETWORK = "mlflow_default"
EXTERNAL_PREREQUISITES_NETWORK = "vfl-network" # Do not change this value
MLFLOW_CONTAINER_NAME = "mlflow-mlflow-vfl-1"

logger = logging.getLogger(__name__)
logging.getLogger('docker').setLevel(logging.ERROR)


def create_external_network(docker_client: "APIClient | None" = None):
    """Ensure the shared external bridge network for the prerequisites exists.

    Looks up a bridge network named exactly ``EXTERNAL_PREREQUISITES_NETWORK``
    and creates it when no such network is found. Calling the function again
    after the network exists only emits a debug log message.

    Args:
        docker_client: Low-level Docker API client to use. When omitted, a new
            ``APIClient()`` is created on each call.
    """
    # Build the client lazily: a default of `APIClient()` in the signature is
    # evaluated once at import time, constructing a client even if the function
    # is never called and sharing that single instance across every call.
    if docker_client is None:
        docker_client = APIClient()

    candidates = docker_client.networks(names=[EXTERNAL_PREREQUISITES_NETWORK], filters={'driver': 'bridge'})
    # Docker's `name` filter matches any network whose name *contains* the
    # value (e.g. a compose-prefixed `<project>_vfl-network`), so keep only
    # exact matches; otherwise creation would be skipped by mistake.
    networks = [net for net in candidates if net.get('Name') == EXTERNAL_PREREQUISITES_NETWORK]

    if networks:
        logger.debug(f'{EXTERNAL_PREREQUISITES_NETWORK} has already been created ({networks}). Skipping.')
    else:
        docker_client.create_network(name=EXTERNAL_PREREQUISITES_NETWORK, driver='bridge', internal=False)


def validate_int(value: Any):
try:
value = int(value)
Expand Down Expand Up @@ -193,6 +200,7 @@ def create_and_start_container(

def get_mlflow_endpoint(config: VFLConfig) -> str:
mlflow_host = config.prerequisites.mlflow_host
mlflow_port = config.prerequisites.mlflow_port
if mlflow_host in ['0.0.0.0', 'localhost']:
logger.info('Searching the MlFlow container locally')
client = APIClient()
Expand All @@ -205,14 +213,15 @@ def get_mlflow_endpoint(config: VFLConfig) -> str:
)
raise exc
try:
mlflow_host = container_info['NetworkSettings']['Networks'][MLFLOW_NETWORK]['Gateway']
mlflow_host = MLFLOW_CONTAINER_NAME
mlflow_port = 5000
except KeyError:
raise ValueError(
'MlFlow container does not configured via `stalactite prerequisites`, rerun the command or use'
' host machine IP address in the `config.prerequisites.mlflow_host` configuration parameter'
)
logger.info(f'Found MlFlow at {mlflow_host}')
return mlflow_host
return f"http://{mlflow_host}:{mlflow_port}"


def start_distributed_agent(
Expand Down Expand Up @@ -247,11 +256,11 @@ def start_distributed_agent(
model_path = os.path.abspath(config.vfl_model.vfl_model_path)

if role == Role.master:
if networks := client.networks(names=[PREREQUISITES_NETWORK]):
if networks := client.networks(names=[EXTERNAL_PREREQUISITES_NETWORK]):
network = networks.pop()["Name"]
else:
network = PREREQUISITES_NETWORK
client.create_network(network)
network = EXTERNAL_PREREQUISITES_NETWORK
create_external_network(client)
networking_config = client.create_networking_config({network: client.create_endpoint_config()})
else:
networking_config = None
Expand All @@ -269,7 +278,7 @@ def start_distributed_agent(
port_binds = {config.grpc_server.port: config.grpc_server.port}
ports = [config.grpc_server.port]
name = ctx.obj["master_container_name"] + ("-predict" if infer else "")
env_vars['STALACTITE_MLFLOW_HOST'] = get_mlflow_endpoint(config)
env_vars['STALACTITE_MLFLOW_URI'] = get_mlflow_endpoint(config)
if config.master.cuda_visible_devices != 'all' and config.docker.use_gpu:
env_vars['CUDA_VISIBLE_DEVICES'] = config.master.cuda_visible_devices
elif role == Role.arbiter:
Expand Down Expand Up @@ -329,11 +338,11 @@ def start_multiprocess_agents(
logger.info("Building an image of the agent. If build for the first time, it may take a while...")
build_base_image(client, use_gpu=config.docker.use_gpu)

if networks := client.networks(names=[PREREQUISITES_NETWORK]):
if networks := client.networks(names=[EXTERNAL_PREREQUISITES_NETWORK]):
network = networks.pop()["Name"]
else:
network = PREREQUISITES_NETWORK
client.create_network(network)
network = EXTERNAL_PREREQUISITES_NETWORK
create_external_network(client)
networking_config = client.create_networking_config({network: client.create_endpoint_config()})

raise_path_not_exist(config.data.host_path_data_dir)
Expand Down Expand Up @@ -401,7 +410,7 @@ def start_multiprocess_agents(
if config.master.cuda_visible_devices != 'all' and config.docker.use_gpu:
env_vars['CUDA_VISIBLE_DEVICES'] = config.master.cuda_visible_devices
env_vars['GRPC_ARBITER_HOST'] = grpc_arbiter_host
env_vars['STALACTITE_MLFLOW_HOST'] = get_mlflow_endpoint(config)
env_vars['STALACTITE_MLFLOW_URI'] = get_mlflow_endpoint(config)

create_and_start_container(
client=client,
Expand Down

0 comments on commit b41c662

Please sign in to comment.