work on pluggable API for worker-side monitoring radio

The patch so far overloads self.monitoring_radio for two different uses that should be clarified: the submit-side radio and the worker-side radio.

It's a bit complicated to have a default radio (UDPRadio or HTEXRadio) even when monitoring is turned off: I should check that the radio receiver does not get activated in this case. For example:
* test that broken radio activation causes a startup error
* test that a configuration with a broken radio doesn't cause a startup error when MonitoringHub is not configured

The ZMQ radio should always listen, and be the place where all radio receivers send their data. The UDP radio and the filesystem radio should turn into separate... something... processes? For prototyping I guess it doesn't matter where I make them live, but the most behaviour-preserving approach would keep them somehow separated?
Parsl · Jul 26, 2024 · d2ee35c · d2ee35c
1 parent dc94b8a
commit d2ee35c
Show file tree

Hide file tree

Showing 18 changed files with 516 additions and 347 deletions.
diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py
@@ -744,16 +744,16 @@ def launch_task(self, task_record: TaskRecord) -> Future:
 
         if self.monitoring is not None and self.monitoring.resource_monitoring_enabled:
             wrapper_logging_level = logging.DEBUG if self.monitoring.monitoring_debug else logging.INFO
+
             (function, args, kwargs) = monitor_wrapper(f=function,
                                                        args=args,
                                                        kwargs=kwargs,
                                                        x_try_id=try_id,
                                                        x_task_id=task_id,
-                                                       monitoring_hub_url=self.monitoring.monitoring_hub_url,
+                                                       radio_config=executor.remote_monitoring_radio_config,
                                                        run_id=self.run_id,
                                                        logging_level=wrapper_logging_level,
                                                        sleep_dur=self.monitoring.resource_monitoring_interval,
-                                                       radio_mode=executor.radio_mode,
                                                        monitor_resources=executor.monitor_resources(),
                                                        run_dir=self.run_dir)
 
@@ -1181,6 +1181,18 @@ def add_executors(self, executors: Sequence[ParslExecutor]) -> None:
                 executor.hub_address = self.monitoring.hub_address
                 executor.hub_zmq_port = self.monitoring.hub_zmq_port
                 executor.submit_monitoring_radio = self.monitoring.radio
+                # this will modify the radio config object: it will add relevant parameters needed
+                # for the particular remote radio sender to communicate back
+                logger.info("starting monitoring receiver "
+                            f"for executor {executor} "
+                            f"with remote monitoring radio config {executor.remote_monitoring_radio_config}")
+                executor.monitoring_receiver = self.monitoring.start_receiver(executor.remote_monitoring_radio_config,
+                                                                              ip=self.monitoring.hub_address)
+                # TODO: this is a weird way to start the receiver.
+                # Rather than in executor.start, but there's a tangle here
+                # trying to make the executors usable in a non-pure-parsl
+                # context where there is no DFK to grab config out of?
+                # (and no monitoring...)
             if hasattr(executor, 'provider'):
                 if hasattr(executor.provider, 'script_dir'):
                     executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')

diff --git a/parsl/executors/base.py b/parsl/executors/base.py
@@ -1,11 +1,19 @@
+import logging
 import os
 from abc import ABCMeta, abstractmethod
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional
 
 from typing_extensions import Literal, Self
 
-from parsl.monitoring.radios import MonitoringRadioSender
+from parsl.monitoring.radios.base import (
+    MonitoringRadioReceiver,
+    MonitoringRadioSender,
+    RadioConfig,
+)
+from parsl.monitoring.radios.udp import UDPRadio
+
+logger = logging.getLogger(__name__)
 
 
 class ParslExecutor(metaclass=ABCMeta):
@@ -19,15 +27,13 @@ class ParslExecutor(metaclass=ABCMeta):
     no arguments and re-raises any thrown exception.
 
     In addition to the listed methods, a ParslExecutor instance must always
-    have a member field:
+    have these member fields:
 
        label: str - a human readable label for the executor, unique
               with respect to other executors.
 
-    Per-executor monitoring behaviour can be influenced by exposing:
-
-       radio_mode: str - a string describing which radio mode should be used to
-              send task resource data back to the submit side.
+       remote_monitoring_radio_config: RadioConfig describing how tasks on this executor
+              should report task resource status
 
     An executor may optionally expose:
 
@@ -45,11 +51,16 @@ class ParslExecutor(metaclass=ABCMeta):
     """
 
     label: str = "undefined"
-    radio_mode: str = "udp"
 
     def __init__(
         self,
         *,
+
+        # TODO: I'd like these two to go away but they're needed right now
+        # to configure the interchange monitoring radio, that is
+        # in addition to the submit and worker monitoring radios (!). They
+        # are effectivley a third monitoring radio config, though, so what
+        # should that look like for the interchange?
         hub_address: Optional[str] = None,
         hub_zmq_port: Optional[int] = None,
         submit_monitoring_radio: Optional[MonitoringRadioSender] = None,
@@ -58,10 +69,19 @@ def __init__(
     ):
         self.hub_address = hub_address
         self.hub_zmq_port = hub_zmq_port
+
+        # these are parameters for the monitoring radio to be used on the remote side
+        # eg. in workers - to send results back, and they should end up encapsulated
+        # inside a RadioConfig.
         self.submit_monitoring_radio = submit_monitoring_radio
+        self.remote_monitoring_radio_config: RadioConfig = UDPRadio()
+
         self.run_dir = os.path.abspath(run_dir)
         self.run_id = run_id
 
+        # will be set externally later, which is pretty ugly
+        self.monitoring_receiver: Optional[MonitoringRadioReceiver] = None
+
     def __enter__(self) -> Self:
         return self
 
@@ -94,7 +114,13 @@ def shutdown(self) -> None:
 
         This includes all attached resources such as workers and controllers.
         """
-        pass
+        logger.debug("Starting base executor shutdown")
+        # logger.error(f"BENC: monitoring receiver on {self} is {self.monitoring_receiver}")
+        if self.monitoring_receiver is not None:
+            logger.debug("Starting monitoring receiver shutdown")
+            self.monitoring_receiver.shutdown()
+            logger.debug("Done with monitoring receiver shutdown")
+        logger.debug("Done with base executor shutdown")
 
     def monitor_resources(self) -> bool:
         """Should resource monitoring happen for tasks on running on this executor?

diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py
@@ -26,6 +26,7 @@
 )
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
+from parsl.monitoring.radios.base import HTEXRadio, RadioConfig
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
@@ -262,11 +263,13 @@ def __init__(self,
                  enable_mpi_mode: bool = False,
                  mpi_launcher: str = "mpiexec",
                  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
-                 encrypted: bool = False):
+                 encrypted: bool = False,
+                 remote_monitoring_radio_config: Optional[RadioConfig] = None):
 
         logger.debug("Initializing HighThroughputExecutor")
 
         BlockProviderExecutor.__init__(self, provider=provider, block_error_handler=block_error_handler)
+
         self.label = label
         self.worker_debug = worker_debug
         self.storage_access = storage_access
@@ -310,6 +313,12 @@ def __init__(self,
             self._workers_per_node = 1  # our best guess-- we do not have any provider hints
 
         self._task_counter = 0
+
+        if remote_monitoring_radio_config is not None:
+            self.remote_monitoring_radio_config = remote_monitoring_radio_config
+        else:
+            self.remote_monitoring_radio_config = HTEXRadio()
+
         self.worker_ports = worker_ports
         self.worker_port_range = worker_port_range
         self.interchange_proc: Optional[subprocess.Popen] = None
@@ -341,8 +350,6 @@ def __init__(self,
             interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD
         self.interchange_launch_cmd = interchange_launch_cmd
 
-    radio_mode = "htex"
-
     def _warn_deprecated(self, old: str, new: str):
         warnings.warn(
             f"{old} is deprecated and will be removed in a future release. "
@@ -845,6 +852,9 @@ def shutdown(self, timeout: float = 10.0):
             logger.info("Closing command client")
             self.command_client.close()
 
+        # TODO: implement this across all executors
+        super().shutdown()
+
         logger.info("Finished HighThroughputExecutor shutdown attempt")
 
     def get_usage_information(self):

diff --git a/parsl/executors/high_throughput/interchange.py b/parsl/executors/high_throughput/interchange.py
@@ -20,7 +20,7 @@
 from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
 from parsl.executors.high_throughput.manager_record import ManagerRecord
 from parsl.monitoring.message_type import MessageType
-from parsl.monitoring.radios import MonitoringRadioSender, ZMQRadioSender
+from parsl.monitoring.radios.base import MonitoringRadioSender, ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import serialize as serialize_object
 from parsl.utils import setproctitle

diff --git a/parsl/executors/high_throughput/mpi_executor.py b/parsl/executors/high_throughput/mpi_executor.py
@@ -10,6 +10,7 @@
 )
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import JobStatus
+from parsl.monitoring.radios.base import RadioConfig
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
 
@@ -56,7 +57,8 @@ def __init__(self,
                  worker_logdir_root: Optional[str] = None,
                  mpi_launcher: str = "mpiexec",
                  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
-                 encrypted: bool = False):
+                 encrypted: bool = False,
+                 remote_monitoring_radio_config: Optional[RadioConfig] = None):
         super().__init__(
             # Hard-coded settings
             cores_per_worker=1e-9,  # Ensures there will be at least an absurd number of workers
@@ -84,7 +86,15 @@ def __init__(self,
             worker_logdir_root=worker_logdir_root,
             mpi_launcher=mpi_launcher,
             block_error_handler=block_error_handler,
-            encrypted=encrypted
+            encrypted=encrypted,
+
+            # TODO:
+            # worker-side monitoring in MPI-style code is probably going to be
+            # broken - resource monitoring won't see any worker processes
+            # most likely, as so perhaps it should have worker resource
+            # monitoring disabled like the thread pool executor has?
+            # (for related but different reasons...)
+            remote_monitoring_radio_config=remote_monitoring_radio_config
         )
 
         self.max_workers_per_block = max_workers_per_block
diff --git a/parsl/executors/taskvine/executor.py b/parsl/executors/taskvine/executor.py
@@ -601,6 +601,8 @@ def shutdown(self, *args, **kwargs):
         self._finished_task_queue.close()
         self._finished_task_queue.join_thread()
 
+        super().shutdown()
+
         logger.debug("TaskVine shutdown completed")
 
     @wrap_with_logs

diff --git a/parsl/executors/threads.py b/parsl/executors/threads.py
@@ -72,6 +72,7 @@ def shutdown(self, block=True):
         """
         logger.debug("Shutting down executor, which involves waiting for running tasks to complete")
         self.executor.shutdown(wait=block)
+        super().shutdown()
         logger.debug("Done with executor shutdown")
 
     def monitor_resources(self):

diff --git a/parsl/executors/workqueue/executor.py b/parsl/executors/workqueue/executor.py
@@ -713,6 +713,8 @@ def shutdown(self, *args, **kwargs):
         self.collector_queue.close()
         self.collector_queue.join_thread()
 
+        super().shutdown()
+
         logger.debug("Work Queue shutdown completed")
 
     @wrap_with_logs

diff --git a/parsl/monitoring/monitoring.py b/parsl/monitoring/monitoring.py
@@ -5,7 +5,7 @@
 import os
 import queue
 import time
-from multiprocessing import Event, Process
+from multiprocessing import Event
 from multiprocessing.queues import Queue
 from typing import TYPE_CHECKING, Any, Literal, Optional, Tuple, Union, cast
 
@@ -14,7 +14,7 @@
 from parsl.log_utils import set_file_logger
 from parsl.monitoring.errors import MonitoringHubStartError
 from parsl.monitoring.message_type import MessageType
-from parsl.monitoring.radios import MultiprocessingQueueRadioSender
+from parsl.monitoring.radios.base import MultiprocessingQueueRadioSender, RadioConfig
 from parsl.monitoring.router import router_starter
 from parsl.monitoring.types import AddressedMonitoringMessage
 from parsl.multiprocessing import ForkProcess, SizedQueue
@@ -129,7 +129,7 @@ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> No
         # in the future, Queue will allow runtime subscripts.
 
         if TYPE_CHECKING:
-            comm_q: Queue[Union[Tuple[int, int], str]]
+            comm_q: Queue[Union[int, str]]
         else:
             comm_q: Queue
 
@@ -150,7 +150,6 @@ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> No
                                                "resource_msgs": self.resource_msgs,
                                                "exit_event": self.router_exit_event,
                                                "hub_address": self.hub_address,
-                                               "udp_port": self.hub_port,
                                                "zmq_port_range": self.hub_port_range,
                                                "logdir": self.logdir,
                                                "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
@@ -172,13 +171,13 @@ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> No
         self.dbm_proc.start()
         logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))
 
-        self.filesystem_proc = Process(target=filesystem_receiver,
-                                       args=(self.logdir, self.resource_msgs, dfk_run_dir),
-                                       name="Monitoring-Filesystem-Process",
-                                       daemon=True
-                                       )
-        self.filesystem_proc.start()
-        logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
+        # self.filesystem_proc = Process(target=filesystem_receiver,
+        #                               args=(self.logdir, self.resource_msgs, dfk_run_dir),
+        #                               name="Monitoring-Filesystem-Process",
+        #                               daemon=True
+        #                               )
+        # self.filesystem_proc.start()
+        # logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
 
         self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)
 
@@ -194,9 +193,23 @@ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> No
             logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
             raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")
 
-        udp_port, zmq_port = comm_q_result
+        zmq_port = comm_q_result
+
+        self.zmq_port = zmq_port
 
-        self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)
+        # need to initialize radio configs, perhaps first time a radio config is used
+        # in each executor? (can't do that at startup because executor list is dynamic,
+        # don't know all the executors till later)
+        # self.radio_config.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)
+        # How can this config be populated properly?
+        # There's a UDP port chosen right now by the monitoring router and
+        # sent back a line above...
+        # What does that look like for other radios? htexradio has no specific config at all,
+        # filesystem radio has a path (that should have been created?) for config, and a loop
+        # that needs to be running, started in this start method.
+        # so something like... radio_config.receive() generates the appropriate receiver object?
+        # which has a shutdown method on it for later. and also updates radio_config itself so
+        # it has the right info to send across the wire? or some state driving like that?
 
         logger.info("Monitoring Hub initialized")
 
@@ -228,7 +241,7 @@ def close(self) -> None:
                     )
                 self.router_proc.terminate()
                 self.dbm_proc.terminate()
-                self.filesystem_proc.terminate()
+                # self.filesystem_proc.terminate()
             logger.info("Setting router termination event")
             self.router_exit_event.set()
             logger.info("Waiting for router to terminate")
@@ -248,9 +261,9 @@ def close(self) -> None:
             # should this be message based? it probably doesn't need to be if
             # we believe we've received all messages
             logger.info("Terminating filesystem radio receiver process")
-            self.filesystem_proc.terminate()
-            self.filesystem_proc.join()
-            self.filesystem_proc.close()
+            # self.filesystem_proc.terminate()
+            # self.filesystem_proc.join()
+            # self.filesystem_proc.close()
 
             logger.info("Closing monitoring multiprocessing queues")
             self.exception_q.close()
@@ -259,6 +272,17 @@ def close(self) -> None:
             self.resource_msgs.join_thread()
             logger.info("Closed monitoring multiprocessing queues")
 
+    def start_receiver(self, radio_config: RadioConfig, ip: str) -> Any:
+        """somehow start a radio receiver here and update radioconfig to be sent over the wire, without
+        losing the info we need to shut down that receiver later...
+        """
+        r = radio_config.create_receiver(ip=ip, resource_msgs=self.resource_msgs)  # TODO: return a shutdownable...
+        logger.info(f"BENC: created receiver {r}")
+        # assert r is not None
+        return r
+        # ... that is, a thing we need to do a shutdown call on at shutdown, a "shutdownable"? without
+        # expecting any more structure on it?
+
 
 @wrap_with_logs
 def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None: