Skip to content

Commit 5d5cb8a

Browse files
Merge pull request #171 from rosswhitfield/child_runs
Add abillity to set parent/child relation between different runs for portal.
2 parents b328a33 + 8387541 commit 5d5cb8a

10 files changed

+108
-22
lines changed

doc/user_guides/child_runs.png

65 KB
Loading

doc/user_guides/child_runs_trace.png

130 KB
Loading

doc/user_guides/portal_guides.rst

+41-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,11 @@ in either your :doc:`Platform Configuration File<platform>` or your
3535
Tracing
3636
-------
3737

38-
IPS (version >= 0.6.0) has the ability to capture a trace of the
38+
.. note::
39+
40+
New in IPS-Framework 0.6.0
41+
42+
IPS has the ability to capture a trace of the
3943
workflow to allow analysis and visualizations. The traces are captured
4044
in the `Zipkin Span format <https://zipkin.io/zipkin-api/>`_ and
4145
viewed within IPS portal using `Jaeger
@@ -60,3 +64,39 @@ The statistics can be further broken down by operation.
6064
.. note::
6165

6266
Self time (ST) is the total time spent in a span when it was not waiting on children. For example, a 10ms span with two 4ms non-overlapping children would have self-time = 10ms - 2 * 4ms = 2ms.
67+
68+
69+
Child Runs
70+
----------
71+
72+
.. note::
73+
74+
New in IPS-Framework 0.7.0
75+
76+
If you have a workflow where you are running ``ips`` as a task of
77+
another IPS simulation you can create a relation between them that
78+
will allow it to be viewed together in the IPS-portal and get a single
79+
trace for the entire collection.
80+
81+
To setup the hierarchical structure between different IPS runs, so if
82+
one run starts other runs as a separate simulation, you can set the
83+
``PARENT_PORTAL_RUNID`` parameter in the child simulation
84+
configuration. This can be done dynamically from the parent simulation
85+
like:
86+
87+
.. code-block:: python
88+
89+
child_conf['PARENT_PORTAL_RUNID'] = self.services.get_config_param("PORTAL_RUNID")
90+
91+
This is automatically configured when running
92+
``ips_dakota_dynamic.py``.
93+
94+
The child runs will not appear on the main runs list but will appear
95+
on a tab next to the events.
96+
97+
.. image:: child_runs.png
98+
99+
The trace of the primary simulation will contain the traces from all
100+
the simulations:
101+
102+
.. image:: child_runs_trace.png

ipsframework/configurationManager.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import uuid
1010
import logging
1111
import socket
12+
import time
1213
from multiprocessing import Queue, Process, set_start_method
1314
from .configobj import ConfigObj
1415
from . import ipsLogging
@@ -40,7 +41,8 @@ class SimulationData:
4041
entry in the configurationManager class
4142
"""
4243

43-
def __init__(self, sim_name):
44+
def __init__(self, sim_name, start_time=time.time()):
45+
self.start_time = start_time
4446
self.sim_name = sim_name
4547
self.portal_sim_name = None
4648
self.sim_root = None
@@ -281,7 +283,7 @@ def initialize(self, data_mgr, resource_mgr, task_mgr):
281283
sim_name_list.append(sim_name)
282284
sim_root_list.append(sim_root)
283285
log_file_list.append(log_file)
284-
new_sim = self.SimulationData(sim_name)
286+
new_sim = self.SimulationData(sim_name, self.fwk.start_time)
285287
conf['__PORTAL_SIM_NAME'] = sim_name
286288
new_sim.sim_conf = conf
287289
new_sim.config_file = conf_file
@@ -301,7 +303,7 @@ def initialize(self, data_mgr, resource_mgr, task_mgr):
301303
if not self.fwk_sim_name:
302304
fwk_sim_conf = conf.dict()
303305
fwk_sim_conf['SIM_NAME'] = '_'.join([conf['SIM_NAME'], 'FWK'])
304-
fwk_sim = self.SimulationData(fwk_sim_conf['SIM_NAME'])
306+
fwk_sim = self.SimulationData(fwk_sim_conf['SIM_NAME'], self.fwk.start_time)
305307
fwk_sim.sim_conf = fwk_sim_conf
306308
fwk_sim.sim_root = new_sim.sim_root
307309
fwk_sim.log_file = self.fwk.log_file # sys.stdout
@@ -380,6 +382,7 @@ def _initialize_fwk_components(self):
380382
portal_conf['USER'] = self.sim_map[self.fwk_sim_name].sim_conf['USER']
381383
except KeyError:
382384
portal_conf['USER'] = self.platform_conf['USER']
385+
portal_conf['HOST'] = self.platform_conf['HOST']
383386
if self.fwk.log_level == logging.DEBUG:
384387
portal_conf['LOG_LEVEL'] = 'DEBUG'
385388

@@ -502,7 +505,6 @@ def _create_component(self, comp_conf, sim_data):
502505

503506
# SIMYAN: removed else conditional, copying files in runspaceInit
504507
# component now
505-
506508
svc_response_q = Queue(0)
507509
invocation_q = Queue(0)
508510
component_id = ComponentID(class_name, sim_name)
@@ -512,7 +514,7 @@ def _create_component(self, comp_conf, sim_data):
512514
services_proxy = ServicesProxy(self.fwk, fwk_inq, svc_response_q,
513515
sim_data.sim_conf, log_pipe_name)
514516
new_component = component_class(services_proxy, comp_conf)
515-
new_component.__initialize__(component_id, invocation_q, self.fwk.start_time)
517+
new_component.__initialize__(component_id, invocation_q, sim_data.start_time)
516518
services_proxy.__initialize__(new_component)
517519
self.comp_registry.addEntry(component_id, svc_response_q,
518520
invocation_q, new_component,
@@ -643,7 +645,7 @@ def create_simulation(self, sim_name, config_file, override, sub_workflow=False)
643645
self.sim_name_list.append(sim_name)
644646
self.sim_root_list.append(sim_root)
645647
self.log_file_list.append(log_file)
646-
new_sim = self.SimulationData(sim_name)
648+
new_sim = self.SimulationData(sim_name, start_time=self.fwk.start_time if sub_workflow else time.time())
647649
new_sim.sim_conf = conf
648650
new_sim.config_file = config_file
649651
new_sim.sim_root = sim_root

ipsframework/dakota_bridge.py

+1
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def step(self, timestamp=0, **keywords): # pragma: no cover
136136
self.old_master_conf['SIM_NAME'] = self.sim_name + '_%s' % (instance_id)
137137
self.old_master_conf['LOG_FILE'] = self.sim_logfile + '_%s' % (instance_id)
138138
self.old_master_conf['OUT_REDIRECT'] = 'TRUE'
139+
self.old_master_conf['PARENT_PORTAL_RUNID'] = services.get_config_param("PORTAL_RUNID")
139140
fname = "%s.out" % (self.old_master_conf['SIM_NAME'])
140141
fname = os.path.join(self.sim_root, fname)
141142
self.old_master_conf['OUT_REDIRECT_FNAME'] = fname

ipsframework/ips.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -603,15 +603,15 @@ def _send_monitor_event(self, sim_name='', eventType='', comment='', ok=True, ta
603603
portal_data['eventtype'] = eventType
604604
portal_data['ok'] = ok
605605
portal_data['comment'] = comment
606-
portal_data['walltime'] = '%.2f' % (event_time - self.start_time)
606+
portal_data['walltime'] = '%.2f' % (event_time - self.config_manager.sim_map[sim_name].start_time)
607607
portal_data['time'] = getTimeString(time.localtime(event_time))
608608

609609
topic_name = '_IPS_MONITOR'
610610
# portal_data['phystimestamp'] = self.timeStamp
611611
get_config = self.config_manager.get_config_parameter
612612
if eventType == 'IPS_START':
613613
portal_data['state'] = 'Running'
614-
portal_data['host'] = get_config(sim_name, 'HOST')
614+
portal_data['host'] = self.config_manager.get_platform_parameter('HOST')
615615
try:
616616
portal_data['outputprefix'] = get_config(sim_name, 'OUTPUT_PREFIX')
617617
except KeyError:
@@ -647,18 +647,24 @@ def _send_monitor_event(self, sim_name='', eventType='', comment='', ok=True, ta
647647
portal_data['sim_runid'] = get_config(sim_name, 'RUN_ID')
648648
except KeyError:
649649
pass
650-
portal_data['startat'] = getTimeString(time.localtime(self.start_time))
650+
portal_data['startat'] = getTimeString(time.localtime(self.config_manager.sim_map[sim_name].start_time))
651651
portal_data['ips_version'] = get_versions()['version']
652+
653+
try:
654+
portal_data['parent_portal_runid'] = get_config(sim_name, 'PARENT_PORTAL_RUNID')
655+
except KeyError:
656+
pass
657+
652658
elif eventType == 'IPS_END':
653659
portal_data['state'] = 'Completed'
654660
portal_data['stopat'] = getTimeString(time.localtime(event_time))
655661
# Zipkin json format
656-
portal_data['trace'] = {"timestamp": int(self.start_time*1e6),
657-
"duration": int((event_time - self.start_time)*1e6),
662+
portal_data['trace'] = {"timestamp": int(self.config_manager.sim_map[sim_name].start_time*1e6),
663+
"duration": int((event_time - self.config_manager.sim_map[sim_name].start_time)*1e6),
658664
"localEndpoint": {
659-
"serviceName": str(self.component_id)
665+
"serviceName": f'{sim_name}@{self.component_id}'
660666
},
661-
"id": hashlib.md5(str(self.component_id).encode()).hexdigest()[:16],
667+
"id": hashlib.md5(f'{sim_name}@{self.component_id}'.encode()).hexdigest()[:16],
662668
'tags': {'total_cores': str(self.resource_manager.total_cores)}}
663669
elif eventType == "IPS_CALL_END":
664670
trace = {} # Zipkin json format
@@ -669,7 +675,7 @@ def _send_monitor_event(self, sim_name='', eventType='', comment='', ok=True, ta
669675
trace['localEndpoint'] = {"serviceName": target}
670676
trace['name'] = operation
671677
trace['id'] = hashlib.md5(f"{target}:{operation}".encode()).hexdigest()[:16]
672-
trace["parentId"] = hashlib.md5(str(self.component_id).encode()).hexdigest()[:16]
678+
trace["parentId"] = hashlib.md5(f'{sim_name}@{self.component_id}'.encode()).hexdigest()[:16]
673679

674680
if trace:
675681
portal_data['trace'] = trace

ipsframework/portalBridge.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,6 @@ def __init__(self, services, config):
8181
:py:class:`component.Component` object.
8282
"""
8383
super().__init__(services, config)
84-
self.host = ''
8584
self.curTime = time.localtime()
8685
self.startTime = self.curTime
8786
self.sim_map = {}
@@ -109,7 +108,6 @@ def init(self, timestamp=0.0, **keywords):
109108
self.portal_url = self.PORTAL_URL
110109
except AttributeError:
111110
pass
112-
self.host = self.services.get_config_param('HOST')
113111
self.services.subscribe('_IPS_MONITOR', "process_event")
114112
try:
115113
freq = int(self.services.get_config_param("HTML_DUMP_FREQ", silent=True))
@@ -427,7 +425,7 @@ def init_simulation(self, sim_name, sim_root):
427425

428426
d = datetime.datetime.now()
429427
date_str = "%s.%03d" % (d.strftime("%Y-%m-%dT%H:%M:%S"), int(d.microsecond / 1000))
430-
sim_data.portal_runid = "_".join([self.host, "USER", date_str])
428+
sim_data.portal_runid = "_".join([sim_name, getattr(self, "HOST"), getattr(self, "USER"), date_str])
431429
try:
432430
self.services.set_config_param('PORTAL_RUNID', sim_data.portal_runid,
433431
target_sim_name=sim_name)
@@ -445,8 +443,7 @@ def init_simulation(self, sim_name, sim_root):
445443
(sim_log_dir, oserr.errno, oserr.strerror))
446444
raise
447445

448-
sim_data.monitor_file_name = os.path.join(sim_log_dir,
449-
sim_data.sim_name + '_' + sim_data.portal_runid + '.eventlog')
446+
sim_data.monitor_file_name = os.path.join(sim_log_dir, sim_data.portal_runid + '.eventlog')
450447
try:
451448
sim_data.monitor_file = open(sim_data.monitor_file_name, 'wb', 0)
452449
except IOError as oserr:

tests/dakota/dakota_test_Gaussian.ips

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
RUN_ID = DAKOTA_Rosenbrock # Identifier for this simulation run
1+
RUN_ID = DAKOTA_Gaussian # Identifier for this simulation run
22
TOKAMAK_ID = TEST
33
SHOT_NUMBER = 1 # Numerical identifier for specific case
44

tests/dakota/test_dakota.py

+39
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import shutil
44
import glob
55
import sys
6+
import json
67
import pytest
78
from ipsframework import ips_dakota_dynamic
89

@@ -48,6 +49,44 @@ def test_dakota(tmpdir):
4849

4950
assert float(X) == pytest.approx(0.5, rel=1e-4)
5051

52+
# Check PARENT CHILD relationship
53+
# Get parent PORTAL_RUNID
54+
json_files = glob.glob(str(tmpdir.join("DAKOTA_Gaussian_TEST_1").join("simulation_log").join("*.json")))
55+
assert len(json_files) == 1
56+
57+
with open(json_files[0], 'r') as json_file:
58+
lines = json_file.readlines()
59+
60+
lines = [json.loads(line.strip()) for line in lines]
61+
assert len(lines) == 9
62+
63+
# get portal_runid event
64+
portal_runid = lines[0]['portal_runid']
65+
sim_name, host, user, _ = portal_runid.rsplit('_', maxsplit=3)
66+
assert sim_name == "DAKOTA_Gaussian_TEST_1"
67+
assert host == "workstation"
68+
assert user == "user"
69+
70+
# Check child run
71+
json_files = glob.glob(str(tmpdir.join("DAKOTA_Gaussian_TEST_1").join("simulation_*_0000").join("simulation_log").join("*.json")))
72+
assert len(json_files) == 1
73+
with open(json_files[0], 'r') as json_file:
74+
lines = json_file.readlines()
75+
76+
lines = [json.loads(line.strip()) for line in lines]
77+
assert len(lines) == 8
78+
child_portal_runid = lines[0]['portal_runid']
79+
assert child_portal_runid != portal_runid
80+
81+
sim_name, host, user, _ = child_portal_runid.rsplit('_', maxsplit=3)
82+
assert sim_name.startswith("DAKOTA_Gaussian_TEST_1")
83+
assert len(sim_name) > len("DAKOTA_Gaussian_TEST_1")
84+
assert host == "workstation"
85+
assert user == "user"
86+
87+
parent_portal_runid = lines[0]['parent_portal_runid']
88+
assert parent_portal_runid == portal_runid
89+
5190

5291
@mock.patch('ipsframework.ips_dakota_dynamic.DakotaDynamic')
5392
def test_dakota_main(MockDakotaDynamic):

tests/dakota/workstation.conf

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
HOST = my_laptop
1+
HOST = workstation
2+
USER = user
23
MPIRUN = eval
34

45
#######################################

0 commit comments

Comments
 (0)