Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial workflow defs #163

Open
wants to merge 7 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions src/nomad_simulations/schema_packages/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import numpy as np
from nomad.datamodel.data import ArchiveSection
from nomad.datamodel.metainfo.annotations import ELNAnnotation
from nomad.metainfo import Datetime, Quantity


class Time(ArchiveSection):
"""
Contains time-related quantities.
"""

datetime_end = Quantity(
type=Datetime,
description="""
The date and time when this computation ended.
""",
a_eln=ELNAnnotation(component='DateTimeEditQuantity'),
)

cpu1_start = Quantity(
type=np.float64,
unit='second',
description="""
The starting time of the computation on the (first) CPU 1.
""",
a_eln=ELNAnnotation(component='NumberEditQuantity'),
)

cpu1_end = Quantity(
type=np.float64,
unit='second',
description="""
The end time of the computation on the (first) CPU 1.
""",
a_eln=ELNAnnotation(component='NumberEditQuantity'),
)

wall_start = Quantity(
type=np.float64,
unit='second',
description="""
The internal wall-clock time from the starting of the computation.
""",
a_eln=ELNAnnotation(component='NumberEditQuantity'),
)

wall_end = Quantity(
type=np.float64,
unit='second',
description="""
The internal wall-clock time from the end of the computation.
""",
a_eln=ELNAnnotation(component='NumberEditQuantity'),
)
50 changes: 4 additions & 46 deletions src/nomad_simulations/schema_packages/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from nomad.datamodel.data import Schema
from nomad.datamodel.metainfo.annotations import ELNAnnotation
from nomad.datamodel.metainfo.basesections import Activity, Entity
from nomad.metainfo import Datetime, Quantity, SchemaPackage, Section, SubSection
from nomad.metainfo import Quantity, SchemaPackage, Section, SubSection

from nomad_simulations.schema_packages.model_method import ModelMethod
from nomad_simulations.schema_packages.model_system import ModelSystem
Expand All @@ -21,6 +21,8 @@
is_not_representative,
)

from .common import Time

configuration = config.get_plugin_entry_point(
'nomad_simulations.schema_packages:nomad_simulations_plugin'
)
Expand Down Expand Up @@ -121,7 +123,7 @@ def normalize(self, archive: 'EntryArchive', logger: 'BoundLogger') -> None:
pass


class BaseSimulation(Activity):
class BaseSimulation(Activity, Time):
"""
A computational simulation that produces output data from a given input model system
and input methodological parameters.
Expand All @@ -135,50 +137,6 @@ class BaseSimulation(Activity):
links=['https://liusemweb.github.io/mdo/core/1.1/index.html#Calculation']
)

datetime_end = Quantity(
type=Datetime,
description="""
The date and time when this computation ended.
""",
a_eln=ELNAnnotation(component='DateTimeEditQuantity'),
)

cpu1_start = Quantity(
type=np.float64,
unit='second',
description="""
The starting time of the computation on the (first) CPU 1.
""",
a_eln=ELNAnnotation(component='NumberEditQuantity'),
)

cpu1_end = Quantity(
type=np.float64,
unit='second',
description="""
The end time of the computation on the (first) CPU 1.
""",
a_eln=ELNAnnotation(component='NumberEditQuantity'),
)

wall_start = Quantity(
type=np.float64,
unit='second',
description="""
The internal wall-clock time from the starting of the computation.
""",
a_eln=ELNAnnotation(component='NumberEditQuantity'),
)

wall_end = Quantity(
type=np.float64,
unit='second',
description="""
The internal wall-clock time from the end of the computation.
""",
a_eln=ELNAnnotation(component='NumberEditQuantity'),
)

program = SubSection(sub_section=Program.m_def, repeats=False)

def normalize(self, archive: 'EntryArchive', logger: 'BoundLogger') -> None:
Expand Down
5 changes: 3 additions & 2 deletions src/nomad_simulations/schema_packages/outputs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import TYPE_CHECKING, Optional

import numpy as np
from nomad.datamodel.data import ArchiveSection
from nomad.datamodel.metainfo.annotations import ELNAnnotation
from nomad.metainfo import Quantity, SubSection

Expand Down Expand Up @@ -38,8 +37,10 @@
XASSpectrum,
)

from .common import Time

class Outputs(ArchiveSection):

class Outputs(Time):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does Outputs need to inherit from Time? Is there some situation where the simulation package is outputting such detailed timing info?

I guess maybe you are thinking of the workflows. Is the plan to reuse the output class directly for workflows?

Copy link
Collaborator Author

@ladinesa ladinesa Jan 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The workflow tasks are built from the outputs and the time info are needed for determining the order of the tasks. There are a several codes that output the time for each step of a geometry optimization for example.

Copy link
Collaborator Author

@ladinesa ladinesa Jan 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed the task link at least for the general simulation workflow class references the output. For geometry optimization, I do a renormalization to extract the system to as I feel it is the natural input and output. Simulation.outputs as I understand, will not be extended to cover workflow methods and results, right? So I still would like to put the workflow-related quantities under workflow.method and workflow.results as in the old def.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, ok. I mean I am not against this I am just trying to understand.

The workflow tasks are built from the outputs
I guess you are talking about the standard simulation workflows here? And for a custom workflow, the user is specifying the ordering, so you don't need this, is that correct?

I do a renormalization to extract the system to as I feel it is the natural input and output
We should discuss this more generally, e.g., in the context of the workflow visualizer

Simulation.outputs as I understand, will not be extended to cover workflow methods and results, right?
I don't think we have explicitly discussed this. There should definitely be a separate outputs section under workflow (I thought of changing the name to outputs for consistency, but we should discuss). I don't think we made an explicit decision about this. From my side, I thought at some point it might make sense to re-use the outputs section definition under workflow, but this was more because of the flexibility of the Physical Property class, so probably that no longer makes sense.

I think I will put this on the agenda for tomorrow's meeting, and then we can resolve clearly this discussion? (You can probably continue as you think fit until then, it is unlikely that we override things that you deem necessary)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, ok. I mean I am not against this I am just trying to understand.

The workflow tasks are built from the outputs
I guess you are talking about the standard simulation workflows here? And for a custom workflow, the user is specifying the ordering, so you don't need this, is that correct?

I do a renormalization to extract the system to as I feel it is the natural input and output
We should discuss this more generally, e.g., in the context of the workflow visualizer

Simulation.outputs as I understand, will not be extended to cover workflow methods and results, right?
I don't think we have explicitly discussed this. There should definitely be a separate outputs section under workflow (I thought of changing the name to outputs for consistency, but we should discuss). I don't think we made an explicit decision about this. From my side, I thought at some point it might make sense to re-use the outputs section definition under workflow, but this was more because of the flexibility of the Physical Property class, so probably that no longer makes sense.

I think I will put this on the agenda for tomorrow's meeting, and then we can resolve clearly this discussion? (You can probably continue as you think fit until then, it is unlikely that we override things that you deem necessary)

This is for the automated workflows. Sure, we can discuss them in our meeting.

"""
Output properties of a simulation. This base class can be used for inheritance in any of the output properties
defined in this schema.
Expand Down
4 changes: 4 additions & 0 deletions src/nomad_simulations/schema_packages/workflow/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .general import SimulationWorkflow
from .geometry_optimization import GeometryOptimization
from .gw import DFTGWWorkflow
from .single_point import SinglePoint
71 changes: 71 additions & 0 deletions src/nomad_simulations/schema_packages/workflow/general.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from nomad.datamodel import EntryArchive
from nomad.datamodel.metainfo.workflow import Link, Task, Workflow
from nomad.metainfo.util import MSubSectionList
from structlog.stdlib import BoundLogger

INCORRECT_N_TASKS = 'Incorrect number of tasks found.'


class SimulationWorkflow(Workflow):
"""
Base class for simulation workflows.
"""

def normalize(self, archive: EntryArchive, logger: BoundLogger):
"""
Generate tasks from the archive data outputs.
"""
if not archive.data or not archive.data.outputs:
return

# generate tasks from outputs
if not self.tasks:
# default should to serial execution
times: list[tuple[float, float]] = list(
[
(o.wall_start or n, o.wall_end or n)
for n, o in enumerate(archive.data.outputs)
]
)
times.sort(key=lambda x: x[0])
# current parent task
parent_n = 0
parent_outputs: list[Link] = []
for n, time in enumerate(times):
task = Task(
outputs=[
Link(
name='Output',
section=archive.data.outputs[n],
)
],
)
self.tasks.append(task)
# link tasks based on overlap in execution time
if time[0] >= times[parent_n][1]:
# if no overlap, assign outputs of parent as input to next task
task.inputs.extend(
[
Link(name='Input', section=output.section)
for output in parent_outputs or task.outputs
]
)
# assign first parent outputs as workflow inputs
if not self.inputs:
self.inputs.extend(task.inputs)
# assign as new parent
parent_n = n
# reset outputs
parent_outputs = list(task.outputs)
else:
parent_outputs.extend(task.outputs)
# if overlap, assign parent outputs to task inputs
task.inputs.extend(
[
Link(name='Input', section=output.section)
for output in self.tasks[parent_n or n].outputs
]
)
if not self.outputs:
# assign parent outputs as workflow outputs
self.outputs.extend(parent_outputs)
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from nomad.datamodel import EntryArchive
from nomad.datamodel.metainfo.workflow import Link, Task
from structlog.stdlib import BoundLogger

from .general import SimulationWorkflow


class GeometryOptimization(SimulationWorkflow):
"""
Definitions for geometry optimization workflow.
"""

def normalize(self, archive: EntryArchive, logger: BoundLogger) -> None:
"""
Specify the inputs and outputs of the tasks as the model system.
"""
super().normalize(archive, logger)

def to_system_links(task: Task) -> None:
task.inputs = [
Link(name='Input system', section=link.section.model_system_ref)
for link in task.inputs
if link.section and link.section.model_system_ref
]
task.outputs = [
Link(name='Output system', section=link.section.model_system_ref)
for link in task.inputs
if link.section and link.section.model_system_ref
]

to_system_links(self)
for task in self.tasks:
to_system_links(task)
37 changes: 37 additions & 0 deletions src/nomad_simulations/schema_packages/workflow/gw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from nomad.datamodel import EntryArchive
from structlog.stdlib import BoundLogger

from .general import INCORRECT_N_TASKS, SimulationWorkflow


class DFTGWWorkflow(SimulationWorkflow):
"""
Definitions for GW calculation based on DFT workflow.
"""

def normalize(self, archive: EntryArchive, logger: BoundLogger) -> None:
"""
Link the DFT and GW single point workflows in the DFT-GW workflow.
"""
super().normalize(archive, logger)

if not self.name:
self.name: str = 'DFT+GW'

if len(self.tasks) != 2:
logger.error(INCORRECT_N_TASKS)
return

if not self.inputs:
# set inputs to inputs of DFT
self.inputs.extend(self.tasks[0].task.inputs)

if not self.outputs:
# set ouputs to outputs of GW
self.outputs.extend(self.tasks[1].task.outputs)

# link dft and gw workflows
self.tasks[0].inputs = self.inputs
self.tasks[0].outputs = self.tasks[0].task.outputs
self.tasks[1].inputs = self.tasks[0].outputs
self.tasks[1].outputs = self.outputs
40 changes: 40 additions & 0 deletions src/nomad_simulations/schema_packages/workflow/single_point.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from nomad.datamodel import EntryArchive
from nomad.datamodel.metainfo.workflow import Link
from structlog.stdlib import BoundLogger

from .general import INCORRECT_N_TASKS, SimulationWorkflow


class SinglePoint(SimulationWorkflow):
"""
Definitions for single point workflow.
"""

def normalize(self, archive: EntryArchive, logger: BoundLogger) -> None:
"""
Specify the method and system as inputs.
"""
super().normalize(archive, logger)
if len(self.tasks) != 1:
logger.error(INCORRECT_N_TASKS)
return

if not self.inputs:
self.inputs.extend(self.tasks[0].inputs)

inps: list[Link] = []
for inp in self.inputs:
if inp.section and inp.section.model_system_ref:
inps.append(
Link(name='Input system', section=inp.section.model_system_ref)
)
if inp.section and inp.section.model_method_ref:
inps.append(
Link(name='Input method', section=inp.section.model_method_ref)
)
self.inputs.clear()
self.inputs.extend(inps)

# reconnect inputs to link as these are redefined
self.tasks[0].inputs.clear()
self.tasks[0].inputs.extend(inps)
Loading