Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add static and runtime dag info, API to fetch ancestor and successor tasks #2124

Merged
merged 42 commits into from
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
255485b
Add static and runtime dag info, API to fetch ancestor tasks
talsperre Oct 21, 2024
bd377a9
Add API to get immediate successors
talsperre Oct 31, 2024
0214a61
Add API for getting closest siblings
talsperre Oct 31, 2024
480d360
Update metadata API params
talsperre Nov 1, 2024
0131f43
Refactor ancestor and successor client code
talsperre Nov 1, 2024
d70cf98
Remove unnecessary prints
talsperre Nov 1, 2024
70863e5
Support querying ancestors and successors in local metadata provider
talsperre Jan 4, 2025
b01fc7d
Refactor and simplify client code
talsperre Jan 4, 2025
f08be5a
Make query logic more descriptive
talsperre Jan 4, 2025
cf49ace
Add core tests for ancestor task API
talsperre Jan 7, 2025
6218ffe
Add core test for immediate successor API
talsperre Jan 7, 2025
6500813
Add endpoint in OSS metadata service
talsperre Jan 7, 2025
c997a0b
Add logs to tests
talsperre Jan 10, 2025
6eac1b0
Log for each stack to metadata, update query logic
talsperre Jan 12, 2025
5b739e7
Add more comments to code
talsperre Jan 12, 2025
53fb3b8
Run black formatting
talsperre Jan 12, 2025
9963ddd
Set monitor to None in filter tasks API
talsperre Jan 14, 2025
294f283
import urlencode
talsperre Jan 14, 2025
f39d140
Address comments
talsperre Jan 14, 2025
bec68f7
Update logic for siblings, make it work for static splits as well
talsperre Jan 14, 2025
e5f1ed7
update service url for filter task requests. update query param names.
saikonen Jan 15, 2025
959e6a3
Fix bug in parsing steps due to different data formats across metadat…
talsperre Jan 21, 2025
7c827b5
json serialize the ancestry metadata
saikonen Jan 27, 2025
f4936fa
Address comments
talsperre Jan 31, 2025
32a889e
Update docstrings
talsperre Jan 31, 2025
7b1d717
Remove duplicate code
talsperre Jan 31, 2025
bc9e456
Address comments
talsperre Feb 12, 2025
b03f0c4
Remove commented out code
talsperre Feb 12, 2025
67ee87e
Update docstrings
talsperre Feb 12, 2025
ff27d87
Remove spurious import in core
talsperre Feb 12, 2025
adc5bf7
Update OSS metadata service API call
talsperre Feb 12, 2025
6aae908
Remove commented code from parent task tests
talsperre Feb 12, 2025
035e3a3
Remove spurious function
talsperre Feb 12, 2025
f821d1e
Remove spurious comment
talsperre Feb 12, 2025
ffffee4
Address comments
talsperre Feb 19, 2025
9b93945
Update tests
talsperre Feb 19, 2025
81bac83
Address comments
talsperre Feb 20, 2025
997b8fd
Update docstrings
talsperre Feb 20, 2025
ea66248
Address black comments
talsperre Feb 20, 2025
618e23b
Address black comments
talsperre Feb 20, 2025
f3b63d5
Update docstrings, remove duplicate property
talsperre Feb 20, 2025
9541757
Return metadata service version needed for runtime dag apis
talsperre Feb 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 173 additions & 1 deletion metaflow/client/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ def __iter__(self) -> Iterator["MetaflowObject"]:
_CLASSES[self._CHILD_CLASS]._NAME,
query_filter,
self._attempt,
*self.path_components
*self.path_components,
)
unfiltered_children = unfiltered_children if unfiltered_children else []
children = filter(
Expand Down Expand Up @@ -1123,6 +1123,143 @@ def _iter_filter(self, x):
# exclude private data artifacts
return x.id[0] != "_"

def _iter_matching_tasks(self, steps, metadata_key, metadata_pattern):
    """
    Yield tasks from the given steps whose metadata field matches a regex pattern.

    Parameters
    ----------
    steps : List[Step]
        Step objects to search for tasks; only each step's `id` is used here.
    metadata_key : str
        Name of the metadata field to match against
        (e.g. "foreach-execution-path").
    metadata_pattern : str
        Regex pattern that the metadata field value must match.

    Yields
    ------
    Task
        Tasks whose `metadata_key` value matches `metadata_pattern`.
        Constructed with `_namespace_check=False`, so tasks outside the
        current namespace are included.
    """
    flow_id, run_id, _, _ = self.path_components

    for step in steps:
        # The metadata provider performs the actual filtering and returns
        # the pathspecs of matching tasks for this step.
        task_pathspecs = self._metaflow.metadata.filter_tasks_by_metadata(
            flow_id, run_id, step.id, metadata_key, metadata_pattern
        )
        for task_pathspec in task_pathspecs:
            yield Task(pathspec=task_pathspec, _namespace_check=False)

@property
def parent_tasks(self) -> Iterator["Task"]:
    """
    Yields all parent tasks of the current task if one exists.

    A parent task is a task in an immediately preceding step of the DAG
    whose foreach execution path places it directly upstream of this task.

    Yields
    ------
    Task
        Parent task of the current task
    """
    flow_id, run_id, _, _ = self.path_components

    steps = list(self.parent.parent_steps)
    if not steps:
        # The "start" step has no parents. A bare `return` is the idiomatic
        # way to end a generator; the previous `return []` value was
        # silently discarded by the generator protocol anyway.
        return

    current_path = self.metadata_dict.get("foreach-execution-path", "")

    if len(steps) > 1:
        # Static join - use exact path matching
        pattern = current_path or ".*"
        yield from self._iter_matching_tasks(
            steps, "foreach-execution-path", pattern
        )
        return

    # Handle single step case: compare foreach depths between this task and
    # a representative task of the (single) parent step to decide the pattern.
    target_task = Step(
        f"{flow_id}/{run_id}/{steps[0].id}", _namespace_check=False
    ).task
    target_path = target_task.metadata_dict.get("foreach-execution-path")

    if not target_path or not current_path:
        # (Current task, "A:10") and (Parent task, "")
        # Pattern: ".*"
        pattern = ".*"
    else:
        current_depth = len(current_path.split(","))
        target_depth = len(target_path.split(","))

        if current_depth < target_depth:
            # Foreach join
            # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13,C:21")
            # Pattern: "A:10,B:13,.*"
            pattern = f"{current_path},.*"
        else:
            # Foreach split or linear step
            # Option 1:
            # (Current task, "A:10,B:13,C:21") and (Parent task, "A:10,B:13")
            # Option 2:
            # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13")
            # Pattern: "A:10,B:13"
            pattern = ",".join(current_path.split(",")[:target_depth])

    yield from self._iter_matching_tasks(steps, "foreach-execution-path", pattern)

@property
def child_tasks(self) -> Iterator["Task"]:
    """
    Yield all child tasks of the current task if one exists.

    A child task is a task in an immediately following step of the DAG
    whose foreach execution path places it directly downstream of this task.

    Yields
    ------
    Task
        Child task of the current task
    """
    flow_id, run_id, _, _ = self.path_components
    steps = list(self.parent.child_steps)
    if not steps:
        # The "end" step has no children. A bare `return` is the idiomatic
        # way to end a generator; the previous `return []` value was
        # silently discarded by the generator protocol anyway.
        return

    current_path = self.metadata_dict.get("foreach-execution-path", "")

    if len(steps) > 1:
        # Static split - use exact path matching
        pattern = current_path or ".*"
        yield from self._iter_matching_tasks(
            steps, "foreach-execution-path", pattern
        )
        return

    # Handle single step case: compare foreach depths between this task and
    # a representative task of the (single) child step to decide the pattern.
    target_task = Step(
        f"{flow_id}/{run_id}/{steps[0].id}", _namespace_check=False
    ).task
    target_path = target_task.metadata_dict.get("foreach-execution-path")

    if not target_path or not current_path:
        # (Current task, "A:10") and (Child task, "")
        # Pattern: ".*"
        pattern = ".*"
    else:
        current_depth = len(current_path.split(","))
        target_depth = len(target_path.split(","))

        if current_depth < target_depth:
            # Foreach split
            # (Current task, "A:10,B:13") and (Child task, "A:10,B:13,C:21")
            # Pattern: "A:10,B:13,.*"
            pattern = f"{current_path},.*"
        else:
            # Foreach join or linear step
            # Option 1:
            # (Current task, "A:10,B:13,C:21") and (Child task, "A:10,B:13")
            # Option 2:
            # (Current task, "A:10,B:13") and (Child task, "A:10,B:13")
            # Pattern: "A:10,B:13"
            pattern = ",".join(current_path.split(",")[:target_depth])

    yield from self._iter_matching_tasks(steps, "foreach-execution-path", pattern)

@property
def metadata(self) -> List[Metadata]:
"""
Expand Down Expand Up @@ -1837,6 +1974,41 @@ def environment_info(self) -> Optional[Dict[str, Any]]:
for t in self:
return t.environment_info

@property
def parent_steps(self) -> Iterator["Step"]:
    """
    Yields parent steps for the current step.

    Parents are found by scanning the static DAG info recorded at runtime
    ("_graph_info") for every step that lists this step as a successor.

    Yields
    ------
    Step
        Parent step
    """
    dag = self.task["_graph_info"].data

    # "start" is the DAG root and therefore has no parents.
    if self.id == "start":
        return

    flow_name, run_id, _ = self.path_components
    for step_name, step_info in dag["steps"].items():
        if self.id in step_info["next"]:
            yield Step(f"{flow_name}/{run_id}/{step_name}", _namespace_check=False)

@property
def child_steps(self) -> Iterator["Step"]:
    """
    Yields child steps for the current step.

    Children are read directly from this step's "next" entry in the static
    DAG info recorded at runtime ("_graph_info").

    Yields
    ------
    Step
        Child step
    """
    dag = self.task["_graph_info"].data

    # "end" is the DAG sink and therefore has no children.
    if self.id == "end":
        return

    flow_name, run_id, _ = self.path_components
    for successor in dag["steps"][self.id]["next"]:
        yield Step(f"{flow_name}/{run_id}/{successor}", _namespace_check=False)


class Run(MetaflowObject):
"""
Expand Down
33 changes: 33 additions & 0 deletions metaflow/metadata_provider/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collections import namedtuple
from itertools import chain

from typing import List
from metaflow.exception import MetaflowInternalError, MetaflowTaggingError
from metaflow.tagging_util import validate_tag
from metaflow.util import get_username, resolve_identity_as_tuple, is_stringish
Expand Down Expand Up @@ -672,6 +673,38 @@ def _register_system_metadata(self, run_id, step_name, task_id, attempt):
if metadata:
self.register_metadata(run_id, step_name, task_id, metadata)

@classmethod
def filter_tasks_by_metadata(
    cls,
    flow_name: str,
    run_id: str,
    step_name: str,
    field_name: str,
    pattern: str,
) -> List[str]:
    """
    Filter tasks by metadata field and pattern, returning task pathspecs that match criteria.

    This is an abstract hook; concrete metadata providers must override it.

    Parameters
    ----------
    flow_name : str
        Flow name, that the run belongs to.
    run_id : str
        Run id, together with flow_name, that identifies the specific Run whose tasks to query
    step_name : str
        Step name to query tasks from
    field_name : str
        Metadata field name to query
    pattern : str
        Pattern to match in metadata field value

    Returns
    -------
    List[str]
        List of task pathspecs that satisfy the query

    Raises
    ------
    NotImplementedError
        Always, in this base class; subclasses provide the implementation.
    """
    raise NotImplementedError()

@staticmethod
def _apply_filter(elts, filters):
if filters is None:
Expand Down
66 changes: 66 additions & 0 deletions metaflow/plugins/metadata_providers/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import glob
import json
import os
import re
import random
import tempfile
import time
from collections import namedtuple
from typing import List

from metaflow.exception import MetaflowInternalError, MetaflowTaggingError
from metaflow.metadata_provider.metadata import ObjectOrder
Expand Down Expand Up @@ -202,6 +204,70 @@ def _optimistically_mutate():
"Tagging failed due to too many conflicting updates from other processes"
)

@classmethod
def filter_tasks_by_metadata(
    cls,
    flow_name: str,
    run_id: str,
    step_name: str,
    field_name: str,
    pattern: str,
) -> List[str]:
    """
    Filter tasks by metadata field and pattern, returning task pathspecs that match criteria.

    Parameters
    ----------
    flow_name : str
        Identifier for the flow
    run_id : str
        Identifier for the run
    step_name : str
        Name of the step to query tasks from
    field_name : str
        Name of metadata field to query
    pattern : str
        Pattern to match in metadata field value

    Returns
    -------
    List[str]
        List of task pathspecs that match the query criteria
    """
    tasks = cls.get_object("step", "task", {}, None, flow_name, run_id, step_name)
    if not tasks:
        return []

    # NOTE(review): re.match anchors only at the start of the string, so
    # `pattern` also matches strict prefixes (e.g. "A:10" matches "A:10,B:2").
    # This mirrors the service-side endpoint's behavior — confirm before
    # tightening to fullmatch.
    regex = re.compile(pattern)
    matching_task_pathspecs = []

    for task in tasks:
        task_id = task.get("task_id")
        if not task_id:
            continue

        if pattern == ".*":
            # If the pattern is ".*", we can match all tasks without reading metadata
            matching_task_pathspecs.append(
                f"{flow_name}/{run_id}/{step_name}/{task_id}"
            )
            continue

        metadata = cls.get_object(
            "task", "metadata", {}, None, flow_name, run_id, step_name, task_id
        )

        # Guard with `or []`: a task with no metadata listing must not crash
        # the whole query with a TypeError when iterating None.
        if any(
            meta.get("field_name") == field_name
            and regex.match(meta.get("value", ""))
            for meta in metadata or []
        ):
            matching_task_pathspecs.append(
                f"{flow_name}/{run_id}/{step_name}/{task_id}"
            )

    return matching_task_pathspecs

@classmethod
def _get_object_internal(
cls, obj_type, obj_order, sub_type, sub_order, filters, attempt, *args
Expand Down
51 changes: 51 additions & 0 deletions metaflow/plugins/metadata_providers/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import requests

from typing import List
from metaflow.exception import (
MetaflowException,
MetaflowInternalError,
Expand All @@ -13,6 +14,7 @@
from metaflow.metadata_provider.heartbeat import HB_URL_KEY
from metaflow.metaflow_config import SERVICE_HEADERS, SERVICE_RETRY_COUNT, SERVICE_URL
from metaflow.sidecar import Message, MessageTypes, Sidecar
from urllib.parse import urlencode
from metaflow.util import version_parse


Expand Down Expand Up @@ -318,6 +320,55 @@ def _new_task(
self._register_system_metadata(run_id, step_name, task["task_id"], attempt)
return task["task_id"], did_create

@classmethod
def filter_tasks_by_metadata(
    cls,
    flow_name: str,
    run_id: str,
    step_name: str,
    field_name: str,
    pattern: str,
) -> List[str]:
    """
    Filter tasks by metadata field and pattern, returning task pathspecs that match criteria.

    Parameters
    ----------
    flow_name : str
        Flow name, that the run belongs to.
    run_id : str
        Run id, together with flow_name, that identifies the specific Run whose tasks to query
    step_name : str
        Step name to query tasks from
    field_name : str
        Metadata field name to query
    pattern : str
        Pattern to match in metadata field value

    Returns
    -------
    List[str]
        List of task pathspecs that satisfy the query

    Raises
    ------
    MetaflowInternalError
        If the deployed metadata service is too old to expose the
        `filtered_tasks` endpoint (HTTP 404).
    """
    query_params = {
        "metadata_field_name": field_name,
        "pattern": pattern,
        "step_name": step_name,
    }
    url = ServiceMetadataProvider._obj_path(flow_name, run_id, step_name)
    url = f"{url}/filtered_tasks?{urlencode(query_params)}"
    try:
        resp = cls._request(None, url, "GET")
    except Exception as e:
        # Only service errors carry `http_code`; use getattr so that a
        # connection failure (no such attribute) does not raise a masking
        # AttributeError here.
        if getattr(e, "http_code", None) == 404:
            # filter_tasks_by_metadata endpoint does not exist in the version of metadata service
            # deployed currently. Raise a more informative error message.
            raise MetaflowInternalError(
                "The version of metadata service deployed currently does not support filtering tasks by metadata. "
                "Upgrade Metadata service to version 2.15 or greater to use this feature."
            ) from e
        # Re-raise anything else: the original code fell through to
        # `return resp` with `resp` unbound (UnboundLocalError), silently
        # swallowing the real failure.
        raise
    return resp

@staticmethod
def _obj_path(
flow_name,
Expand Down
Loading
Loading