Skip to content
This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

Commit

Permalink
Add ray dashboard log link (#405)
Browse files Browse the repository at this point in the history
* Add ray dashboard log link

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* A few fixes

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* lint

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* lint

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* Add EnableUsageStats start parameter

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* Add more tests

Signed-off-by: Haytham Abuelfutuh <[email protected]>

* more tests

Signed-off-by: Haytham Abuelfutuh <[email protected]>

---------

Signed-off-by: Haytham Abuelfutuh <[email protected]>
  • Loading branch information
EngHabu committed Sep 23, 2023
1 parent 2598c96 commit 8b417f4
Show file tree
Hide file tree
Showing 7 changed files with 243 additions and 50 deletions.
8 changes: 6 additions & 2 deletions go/tasks/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,19 @@ var (
rootSection = config.MustRegisterSection(configSectionKey, &Config{})
)

// Top level plugins config.
// Config is the top level plugins config.
// It is intentionally empty: it exists only so a root "plugins" config
// section can be registered; individual plugins attach their own settings
// beneath it via MustRegisterSubSection / MustRegisterSubSectionWithUpdates.
type Config struct {
}

// Retrieves the current config value or default.
// GetConfig retrieves the current config value or default.
// The type assertion is safe because rootSection was registered with a
// *Config, so the stored value is always of that type.
func GetConfig() *Config {
return rootSection.GetConfig().(*Config)
}

// MustRegisterSubSection registers a plugin's config section under the root
// plugins section and returns it. Per the Must convention it panics if
// registration fails (delegates to rootSection.MustRegisterSection).
func MustRegisterSubSection(subSectionKey string, section config.Config) config.Section {
return rootSection.MustRegisterSection(subSectionKey, section)
}

// MustRegisterSubSectionWithUpdates registers a plugin's config section under
// the root plugins section, additionally wiring sectionUpdatedFn as a callback
// for when the section's value changes. Per the Must convention it panics if
// registration fails.
func MustRegisterSubSectionWithUpdates(subSectionKey string, section config.Config, sectionUpdatedFn config.SectionUpdated) config.Section {
return rootSection.MustRegisterSectionWithUpdates(subSectionKey, section, sectionUpdatedFn)
}
4 changes: 2 additions & 2 deletions go/tasks/pluginmachinery/core/phase.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ func phaseInfo(p Phase, v uint32, err *core.ExecutionError, info *TaskInfo, clea
}
}

// Return in the case the plugin is not ready to start
// PhaseInfoNotReady represents the case the plugin is not ready to start
func PhaseInfoNotReady(t time.Time, version uint32, reason string) PhaseInfo {
pi := phaseInfo(PhaseNotReady, version, nil, &TaskInfo{OccurredAt: &t}, false)
pi.reason = reason
Expand All @@ -198,7 +198,7 @@ func PhaseInfoWaitingForResources(t time.Time, version uint32, reason string) Ph
return pi
}

// Return in the case the plugin is not ready to start
// PhaseInfoWaitingForResourcesInfo represents the case where the plugin is waiting for resources, carrying the provided TaskInfo
func PhaseInfoWaitingForResourcesInfo(t time.Time, version uint32, reason string, info *TaskInfo) PhaseInfo {
pi := phaseInfo(PhaseWaitingForResources, version, nil, info, false)
pi.reason = reason
Expand Down
54 changes: 50 additions & 4 deletions go/tasks/plugins/k8s/ray/config.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
package ray

import (
"context"

pluginsConfig "github.com/flyteorg/flyteplugins/go/tasks/config"
"github.com/flyteorg/flyteplugins/go/tasks/logs"
pluginmachinery "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/k8s"
"github.com/flyteorg/flytestdlib/config"
)

//go:generate pflags Config --default-var=defaultConfig
Expand All @@ -14,10 +18,39 @@ var (
ServiceType: "NodePort",
IncludeDashboard: true,
DashboardHost: "0.0.0.0",
NodeIPAddress: "$MY_POD_IP",
EnableUsageStats: false,
Defaults: DefaultConfig{
HeadNode: NodeConfig{
StartParameters: map[string]string{
// Disable usage reporting by default: https://docs.ray.io/en/latest/cluster/usage-stats.html
DisableUsageStatsStartParameter: "true",
},
IPAddress: "$MY_POD_IP",
},
WorkerNode: NodeConfig{
StartParameters: map[string]string{
// Disable usage reporting by default: https://docs.ray.io/en/latest/cluster/usage-stats.html
DisableUsageStatsStartParameter: "true",
},
IPAddress: "$MY_POD_IP",
},
},
}

configSection = pluginsConfig.MustRegisterSubSection("ray", &defaultConfig)
configSection = pluginsConfig.MustRegisterSubSectionWithUpdates("ray", &defaultConfig,
func(ctx context.Context, newValue config.Config) {
if newValue == nil {
return
}

if len(newValue.(*Config).Defaults.HeadNode.IPAddress) == 0 {
newValue.(*Config).Defaults.HeadNode.IPAddress = newValue.(*Config).DeprecatedNodeIPAddress
}

if len(newValue.(*Config).Defaults.WorkerNode.IPAddress) == 0 {
newValue.(*Config).Defaults.WorkerNode.IPAddress = newValue.(*Config).DeprecatedNodeIPAddress
}
})
)

// Config is config for 'ray' plugin
Expand All @@ -39,11 +72,24 @@ type Config struct {
// or 0.0.0.0 (available from all interfaces). By default, this is localhost.
DashboardHost string `json:"dashboardHost,omitempty"`

// NodeIPAddress the IP address of the head node. By default, this is pod ip address.
NodeIPAddress string `json:"nodeIPAddress,omitempty"`
// DeprecatedNodeIPAddress the IP address of the head node. By default, this is pod ip address.
DeprecatedNodeIPAddress string `json:"nodeIPAddress,omitempty" pflag:"-,DEPRECATED. Please use DefaultConfig.[HeadNode|WorkerNode].IPAddress"`

// Remote Ray Cluster Config
RemoteClusterConfig pluginmachinery.ClusterConfig `json:"remoteClusterConfig" pflag:"Configuration of remote K8s cluster for ray jobs"`
Logs logs.LogConfig `json:"logs" pflag:"-,Log configuration for ray jobs"`
Defaults DefaultConfig `json:"defaults" pflag:"-,Default configuration for ray jobs"`
EnableUsageStats bool `json:"enableUsageStats" pflag:",Enable usage stats for ray jobs. These stats are submitted to usage-stats.ray.io per https://docs.ray.io/en/latest/cluster/usage-stats.html"`
}

// DefaultConfig holds per-node-role defaults (head vs. worker) applied to a
// ray job when the task spec does not supply its own values.
type DefaultConfig struct {
HeadNode NodeConfig `json:"headNode,omitempty" pflag:"-,Default configuration for head node of ray jobs"`
WorkerNode NodeConfig `json:"workerNode,omitempty" pflag:"-,Default configuration for worker node of ray jobs"`
}

// NodeConfig captures the defaults for a single ray node role.
type NodeConfig struct {
// StartParameters are default `ray start` parameters for the node,
// used when the task's RayStartParams are not set.
StartParameters map[string]string `json:"startParameters,omitempty" pflag:"-,Start parameters for the node"`
// IPAddress is the node's IP; may be an env-var reference such as
// "$MY_POD_IP" (see defaultConfig above).
IPAddress string `json:"ipAddress,omitempty" pflag:"-,IP address of the node"`
}

func GetConfig() *Config {
Expand Down
2 changes: 1 addition & 1 deletion go/tasks/plugins/k8s/ray/config_flags.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 14 additions & 14 deletions go/tasks/plugins/k8s/ray/config_flags_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

76 changes: 51 additions & 25 deletions go/tasks/plugins/k8s/ray/ray.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"fmt"
"strconv"
"strings"
"time"

"github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core"
"github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/tasklog"

"github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins"
"github.com/flyteorg/flyteplugins/go/tasks/logs"
"github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery"
Expand All @@ -27,11 +27,12 @@ import (
)

// Identifiers and `ray start` parameter keys used when building the RayJob
// Kubernetes resource.
const (
// NOTE(review): this source is a rendered diff with +/- markers stripped;
// the next five lines are the PRE-change version of the block and
// duplicate the post-change lines that follow. A real source file would
// contain only one set — confirm against the repository.
rayTaskType = "ray"
KindRayJob = "RayJob"
IncludeDashboard = "include-dashboard"
NodeIPAddress = "node-ip-address"
DashboardHost = "dashboard-host"
// Post-change version of the block follows.
rayTaskType = "ray"
KindRayJob = "RayJob"
IncludeDashboard = "include-dashboard"
NodeIPAddress = "node-ip-address"
DashboardHost = "dashboard-host"
// DisableUsageStatsStartParameter opts the cluster out of Ray usage-stats
// reporting: https://docs.ray.io/en/latest/cluster/usage-stats.html
DisableUsageStatsStartParameter = "disable-usage-stats"
)

type rayJobResourceHandler struct {
Expand All @@ -57,7 +58,6 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC
}

podSpec, objectMeta, primaryContainerName, err := flytek8s.ToK8sPodSpec(ctx, taskCtx)

if err != nil {
return nil, flyteerr.Errorf(flyteerr.BadTaskSpecification, "Unable to create pod spec: [%v]", err.Error())
}
Expand All @@ -76,26 +76,36 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC
return nil, flyteerr.Errorf(flyteerr.BadTaskSpecification, "Unable to get primary container from the pod: [%v]", err.Error())
}

cfg := GetConfig()
headReplicas := int32(1)
headNodeRayStartParams := make(map[string]string)
if rayJob.RayCluster.HeadGroupSpec != nil && rayJob.RayCluster.HeadGroupSpec.RayStartParams != nil {
headNodeRayStartParams = rayJob.RayCluster.HeadGroupSpec.RayStartParams
} else if headNode := cfg.Defaults.HeadNode; len(headNode.StartParameters) > 0 {
headNodeRayStartParams = headNode.StartParameters
}

if _, exist := headNodeRayStartParams[IncludeDashboard]; !exist {
headNodeRayStartParams[IncludeDashboard] = strconv.FormatBool(GetConfig().IncludeDashboard)
}

if _, exist := headNodeRayStartParams[NodeIPAddress]; !exist {
headNodeRayStartParams[NodeIPAddress] = GetConfig().NodeIPAddress
headNodeRayStartParams[NodeIPAddress] = cfg.Defaults.HeadNode.IPAddress
}

if _, exist := headNodeRayStartParams[DashboardHost]; !exist {
headNodeRayStartParams[DashboardHost] = GetConfig().DashboardHost
headNodeRayStartParams[DashboardHost] = cfg.DashboardHost
}

if _, exists := headNodeRayStartParams[DisableUsageStatsStartParameter]; !exists && !cfg.EnableUsageStats {
headNodeRayStartParams[DisableUsageStatsStartParameter] = "true"
}

enableIngress := true
rayClusterSpec := rayv1alpha1.RayClusterSpec{
HeadGroupSpec: rayv1alpha1.HeadGroupSpec{
Template: buildHeadPodTemplate(&container, podSpec, objectMeta, taskCtx),
ServiceType: v1.ServiceType(GetConfig().ServiceType),
ServiceType: v1.ServiceType(cfg.ServiceType),
Replicas: &headReplicas,
EnableIngress: &enableIngress,
RayStartParams: headNodeRayStartParams,
Expand All @@ -111,16 +121,24 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC
if spec.MinReplicas != 0 {
minReplicas = spec.MinReplicas
}

if spec.MaxReplicas != 0 {
maxReplicas = spec.MaxReplicas
}

workerNodeRayStartParams := make(map[string]string)
if spec.RayStartParams != nil {
workerNodeRayStartParams = spec.RayStartParams
} else if workerNode := cfg.Defaults.WorkerNode; len(workerNode.StartParameters) > 0 {
workerNodeRayStartParams = workerNode.StartParameters
}

if _, exist := workerNodeRayStartParams[NodeIPAddress]; !exist {
workerNodeRayStartParams[NodeIPAddress] = GetConfig().NodeIPAddress
workerNodeRayStartParams[NodeIPAddress] = cfg.Defaults.WorkerNode.IPAddress
}

if _, exists := workerNodeRayStartParams[DisableUsageStatsStartParameter]; !exists && !cfg.EnableUsageStats {
workerNodeRayStartParams[DisableUsageStatsStartParameter] = "true"
}

workerNodeSpec := rayv1alpha1.WorkerGroupSpec{
Expand All @@ -145,8 +163,8 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC
jobSpec := rayv1alpha1.RayJobSpec{
RayClusterSpec: rayClusterSpec,
Entrypoint: strings.Join(container.Args, " "),
ShutdownAfterJobFinishes: GetConfig().ShutdownAfterJobFinishes,
TTLSecondsAfterFinished: &GetConfig().TTLSecondsAfterFinished,
ShutdownAfterJobFinishes: cfg.ShutdownAfterJobFinishes,
TTLSecondsAfterFinished: &cfg.TTLSecondsAfterFinished,
RuntimeEnv: rayJob.RuntimeEnv,
}

Expand Down Expand Up @@ -347,12 +365,10 @@ func (rayJobResourceHandler) BuildIdentityResource(ctx context.Context, taskCtx
}, nil
}

func getEventInfoForRayJob() (*pluginsCore.TaskInfo, error) {
taskLogs := make([]*core.TaskLog, 0, 3)
logPlugin, err := logs.InitializeLogPlugins(logs.GetLogConfig())

func getEventInfoForRayJob(logConfig logs.LogConfig, pluginContext k8s.PluginContext, rayJob *rayv1alpha1.RayJob) (*pluginsCore.TaskInfo, error) {
logPlugin, err := logs.InitializeLogPlugins(&logConfig)
if err != nil {
return nil, err
return nil, fmt.Errorf("failed to initialize log plugins. Error: %w", err)
}

if logPlugin == nil {
Expand All @@ -362,22 +378,31 @@ func getEventInfoForRayJob() (*pluginsCore.TaskInfo, error) {
// TODO: Retrieve the name of head pod from rayJob.status, and add it to task logs
// RayJob CRD does not include the name of the worker or head pod for now

// TODO: Add ray Dashboard URI to task logs
taskID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID()
logOutput, err := logPlugin.GetTaskLogs(tasklog.Input{
Namespace: rayJob.Namespace,
TaskExecutionIdentifier: &taskID,
})

if err != nil {
return nil, fmt.Errorf("failed to generate task logs. Error: %w", err)
}

return &pluginsCore.TaskInfo{
Logs: taskLogs,
Logs: logOutput.TaskLogs,
}, nil
}

func (rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContext, resource client.Object) (pluginsCore.PhaseInfo, error) {
func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContext, resource client.Object) (pluginsCore.PhaseInfo, error) {
rayJob := resource.(*rayv1alpha1.RayJob)
info, err := getEventInfoForRayJob()
info, err := getEventInfoForRayJob(GetConfig().Logs, pluginContext, rayJob)
if err != nil {
return pluginsCore.PhaseInfoUndefined, err
}

switch rayJob.Status.JobStatus {
case rayv1alpha1.JobStatusPending:
return pluginsCore.PhaseInfoNotReady(time.Now(), pluginsCore.DefaultPhaseVersion, "job is pending"), nil
return pluginsCore.PhaseInfoInitializing(rayJob.Status.StartTime.Time, pluginsCore.DefaultPhaseVersion, "job is pending", info), nil
case rayv1alpha1.JobStatusFailed:
reason := fmt.Sprintf("Failed to create Ray job: %s", rayJob.Name)
return pluginsCore.PhaseInfoFailure(flyteerr.TaskFailedWithError, reason, info), nil
Expand All @@ -386,7 +411,8 @@ func (rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginContext k8s
case rayv1alpha1.JobStatusRunning:
return pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, info), nil
}
return pluginsCore.PhaseInfoQueued(time.Now(), pluginsCore.DefaultPhaseVersion, "JobCreated"), nil

return pluginsCore.PhaseInfoQueued(rayJob.CreationTimestamp.Time, pluginsCore.DefaultPhaseVersion, "JobCreated"), nil
}

func init() {
Expand Down
Loading

0 comments on commit 8b417f4

Please sign in to comment.