Skip to content

Commit 15512c2

Browse files
authored
Revert "Revert "Route core worker ERROR/FATAL logs to driver logs (#1… (ray-project#18604)
1 parent 31e1638 commit 15512c2

File tree

4 files changed

+49
-17
lines changed

4 files changed

+49
-17
lines changed

python/ray/tests/test_output.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,25 @@ def foo(out_str, err_str):
3232
assert err_str.split("\n")[-2].endswith("def")
3333

3434

35+
@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.")
36+
def test_core_worker_error_message():
37+
script = """
38+
import ray
39+
import sys
40+
41+
ray.init(local_mode=True)
42+
43+
# In local mode this generates an ERROR level log.
44+
ray._private.utils.push_error_to_driver(
45+
ray.worker.global_worker, "type", "Hello there")
46+
"""
47+
48+
proc = run_string_as_driver_nonblocking(script)
49+
err_str = proc.stderr.read().decode("ascii")
50+
51+
assert "Hello there" in err_str, err_str
52+
53+
3554
@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.")
3655
def test_disable_driver_logs_breakpoint():
3756
script = """

src/ray/common/ray_config_def.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@ RAY_CONFIG(bool, preallocate_plasma_memory, false)
9898
/// then spread via weighted (by critical resource usage).
9999
RAY_CONFIG(bool, scheduler_hybrid_scheduling, true)
100100

101+
/// The fraction of resource utilization on a node after which the scheduler starts
102+
/// to prefer spreading tasks to other nodes. This balances between locality and
103+
/// even balancing of load. Low values (min 0.0) encourage more load spreading.
101104
RAY_CONFIG(float, scheduler_spread_threshold,
102105
getenv("RAY_SCHEDULER_SPREAD_THRESHOLD") != nullptr
103106
? std::stof(getenv("RAY_SCHEDULER_SPREAD_THRESHOLD"))

src/ray/core_worker/core_worker.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -570,13 +570,13 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_
570570
// Retry after a delay to emulate the existing Raylet reconstruction
571571
// behaviour. TODO(ekl) backoff exponentially.
572572
uint32_t delay = RayConfig::instance().task_retry_delay_ms();
573-
RAY_LOG(ERROR) << "Will resubmit task after a " << delay
574-
<< "ms delay: " << spec.DebugString();
573+
RAY_LOG(INFO) << "Will resubmit task after a " << delay
574+
<< "ms delay: " << spec.DebugString();
575575
absl::MutexLock lock(&mutex_);
576576
to_resubmit_.push_back(std::make_pair(current_time_ms() + delay, spec));
577577
} else {
578-
RAY_LOG(ERROR) << "Resubmitting task that produced lost plasma object: "
579-
<< spec.DebugString();
578+
RAY_LOG(INFO) << "Resubmitting task that produced lost plasma object: "
579+
<< spec.DebugString();
580580
if (spec.IsActorTask()) {
581581
auto actor_handle = actor_manager_->GetActorHandle(spec.ActorId());
582582
actor_handle->SetResubmittedActorTaskSpec(spec, spec.ActorDummyObject());

src/ray/util/logging.cc

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,10 @@ void RayLog::StartRayLog(const std::string &app_name, RayLogLevel severity_thres
196196
app_name_ = app_name;
197197
log_dir_ = log_dir;
198198

199+
// All the logging sinks to add.
200+
std::vector<spdlog::sink_ptr> sinks;
201+
auto level = static_cast<spdlog::level::level_enum>(severity_threshold_);
202+
199203
if (!log_dir_.empty()) {
200204
// Enable log file if log_dir_ is not empty.
201205
std::string dir_ends_with_slash = log_dir_;
@@ -243,26 +247,32 @@ void RayLog::StartRayLog(const std::string &app_name, RayLogLevel severity_thres
243247
// logger.
244248
spdlog::drop(RayLog::GetLoggerName());
245249
}
246-
file_logger = spdlog::rotating_logger_mt(
247-
RayLog::GetLoggerName(),
250+
auto file_sink = std::make_shared<spdlog::sinks::rotating_file_sink_mt>(
248251
dir_ends_with_slash + app_name_without_path + "_" + std::to_string(pid) + ".log",
249252
log_rotation_max_size_, log_rotation_file_num_);
250-
spdlog::set_default_logger(file_logger);
253+
sinks.push_back(file_sink);
251254
} else {
252255
auto console_sink = std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
253256
console_sink->set_pattern(log_format_pattern_);
254-
auto level = static_cast<spdlog::level::level_enum>(severity_threshold_);
255257
console_sink->set_level(level);
256-
257-
auto err_sink = std::make_shared<spdlog::sinks::stderr_color_sink_mt>();
258-
err_sink->set_pattern(log_format_pattern_);
259-
err_sink->set_level(spdlog::level::err);
260-
261-
auto logger = std::shared_ptr<spdlog::logger>(
262-
new spdlog::logger(RayLog::GetLoggerName(), {console_sink, err_sink}));
263-
logger->set_level(level);
264-
spdlog::set_default_logger(logger);
258+
sinks.push_back(console_sink);
265259
}
260+
261+
// In all cases, log errors to the console log so they are in driver logs.
262+
// https://github.com/ray-project/ray/issues/12893
263+
auto err_sink = std::make_shared<spdlog::sinks::stderr_color_sink_mt>();
264+
err_sink->set_pattern(log_format_pattern_);
265+
err_sink->set_level(spdlog::level::err);
266+
sinks.push_back(err_sink);
267+
268+
// Set the combined logger.
269+
auto logger = std::make_shared<spdlog::logger>(RayLog::GetLoggerName(), sinks.begin(),
270+
sinks.end());
271+
logger->set_level(level);
272+
logger->set_pattern(log_format_pattern_);
273+
spdlog::set_level(static_cast<spdlog::level::level_enum>(severity_threshold_));
274+
spdlog::set_pattern(log_format_pattern_);
275+
spdlog::set_default_logger(logger);
266276
}
267277

268278
void RayLog::UninstallSignalAction() {

0 commit comments

Comments
 (0)