From 1ca231c89a7c27ba1d14c715b1b7123f81884ccb Mon Sep 17 00:00:00 2001 From: David Anderson Date: Wed, 11 Dec 2024 17:00:45 -0800 Subject: [PATCH 1/9] web and scheduler: enable plan classes for BUDA If you make a variant of a BUDA app for a plan class (e.g. NVIDIA GPU with CUDA) this ensures that jobs submitted to that variant are sent only to capable hosts, and that the host usage and projected FLOPS are set correctly. On the web side, we add a <plan_class> element to workunit.xml_doc. This gets sent to the scheduler. On the scheduler this required some reorganization. As the scheduler scans jobs, it finds and caches a BEST_APP_VERSION for each app. This contains a HOST_USAGE. In the case of BUDA, the host usage depends on the workunit, not the app version. We might scan several BUDA jobs; they'll all use the same APP_VERSION, but they could have different plan classes and therefore different HOST_USAGE. So if we're looking at a job to send, and the WU has a <plan_class> element, call app_plan() to check the host capability and get the host usage. Change add_result_to_reply() so that it takes a HOST_USAGE& argument, rather than getting it from the BEST_APP_VERSION. We do this in several places: - sched_array (old scheduling policy) - sched_score (new scheduling policy) - sched_locality (locality scheduling) - sched_resend (resending lost jobs) - sched_assign (assigned jobs) so all these functions work properly with BUDA apps. ----------------- Also: the input and output templates for a BUDA app variant depend only on the variant, not on batches or jobs. 
So generate them when the variant is created, and store them in the variant dir, rather than generating them on batch submission Also: fix bug in downloading batch output as .zip; need to do zip -q --- html/inc/submit_util.inc | 39 +++++++++++++- html/user/buda.php | 44 ++++++++++++++++ html/user/buda_submit.php | 105 ++++++-------------------------------- html/user/get_output3.php | 2 +- sched/sched_array.cpp | 6 ++- sched/sched_assign.cpp | 2 +- sched/sched_customize.cpp | 12 +++-- sched/sched_locality.cpp | 6 ++- sched/sched_nci.cpp | 2 +- sched/sched_resend.cpp | 8 ++- sched/sched_score.cpp | 9 +++- sched/sched_score.h | 1 + sched/sched_send.cpp | 56 +++++++++++++++++--- sched/sched_send.h | 7 +++ sched/sched_version.cpp | 6 ++- 15 files changed, 196 insertions(+), 109 deletions(-) diff --git a/html/inc/submit_util.inc b/html/inc/submit_util.inc index a3731be6f79..4d5ec9e726a 100644 --- a/html/inc/submit_util.inc +++ b/html/inc/submit_util.inc @@ -17,7 +17,7 @@ // You should have received a copy of the GNU Lesser General Public License // along with BOINC. If not, see . 
-// server-side utility functions for remote job submissions and control +// server-side utility functions for remote job submission and control require_once("../inc/submit_db.inc"); @@ -370,4 +370,41 @@ function parse_info_file($path) { return [$md5, $size]; } +///////////////// TEMPLATE CREATION ////////////// + +function file_ref_in($fname) { + return(sprintf( +' + %s + + +', + $fname + )); +} +function file_info_out($i) { + return sprintf( +' + + + + 5000000 + + +', + $i + ); +} + +function file_ref_out($i, $fname) { + return sprintf( +' + + %s + + +', $i, $fname + ); +} + ?> diff --git a/html/user/buda.php b/html/user/buda.php index aea8aeea7d6..e5cff7e8df0 100644 --- a/html/user/buda.php +++ b/html/user/buda.php @@ -165,6 +165,48 @@ function copy_and_stage_file($user, $fname, $dir, $app, $variant) { return $phys_name; } +// create templates and put them in variant dir +// +function create_templates($variant, $variant_desc, $dir) { + // input template + // + $x = "\n"; + $ninfiles = 1 + count($variant_desc->input_file_names) + count($variant_desc->app_files); + for ($i=0; $i<$ninfiles; $i++) { + $x .= " \n \n \n"; + } + $x .= " \n"; + $x .= file_ref_in($variant_desc->dockerfile); + foreach ($variant_desc->app_files as $fname) { + $x .= file_ref_in($fname); + } + foreach ($variant_desc->input_file_names as $fname) { + $x .= file_ref_in($fname); + } + if ($variant == 'cpu') { + $x .= " \n"; + } else { + $x .= " $variant\n"; + } + $x .= " \n\n"; + file_put_contents("$dir/template_in", $x); + + // output template + // + $x = "\n"; + $i = 0; + foreach ($variant_desc->output_file_names as $fname) { + $x .= file_info_out($i++); + } + $x .= " \n"; + $i = 0; + foreach ($variant_desc->output_file_names as $fname) { + $x .= file_ref_out($i++, $fname); + } + $x .= " \n\n"; + file_put_contents("$dir/template_out", $x); +} + // create variant // function variant_action($user) { @@ -219,6 +261,8 @@ function variant_action($user) { json_encode($desc, JSON_PRETTY_PRINT) ); + 
create_templates($variant, $desc, $dir); + // Note: we don't currently allow indirect file access. // If we did, we'd need to create job.toml to mount project dir diff --git a/html/user/buda_submit.php b/html/user/buda_submit.php index 26c9d40fbd6..6b97a276d14 100644 --- a/html/user/buda_submit.php +++ b/html/user/buda_submit.php @@ -147,13 +147,13 @@ function parse_batch_dir($batch_dir, $variant_desc) { } function create_batch($user, $njobs, $app, $variant) { - global $buda_app; + global $buda_boinc_app; $now = time(); $batch_name = sprintf('buda_%d_%d', $user->id, $now); $description = "$app ($variant)"; $batch_id = BoincBatch::insert(sprintf( "(user_id, create_time, logical_start_time, logical_end_time, est_completion_time, njobs, fraction_done, nerror_jobs, state, completion_time, credit_estimate, credit_canonical, credit_total, name, app_id, project_state, description, expire_time) values (%d, %d, 0, 0, 0, %d, 0, 0, %d, 0, 0, 0, 0, '%s', %d, 0, '%s', 0)", - $user->id, $now, $njobs, BATCH_STATE_INIT, $batch_name, $buda_app->id, + $user->id, $now, $njobs, BATCH_STATE_INIT, $batch_name, $buda_boinc_app->id, $description )); return BoincBatch::lookup_id($batch_id); @@ -188,17 +188,21 @@ function stage_input_files($batch_dir, $batch_desc, $batch_id) { // Use --stdin, where each job is described by a line // function create_jobs( - $variant_desc, $batch_desc, $batch_id, $batch_dir_name, + $app, $variant, $variant_desc, + $batch_desc, $batch_id, $batch_dir_name, $wrapper_verbose, $cmdline ) { - global $buda_app; + global $buda_boinc_app; - // get list of names of app files + // get list of physical names of app files // $app_file_names = $variant_desc->dockerfile_phys; foreach ($variant_desc->app_files_phys as $pname) { $app_file_names .= " $pname"; } + + // make per-job lines to pass as stdin + // $job_cmds = ''; foreach ($batch_desc->jobs as $job) { $job_cmd = sprintf('--wu_name batch_%d__job_%s', $batch_id, $job->dir); @@ -221,10 +225,10 @@ function create_jobs( 
); $cmd = sprintf( 'cd ../..; bin/create_work --appname %s --batch %d --stdin --command_line %s --wu_template %s --result_template %s', - $buda_app->name, $batch_id, + $buda_boinc_app->name, $batch_id, $cw_cmdline, - "buda_batches/$batch_dir_name/template_in", - "buda_batches/$batch_dir_name/template_out" + "buda_apps/$app/$variant/template_in", + "buda_apps/$app/$variant/template_out" ); $cmd .= sprintf(' > %s 2<&1', "buda_batches/errfile"); @@ -242,80 +246,6 @@ function create_jobs( } } -///////////////// TEMPLATE CREATION ////////////// - -function file_ref_in($fname) { - return(sprintf( -' - %s - - -', - $fname - )); -} -function file_info_out($i) { - return sprintf( -' - - - - 5000000 - - -', - $i - ); -} - -function file_ref_out($i, $fname) { - return sprintf( -' - - %s - - -', $i, $fname - ); -} - -// create templates and put them in batch dir -// -function create_templates($variant_desc, $batch_dir) { - // input template - // - $x = "\n"; - $ninfiles = 1 + count($variant_desc->input_file_names) + count($variant_desc->app_files); - for ($i=0; $i<$ninfiles; $i++) { - $x .= " \n \n \n"; - } - $x .= " \n"; - $x .= file_ref_in($variant_desc->dockerfile); - foreach ($variant_desc->app_files as $fname) { - $x .= file_ref_in($fname); - } - foreach ($variant_desc->input_file_names as $fname) { - $x .= file_ref_in($fname); - } - $x .= " \n\n"; - file_put_contents("$batch_dir/template_in", $x); - - // output template - // - $x = "\n"; - $i = 0; - foreach ($variant_desc->output_file_names as $fname) { - $x .= file_info_out($i++); - } - $x .= " \n"; - $i = 0; - foreach ($variant_desc->output_file_names as $fname) { - $x .= file_ref_out($i++, $fname); - } - $x .= " \n\n"; - file_put_contents("$batch_dir/template_out", $x); -} - function handle_submit($user) { $app = get_str('app'); if (!is_valid_filename($app)) die('bad arg'); @@ -338,8 +268,6 @@ function handle_submit($user) { // scan batch dir; validate and return struct $batch_desc = parse_batch_dir($batch_dir, 
$variant_desc); - create_templates($variant_desc, $batch_dir); - $batch = create_batch( $user, count($batch_desc->jobs), $app, $variant ); @@ -349,7 +277,8 @@ function handle_submit($user) { stage_input_files($batch_dir, $batch_desc, $batch->id); create_jobs( - $variant_desc, $batch_desc, $batch->id, $batch_dir_name, + $app, $variant, $variant_desc, + $batch_desc, $batch->id, $batch_dir_name, $wrapper_verbose, $cmdline ); @@ -365,9 +294,9 @@ function handle_submit($user) { } $user = get_logged_in_user(); -$buda_app = BoincApp::lookup("name='buda'"); -if (!$buda_app) error_page('no buda app'); -if (!has_submit_access($user, $buda_app->id)) { +$buda_boinc_app = BoincApp::lookup("name='buda'"); +if (!$buda_boinc_app) error_page('no buda app'); +if (!has_submit_access($user, $buda_boinc_app->id)) { error_page('no access'); } $action = get_str('action', true); diff --git a/html/user/get_output3.php b/html/user/get_output3.php index 2da95847926..b8f8fb7385b 100644 --- a/html/user/get_output3.php +++ b/html/user/get_output3.php @@ -47,7 +47,7 @@ function get_batch() { $batch_id = get_str('batch_id'); $dir = "../../results/$batch_id"; $name = "batch_$batch_id.zip"; - $cmd = "cd $dir; rm -f $name; zip $name *"; + $cmd = "cd $dir; rm -f $name; zip -q $name *"; system($cmd); do_download("$dir/$name"); } diff --git a/sched/sched_array.cpp b/sched/sched_array.cpp index e28d19ceb52..36bc83fb7fd 100644 --- a/sched/sched_array.cpp +++ b/sched/sched_array.cpp @@ -323,7 +323,11 @@ static bool scan_work_array() { // result.id = wu_result.resultid; if (result_still_sendable(result, wu)) { - add_result_to_reply(result, wu, bavp, false); + HOST_USAGE hu; + if (!handle_wu_plan_class(wu, bavp, hu)) { + continue; + } + add_result_to_reply(result, wu, bavp, hu, false); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. 
diff --git a/sched/sched_assign.cpp b/sched/sched_assign.cpp index 260ed9a6db6..938e6eccfe5 100644 --- a/sched/sched_assign.cpp +++ b/sched/sched_assign.cpp @@ -154,7 +154,7 @@ static int send_assigned_job(ASSIGNMENT& asg) { DB_ID_TYPE result_id = boinc_db.insert_id(); SCHED_DB_RESULT result; retval = result.lookup_id(result_id); - add_result_to_reply(result, wu, bavp, false); + add_result_to_reply(result, wu, bavp, bavp->host_usage, false); if (config.debug_assignment) { log_messages.printf(MSG_NORMAL, diff --git a/sched/sched_customize.cpp b/sched/sched_customize.cpp index a424b9bac99..db960d05954 100644 --- a/sched/sched_customize.cpp +++ b/sched/sched_customize.cpp @@ -959,15 +959,21 @@ static inline bool app_plan_vbox( return true; } -static inline bool app_plan_wsl(SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu) { +static inline bool app_plan_wsl( + SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu +) { // no additional checks at the moment, just return true return true; } -// app planning function. 
+// if host can handle the plan class, populate host usage and return true +// // See https://github.com/BOINC/boinc/wiki/AppPlan // -bool app_plan(SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu, const WORKUNIT* wu) { +bool app_plan( + SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu, + const WORKUNIT* wu +) { char buf[256]; static bool check_plan_class_spec = true; static bool have_plan_class_spec = false; diff --git a/sched/sched_locality.cpp b/sched/sched_locality.cpp index a780d1f2456..3674fe00f3a 100644 --- a/sched/sched_locality.cpp +++ b/sched/sched_locality.cpp @@ -332,7 +332,11 @@ static int possibly_send_result(SCHED_DB_RESULT& result) { if (count > 0) return ERR_WU_USER_RULE; } - return add_result_to_reply(result, wu, bavp, true); + HOST_USAGE hu; + if (!handle_wu_plan_class(wu, bavp, hu)) { + return false; + } + return add_result_to_reply(result, wu, bavp, hu, true); } // Retrieves and returns a trigger instance identified by the given diff --git a/sched/sched_nci.cpp b/sched/sched_nci.cpp index de3082efabe..3792a4ba4ce 100644 --- a/sched/sched_nci.cpp +++ b/sched/sched_nci.cpp @@ -94,7 +94,7 @@ static int send_job_for_app(APP& app) { "Sending non-CPU-intensive job: %s\n", wu.name ); } - add_result_to_reply(result, wu, bavp, false); + add_result_to_reply(result, wu, bavp, bavp->host_usage, false); return 0; } log_messages.printf(MSG_NORMAL, diff --git a/sched/sched_resend.cpp b/sched/sched_resend.cpp index 1eedcecd42b..776bf5b8f39 100644 --- a/sched/sched_resend.cpp +++ b/sched/sched_resend.cpp @@ -249,7 +249,13 @@ bool resend_lost_work() { ); g_reply->insert_message(warning_msg, "low"); } else { - retval = add_result_to_reply(result, wu, bavp, false); + HOST_USAGE host_usage; + if (!handle_wu_plan_class(wu, bavp, host_usage)) { + continue; + } + retval = add_result_to_reply( + result, wu, bavp, host_usage, false + ); if (retval) { log_messages.printf(MSG_CRITICAL, "[HOST#%lu] failed to send [RESULT#%lu]\n", diff --git 
a/sched/sched_score.cpp b/sched/sched_score.cpp index fc738f407d3..abecf756820 100644 --- a/sched/sched_score.cpp +++ b/sched/sched_score.cpp @@ -202,6 +202,7 @@ void send_work_score_type(int rt) { } WORKUNIT wu = wu_result.workunit; JOB job; + job.app = ssp->lookup_app(wu.appid); if (job.app->non_cpu_intensive) { if (config.debug_send_job) { @@ -223,6 +224,12 @@ void send_work_score_type(int rt) { continue; } + // check WU plan class (for BUDA jobs) + // + if (!handle_wu_plan_class(wu, job.bavp, job.host_usage)) { + continue; + } + job.index = i; job.result_id = wu_result.resultid; if (!job.get_score(i)) { @@ -350,7 +357,7 @@ void send_work_score_type(int rt) { SCHED_DB_RESULT result; result.id = wu_result.resultid; if (result_still_sendable(result, wu)) { - add_result_to_reply(result, wu, job.bavp, false); + add_result_to_reply(result, wu, job.bavp, job.host_usage, false); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. diff --git a/sched/sched_score.h b/sched/sched_score.h index 693a55ac5d3..d14e5a26ce3 100644 --- a/sched/sched_score.h +++ b/sched/sched_score.h @@ -23,6 +23,7 @@ struct JOB { double score; APP* app; BEST_APP_VERSION* bavp; + HOST_USAGE host_usage; bool get_score(int); }; diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp index 46ea751bf00..4f360335bad 100644 --- a/sched/sched_send.cpp +++ b/sched/sched_send.cpp @@ -885,10 +885,50 @@ inline static DB_ID_TYPE get_app_version_id(BEST_APP_VERSION* bavp) { } } +static bool wu_has_plan_class(WORKUNIT &wu, char* buf) { + char *p = strstr(wu.xml_doc, ""); + if (!p) return false; + p += strlen(""); + strncpy(buf, p, 256); + p = strstr(buf, ""); + if (!p) return false; + *p = 0; + return true; +} + +// if workunit has a plan class (e.g. 
BUDA), check it +// in any case, fill in the HOST_USAGE +// +bool handle_wu_plan_class( + WORKUNIT &wu, BEST_APP_VERSION *bavp, HOST_USAGE &hu +) { + char plan_class[256]; + if (wu_has_plan_class(wu, plan_class)) { + if (strlen(plan_class)) { + if (!app_plan(*g_request, plan_class, hu, &wu)) { + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] [AV#%lu] app_plan(%s) returned false\n", + bavp->avp->id, plan_class + ); + } + // can't send this job + return false; + } + } else { + hu.sequential_app(g_reply->host.p_fpops); + } + } else { + hu = bavp->host_usage; + } + return true; +} + int add_result_to_reply( SCHED_DB_RESULT& result, WORKUNIT& wu, BEST_APP_VERSION* bavp, + HOST_USAGE &host_usage, bool locality_scheduling ) { int retval; @@ -899,7 +939,7 @@ int add_result_to_reply( result.userid = g_reply->user.id; result.sent_time = time(0); result.report_deadline = result.sent_time + wu.delay_bound; - result.flops_estimate = bavp->host_usage.peak_flops; + result.flops_estimate = host_usage.peak_flops; result.app_version_id = get_app_version_id(bavp); // update WU DB record. 
@@ -978,7 +1018,7 @@ int add_result_to_reply( double est_dur = estimate_duration(wu, *bavp); if (config.debug_send) { - double max_time = wu.rsc_fpops_bound / bavp->host_usage.projected_flops; + double max_time = wu.rsc_fpops_bound / host_usage.projected_flops; char buf1[64],buf2[64]; secs_to_hmsf(est_dur, buf1); secs_to_hmsf(max_time, buf2); @@ -1017,11 +1057,11 @@ int add_result_to_reply( // because the scheduling of GPU jobs is constrained by the # of GPUs // if (g_wreq->rsc_spec_request) { - int pt = bavp->host_usage.proc_type; + int pt = host_usage.proc_type; if (pt == PROC_TYPE_CPU) { - double est_cpu_secs = est_dur*bavp->host_usage.avg_ncpus; + double est_cpu_secs = est_dur*host_usage.avg_ncpus; g_wreq->req_secs[PROC_TYPE_CPU] -= est_cpu_secs; - g_wreq->req_instances[PROC_TYPE_CPU] -= bavp->host_usage.avg_ncpus; + g_wreq->req_instances[PROC_TYPE_CPU] -= host_usage.avg_ncpus; if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] est_dur %f est_cpu_secs %f; new req_secs %f\n", @@ -1029,9 +1069,9 @@ int add_result_to_reply( ); } } else { - double est_gpu_secs = est_dur*bavp->host_usage.gpu_usage; + double est_gpu_secs = est_dur*host_usage.gpu_usage; g_wreq->req_secs[pt] -= est_gpu_secs; - g_wreq->req_instances[pt] -= bavp->host_usage.gpu_usage; + g_wreq->req_instances[pt] -= host_usage.gpu_usage; if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] est_dur %f est_gpu_secs %f; new req_secs %f\n", @@ -1045,7 +1085,7 @@ int add_result_to_reply( } update_estimated_delay(*bavp, est_dur); g_wreq->njobs_sent++; - config.max_jobs_in_progress.register_job(app, bavp->host_usage.proc_type); + config.max_jobs_in_progress.register_job(app, host_usage.proc_type); if (!resent_result) { DB_HOST_APP_VERSION* havp = bavp->host_app_version(); if (havp) { diff --git a/sched/sched_send.h b/sched/sched_send.h index 373533f0c8b..49a135fb19a 100644 --- a/sched/sched_send.h +++ b/sched/sched_send.h @@ -33,9 +33,16 @@ extern void send_work(); 
extern int add_result_to_reply( SCHED_DB_RESULT& result, WORKUNIT& wu, BEST_APP_VERSION* bavp, + HOST_USAGE&, bool locality_scheduling ); +// if WU has plan class, get corresponding host_usage +// +extern bool handle_wu_plan_class( + WORKUNIT &wu, BEST_APP_VERSION *bavp, HOST_USAGE &host_usage +); + inline bool is_anonymous(PLATFORM* platform) { return (!strcmp(platform->name, "anonymous")); } diff --git a/sched/sched_version.cpp b/sched/sched_version.cpp index c7ba2a0fdb1..59e0043d8fd 100644 --- a/sched/sched_version.cpp +++ b/sched/sched_version.cpp @@ -761,12 +761,14 @@ BEST_APP_VERSION* get_app_version( } } + // if app version has plan class, make sure host can handle it + // if (strlen(av.plan_class)) { if (!app_plan(*g_request, av.plan_class, host_usage, &wu)) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, - "[version] [AV#%lu] app_plan() returned false\n", - av.id + "[version] [AV#%lu] app_plan(%s) returned false\n", + av.id, av.plan_class ); } continue; From fc745ae0a03abd3ec2db90b7317aade9c2130d75 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 12 Dec 2024 15:35:58 -0800 Subject: [PATCH 2/9] web: maintain a file 'project/buda_plan_classes' with a list of BUDA variant names (i.e. plan classes). Update as variants are added and deleted. This is used in project preferences for 'Use NVIDIA' type buttons. feeder: the shared-mem segment has a list of resource types for which the project has work. Need to include BUDA variants also. Do this by scanning the 'buda_plan_classes' file (see above) Note: this means that when the set of BUDA variants changes, we need to restart the project plan_class_spec.xml.sample: The 'cuda' class had a max compute capability of 200. Remove it. 
--- html/inc/app_types.inc | 73 ++++++++++++++++++++++++++++++++ html/inc/prefs_project.inc | 3 +- html/inc/util.inc | 40 ----------------- html/user/buda.php | 34 ++++++++++++++- sched/plan_class_spec.xml.sample | 1 - sched/sched_customize.cpp | 6 ++- sched/sched_send.cpp | 9 +++- sched/sched_shmem.cpp | 20 +++++++++ 8 files changed, 139 insertions(+), 47 deletions(-) create mode 100644 html/inc/app_types.inc diff --git a/html/inc/app_types.inc b/html/inc/app_types.inc new file mode 100644 index 00000000000..de599dd0a01 --- /dev/null +++ b/html/inc/app_types.inc @@ -0,0 +1,73 @@ +. + +// code to get list of app types (CPU/GPU) supported by project + +require_once("../inc/boinc_db.inc"); + +// return a structure indicating whether project can use +// various resource types, and a count of apps. +// Include both non-deprecated app versions and BUDA app variants. +// +function get_app_types() { + $t = new StdClass; + $t->cpu = false; + $t->cuda = false; + $t->ati = false; + $t->intel_gpu = false; + $t->apple_gpu = false; + $t->count = 0; + + $avs = BoincAppVersion::enum("deprecated=0"); + foreach ($avs as $av) { + do_plan_class($av->plan_class, $t); + } + + $pcs = file('../../buda_plan_classes'); + foreach ($pcs as $pc) { + do_plan_class($pc, $t); + } + return $t; +} + +function do_plan_class($plan_class, &$t) { + if (strstr($plan_class, "ati")) { + $t->ati = true; + $t->count++; + } else if (strstr($plan_class, "amd")) { + $t->ati = true; + $t->count++; + } else if (strstr($plan_class, "cuda")) { + $t->cuda = true; + $t->count++; + } else if (strstr($plan_class, "nvidia")) { + $t->cuda = true; + $t->count++; + } else if (strstr($plan_class, "intel_gpu")) { + $t->intel_gpu = true; + $t->count++; + } else if (strstr($plan_class, "apple_gpu")) { + $t->apple_gpu = true; + $t->count++; + } else { + $t->cpu = true; + $t->count++; + } +} + +?> diff --git a/html/inc/prefs_project.inc b/html/inc/prefs_project.inc index e0009735d14..c9bb4d68e12 100644 --- 
a/html/inc/prefs_project.inc +++ b/html/inc/prefs_project.inc @@ -44,6 +44,7 @@ // (send_email and show_hosts) that are treated as project preferences include_once("../inc/prefs_util.inc"); +include_once("../inc/app_types.inc"); include_once("../project/project_specific_prefs.inc"); global $app_types; @@ -70,7 +71,7 @@ if (!empty($accelerate_gpu_apps_pref)) { if ($app_types->cpu) { $project_pref_descs[] = new PREF_BOOL ( - tra("Use CPU"), + tra("Allow CPU-only tasks"), "Request CPU-only tasks from this project.", "no_cpu", false, diff --git a/html/inc/util.inc b/html/inc/util.inc index c0be13d822e..9e27168b6bc 100644 --- a/html/inc/util.inc +++ b/html/inc/util.inc @@ -1018,46 +1018,6 @@ function db_init($try_replica=false) { return 0; } -// return a structure indicating whether project has non-deprecated -// apps versions for various resource types, -// and with a count of app versions -// -function get_app_types() { - $t = new StdClass; - $t->cpu = false; - $t->cuda = false; - $t->ati = false; - $t->intel_gpu = false; - $t->apple_gpu = false; - $t->count = 0; - $avs = BoincAppVersion::enum("deprecated=0"); - foreach ($avs as $av) { - if (strstr($av->plan_class, "ati")) { - $t->ati = true; - $t->count++; - } else if (strstr($av->plan_class, "amd")) { - $t->ati = true; - $t->count++; - } else if (strstr($av->plan_class, "cuda")) { - $t->cuda = true; - $t->count++; - } else if (strstr($av->plan_class, "nvidia")) { - $t->cuda = true; - $t->count++; - } else if (strstr($av->plan_class, "intel_gpu")) { - $t->intel_gpu = true; - $t->count++; - } else if (strstr($av->plan_class, "apple_gpu")) { - $t->apple_gpu = true; - $t->count++; - } else { - $t->cpu = true; - $t->count++; - } - } - return $t; -} - // Functions to sanitize GET and POST args // "next_url" arguments (must be local, not full URLs) diff --git a/html/user/buda.php b/html/user/buda.php index e5cff7e8df0..0af8322c16d 100644 --- a/html/user/buda.php +++ b/html/user/buda.php @@ -29,6 +29,32 @@ $buda_root = 
"../../buda_apps"; +// scan BUDA apps and variants, and write a file 'buda_plan_classes' +// in the project dir with list of plan classes +// +function write_plan_class_file() { + $pcs = []; + global $buda_root; + if (is_dir($buda_root)) { + $apps = scandir($buda_root); + foreach ($apps as $app) { + if ($app[0] == '.') continue; + if (!is_dir("$buda_root/$app")) continue; + $vars = scandir("$buda_root/$app"); + foreach ($vars as $var) { + if ($var[0] == '.') continue; + if (!is_dir("$buda_root/$app/$var")) continue; + $pcs[] = $var; + } + } + } + $pcs = array_unique($pcs); + file_put_contents( + "../../buda_plan_classes", + implode("\n", $pcs)."\n" + ); +} + // show list of BUDA apps and variants, // w/ buttons for adding and deleting // @@ -394,9 +420,13 @@ function view_file() { case 'variant_form': variant_form($user); break; case 'variant_action': - variant_action($user); break; + variant_action($user); + write_plan_class_file(); + break; case 'variant_delete': - variant_delete(); break; + variant_delete(); + write_plan_class_file(); + break; case 'view_file': view_file(); break; case null: diff --git a/sched/plan_class_spec.xml.sample b/sched/plan_class_spec.xml.sample index 345b2a6b942..13cc6b8850e 100644 --- a/sched/plan_class_spec.xml.sample +++ b/sched/plan_class_spec.xml.sample @@ -29,7 +29,6 @@ nvidia 100 - 200 3000 17700 254 diff --git a/sched/sched_customize.cpp b/sched/sched_customize.cpp index db960d05954..4a315f3a1b5 100644 --- a/sched/sched_customize.cpp +++ b/sched/sched_customize.cpp @@ -981,7 +981,11 @@ bool app_plan( if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, - "[version] Checking plan class '%s'\n", plan_class + "[version] Checking plan class '%s' check %d have %d bad %d\n", + plan_class, + check_plan_class_spec, + have_plan_class_spec, + bad_plan_class_spec ); } diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp index 4f360335bad..3885d4f28ab 100644 --- a/sched/sched_send.cpp +++ b/sched/sched_send.cpp @@ -896,14 
+896,19 @@ static bool wu_has_plan_class(WORKUNIT &wu, char* buf) { return true; } -// if workunit has a plan class (e.g. BUDA), check it -// in any case, fill in the HOST_USAGE +// If workunit has a plan class (e.g. BUDA), check it. +// In any case, fill in the HOST_USAGE // bool handle_wu_plan_class( WORKUNIT &wu, BEST_APP_VERSION *bavp, HOST_USAGE &hu ) { char plan_class[256]; if (wu_has_plan_class(wu, plan_class)) { + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] plan class: %s\n", plan_class + ); + } if (strlen(plan_class)) { if (!app_plan(*g_request, plan_class, hu, &wu)) { if (config.debug_version_select) { diff --git a/sched/sched_shmem.cpp b/sched/sched_shmem.cpp index 66faaa30c1d..3ea6e0070ad 100644 --- a/sched/sched_shmem.cpp +++ b/sched/sched_shmem.cpp @@ -28,6 +28,7 @@ #include using std::vector; +using std::string; #include "boinc_db.h" #include "error_numbers.h" @@ -107,6 +108,19 @@ static void overflow(const char* table, const char* param_name) { exit(1); } +void get_buda_plan_classes(vector &pcs) { + pcs.clear(); + FILE *f = boinc::fopen("../buda_plan_classes", "r"); + if (!f) return; + char buf[256]; + while (fgets(buf, 256, f)) { + strip_whitespace(buf); + pcs.push_back(buf); + } +} + +// scan various DB tables and populate shared-memory arrays +// int SCHED_SHMEM::scan_tables() { DB_PLATFORM platform; DB_APP app; @@ -243,6 +257,12 @@ int SCHED_SHMEM::scan_tables() { int rt = plan_class_to_proc_type(av.plan_class); have_apps_for_proc_type[rt] = true; } + vector buda_plan_classes; + get_buda_plan_classes(buda_plan_classes); + for (string pc: buda_plan_classes) { + int rt = plan_class_to_proc_type(pc.c_str()); + have_apps_for_proc_type[rt] = true; + } for (i=0; i Date: Fri, 13 Dec 2024 10:08:54 -0800 Subject: [PATCH 3/9] scheduler: if a job is BUDA, we need to return usage info (CPU, GPU) with the workunit rather than the app version. This commit lays the groundwork for this. 
--- db/boinc_db_types.h | 4 ++++ sched/sched_array.cpp | 10 +++++--- sched/sched_assign.cpp | 10 +++++++- sched/sched_locality.cpp | 10 +++++--- sched/sched_nci.cpp | 2 +- sched/sched_resend.cpp | 14 ++++++----- sched/sched_score.cpp | 21 +++++++++++++--- sched/sched_score.h | 4 ++++ sched/sched_send.cpp | 52 ++++++++++++++++++++++------------------ sched/sched_send.h | 7 +++--- sched/sched_types.h | 5 ++++ 11 files changed, 96 insertions(+), 43 deletions(-) diff --git a/db/boinc_db_types.h b/db/boinc_db_types.h index 3586820dd85..c798764a54e 100644 --- a/db/boinc_db_types.h +++ b/db/boinc_db_types.h @@ -15,6 +15,10 @@ // You should have received a copy of the GNU Lesser General Public License // along with BOINC. If not, see . +// structures corresponding to various DB tables. +// In some cases the structures have extra fields, +// used by the server code but not stored in the DB + #ifndef _BOINC_DB_TYPES_ #define _BOINC_DB_TYPES_ diff --git a/sched/sched_array.cpp b/sched/sched_array.cpp index 36bc83fb7fd..86dfaf9034d 100644 --- a/sched/sched_array.cpp +++ b/sched/sched_array.cpp @@ -323,11 +323,15 @@ static bool scan_work_array() { // result.id = wu_result.resultid; if (result_still_sendable(result, wu)) { + bool is_buda, is_ok; HOST_USAGE hu; - if (!handle_wu_plan_class(wu, bavp, hu)) { - continue; + check_buda_plan_class(wu, hu, is_buda, is_ok); + if (is_buda) { + if (!is_ok) continue; + } else { + hu = bavp->host_usage; } - add_result_to_reply(result, wu, bavp, hu, false); + add_result_to_reply(result, wu, bavp, hu, is_buda, false); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. 
diff --git a/sched/sched_assign.cpp b/sched/sched_assign.cpp index 938e6eccfe5..75115578d63 100644 --- a/sched/sched_assign.cpp +++ b/sched/sched_assign.cpp @@ -154,7 +154,15 @@ static int send_assigned_job(ASSIGNMENT& asg) { DB_ID_TYPE result_id = boinc_db.insert_id(); SCHED_DB_RESULT result; retval = result.lookup_id(result_id); - add_result_to_reply(result, wu, bavp, bavp->host_usage, false); + bool is_buda, is_ok; + HOST_USAGE hu; + check_buda_plan_class(wu, hu, is_buda, is_ok); + if (is_buda) { + if (!is_ok) return -1; + } else { + hu = bavp->host_usage; + } + add_result_to_reply(result, wu, bavp, hu, is_buda, false); if (config.debug_assignment) { log_messages.printf(MSG_NORMAL, diff --git a/sched/sched_locality.cpp b/sched/sched_locality.cpp index 3674fe00f3a..5869a8d8a2c 100644 --- a/sched/sched_locality.cpp +++ b/sched/sched_locality.cpp @@ -332,11 +332,15 @@ static int possibly_send_result(SCHED_DB_RESULT& result) { if (count > 0) return ERR_WU_USER_RULE; } + bool is_buda, is_ok; HOST_USAGE hu; - if (!handle_wu_plan_class(wu, bavp, hu)) { - return false; + check_buda_plan_class(wu, hu, is_buda, is_ok); + if (is_buda) { + if (!is_ok) ERR_NO_APP_VERSION; + } else { + hu = bavp->host_usage; } - return add_result_to_reply(result, wu, bavp, hu, true); + return add_result_to_reply(result, wu, bavp, hu, is_buda, false); } // Retrieves and returns a trigger instance identified by the given diff --git a/sched/sched_nci.cpp b/sched/sched_nci.cpp index 3792a4ba4ce..b6f2fe7952d 100644 --- a/sched/sched_nci.cpp +++ b/sched/sched_nci.cpp @@ -94,7 +94,7 @@ static int send_job_for_app(APP& app) { "Sending non-CPU-intensive job: %s\n", wu.name ); } - add_result_to_reply(result, wu, bavp, bavp->host_usage, false); + add_result_to_reply(result, wu, bavp, bavp->host_usage, false, false); return 0; } log_messages.printf(MSG_NORMAL, diff --git a/sched/sched_resend.cpp b/sched/sched_resend.cpp index 776bf5b8f39..d4ed3f12cce 100644 --- a/sched/sched_resend.cpp +++ 
b/sched/sched_resend.cpp @@ -249,13 +249,15 @@ bool resend_lost_work() { ); g_reply->insert_message(warning_msg, "low"); } else { - HOST_USAGE host_usage; - if (!handle_wu_plan_class(wu, bavp, host_usage)) { - continue; + bool is_buda, is_ok; + HOST_USAGE hu; + check_buda_plan_class(wu, hu, is_buda, is_ok); + if (is_buda) { + if (!is_ok) continue; + } else { + hu = bavp->host_usage; } - retval = add_result_to_reply( - result, wu, bavp, host_usage, false - ); + retval = add_result_to_reply(result, wu, bavp, hu, is_buda, false); if (retval) { log_messages.printf(MSG_CRITICAL, "[HOST#%lu] failed to send [RESULT#%lu]\n", diff --git a/sched/sched_score.cpp b/sched/sched_score.cpp index abecf756820..e97d45c55b6 100644 --- a/sched/sched_score.cpp +++ b/sched/sched_score.cpp @@ -50,6 +50,10 @@ static int get_size_class(APP& app, double es) { return app.n_size_classes - 1; } +JOB::JOB() { + memset(this, 0, sizeof(JOB)); +} + // Assign a score to this job, // representing the value of sending the job to this host. // Also do some initial screening, @@ -226,8 +230,15 @@ void send_work_score_type(int rt) { // check WU plan class (for BUDA jobs) // - if (!handle_wu_plan_class(wu, job.bavp, job.host_usage)) { - continue; + bool is_buda, is_ok; + HOST_USAGE hu; + check_buda_plan_class(wu, hu, is_buda, is_ok); + if (is_buda) { + if (!is_ok) continue; + job.host_usage = hu; + job.is_buda = true; + } else { + job.host_usage = job.bavp->host_usage; } job.index = i; @@ -357,7 +368,11 @@ void send_work_score_type(int rt) { SCHED_DB_RESULT result; result.id = wu_result.resultid; if (result_still_sendable(result, wu)) { - add_result_to_reply(result, wu, job.bavp, job.host_usage, false); + add_result_to_reply( + result, wu, job.bavp, job.host_usage, + job.is_buda, + false // locality scheduling + ); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. 
diff --git a/sched/sched_score.h b/sched/sched_score.h index d14e5a26ce3..10d4458a22a 100644 --- a/sched/sched_score.h +++ b/sched/sched_score.h @@ -23,8 +23,12 @@ struct JOB { double score; APP* app; BEST_APP_VERSION* bavp; + bool is_buda; HOST_USAGE host_usage; + // if is_buda, usage returned by WU plan class + // else a copy of bavp->host_usage + JOB(); bool get_score(int); }; diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp index 3885d4f28ab..5fd5f1b066f 100644 --- a/sched/sched_send.cpp +++ b/sched/sched_send.cpp @@ -896,37 +896,42 @@ static bool wu_has_plan_class(WORKUNIT &wu, char* buf) { return true; } -// If workunit has a plan class (e.g. BUDA), check it. -// In any case, fill in the HOST_USAGE +// If workunit has a plan class (e.g. BUDA) +// set is_ok = false if host not capable +// plan class computes host usage +// is_buda = true +// else +// host usage is from app version +// is_buda = false // -bool handle_wu_plan_class( - WORKUNIT &wu, BEST_APP_VERSION *bavp, HOST_USAGE &hu +void check_buda_plan_class( + WORKUNIT &wu, HOST_USAGE &hu, bool &is_buda, bool &is_ok ) { char plan_class[256]; - if (wu_has_plan_class(wu, plan_class)) { + if (!wu_has_plan_class(wu, plan_class)) { + is_buda = false; + return; + } + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] plan class: %s\n", plan_class + ); + } + is_buda = true; + is_ok = true; + if (!strlen(plan_class)) { + hu.sequential_app(g_reply->host.p_fpops); + return; + } + if (!app_plan(*g_request, plan_class, hu, &wu)) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, - "[version] plan class: %s\n", plan_class + "[version] app_plan(%s) returned false\n", plan_class ); } - if (strlen(plan_class)) { - if (!app_plan(*g_request, plan_class, hu, &wu)) { - if (config.debug_version_select) { - log_messages.printf(MSG_NORMAL, - "[version] [AV#%lu] app_plan(%s) returned false\n", - bavp->avp->id, plan_class - ); - } - // can't send this job - return false; - } - }
else { - hu.sequential_app(g_reply->host.p_fpops); - } - } else { - hu = bavp->host_usage; + // can't send this job + is_ok = false; } - return true; } int add_result_to_reply( @@ -934,6 +939,7 @@ int add_result_to_reply( WORKUNIT& wu, BEST_APP_VERSION* bavp, HOST_USAGE &host_usage, + bool is_buda, bool locality_scheduling ) { int retval; diff --git a/sched/sched_send.h b/sched/sched_send.h index 49a135fb19a..77e18449a6e 100644 --- a/sched/sched_send.h +++ b/sched/sched_send.h @@ -34,13 +34,14 @@ extern void send_work(); extern int add_result_to_reply( SCHED_DB_RESULT& result, WORKUNIT& wu, BEST_APP_VERSION* bavp, HOST_USAGE&, + bool is_buda, bool locality_scheduling ); -// if WU has plan class, get corresponding host_usage +// if WU has plan class, check host, and get corresponding host_usage // -extern bool handle_wu_plan_class( - WORKUNIT &wu, BEST_APP_VERSION *bavp, HOST_USAGE &host_usage +extern void check_buda_plan_class( + WORKUNIT &wu, HOST_USAGE &host_usage, bool &is_buda, bool &is_ok ); inline bool is_anonymous(PLATFORM* platform) { diff --git a/sched/sched_types.h b/sched/sched_types.h index 71795bb98f2..919aac69bcb 100644 --- a/sched/sched_types.h +++ b/sched/sched_types.h @@ -61,6 +61,11 @@ struct USER_MESSAGE { USER_MESSAGE(const char* m, const char*p); }; +// The resource usage (CPU, GPU, RAM) of a job, +// and estimates of its speed +// Populated by plan-class functions if have plan class, +// else by HOST_USAGE::sequential_app() +// struct HOST_USAGE { int proc_type; double gpu_usage; From 81514455056c92ab5526f1283f0beafd331f11af Mon Sep 17 00:00:00 2001 From: David Anderson Date: Fri, 13 Dec 2024 15:19:41 -0800 Subject: [PATCH 4/9] scheduler: for BUDA GPU jobs, put resource usage info in the <workunit> element.
--- sched/sched_locality.cpp | 2 +- sched/sched_send.cpp | 57 ++++++++++++++++++++++++++++++++++++++-- sched/sched_types.h | 6 ++--- sched/sched_version.cpp | 1 - 4 files changed, 59 insertions(+), 7 deletions(-) diff --git a/sched/sched_locality.cpp b/sched/sched_locality.cpp index 5869a8d8a2c..b8b93bd32d0 100644 --- a/sched/sched_locality.cpp +++ b/sched/sched_locality.cpp @@ -336,7 +336,7 @@ static int possibly_send_result(SCHED_DB_RESULT& result) { HOST_USAGE hu; check_buda_plan_class(wu, hu, is_buda, is_ok); if (is_buda) { - if (!is_ok) ERR_NO_APP_VERSION; + if (!is_ok) return ERR_NO_APP_VERSION; } else { hu = bavp->host_usage; } diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp index 5fd5f1b066f..0ce0f9f88e5 100644 --- a/sched/sched_send.cpp +++ b/sched/sched_send.cpp @@ -570,10 +570,58 @@ static int insert_wu_tags(WORKUNIT& wu, APP& app) { return insert_after(wu.xml_doc, "\n", buf); } +// add host usage info to WU's xml_doc (for BUDA jobs) +// +static int add_usage_to_wu(WORKUNIT &wu, HOST_USAGE &hu) { + char buf[2048], buf2[2048]; + snprintf(buf, sizeof(buf), + " %f\n" + " %f\n", + hu.avg_ncpus, + hu.projected_flops + ); + if (hu.proc_type != PROC_TYPE_CPU) { + snprintf(buf2, sizeof(buf2), + " \n" + " %s\n" + " %f\n" + " \n", + proc_type_name_xml(hu.proc_type), + hu.gpu_usage + ); + strcat(buf, buf2); + } + if (strlen(hu.cmdline)) { + snprintf(buf2, sizeof(buf2), + " %s\n", + hu.cmdline + ); + strcat(buf, buf2); + } + + char *p = wu.xml_doc; + if (strlen(p) + strlen(buf) + 10 > sizeof(wu.xml_doc)) { + log_messages.printf(MSG_CRITICAL, + "add_usage_to_wu(): field too small: %ld %ld %ld\n", + strlen(p), strlen(buf), sizeof(wu.xml_doc) + ); + return -1; + } + p = strstr(p, ""); + if (!p) { + log_messages.printf(MSG_CRITICAL, "add_usage_to_wu(): no end tag\n"); + return -1; + } + strcpy(p, buf); + strcat(p, ""); + return 0; +} + // Add the given workunit, app, and app version to a reply.
// static int add_wu_to_reply( - WORKUNIT& wu, SCHEDULER_REPLY&, APP* app, BEST_APP_VERSION* bavp + WORKUNIT& wu, SCHEDULER_REPLY&, APP* app, BEST_APP_VERSION* bavp, + bool is_buda, HOST_USAGE &hu ) { int retval; WORKUNIT wu2, wu3; @@ -626,6 +674,11 @@ static int add_wu_to_reply( ); return retval; } + + if (is_buda) { + retval = add_usage_to_wu(wu2, hu); + if (retval) return retval; + } wu3 = wu2; if (strlen(config.replace_download_url_by_timezone)) { process_wu_timezone(wu2, wu3); @@ -1015,7 +1068,7 @@ int add_result_to_reply( // done with DB updates. // - retval = add_wu_to_reply(wu, *g_reply, app, bavp); + retval = add_wu_to_reply(wu, *g_reply, app, bavp, is_buda, host_usage); if (retval) return retval; // Adjust available disk space. diff --git a/sched/sched_types.h b/sched/sched_types.h index 919aac69bcb..03522db506b 100644 --- a/sched/sched_types.h +++ b/sched/sched_types.h @@ -63,13 +63,13 @@ struct USER_MESSAGE { // The resource usage (CPU, GPU, RAM) of a job, // and estimates of its speed -// Populated by plan-class functions if have plan class, +// Populated by plan-class functions if there's a plan class, // else by HOST_USAGE::sequential_app() // struct HOST_USAGE { int proc_type; double gpu_usage; - double gpu_ram; + double gpu_ram; // not currently used by client double avg_ncpus; double mem_usage; // mem usage if specified by the plan class @@ -439,7 +439,7 @@ struct WORK_REQ_BASE { req_instances[proc_type] = 0; } - // older clients send send a single number, the requested duration of jobs + // older clients send a single number, the requested duration of jobs // double seconds_to_fill; diff --git a/sched/sched_version.cpp b/sched/sched_version.cpp index 59e0043d8fd..5396f185d40 100644 --- a/sched/sched_version.cpp +++ b/sched/sched_version.cpp @@ -982,4 +982,3 @@ BEST_APP_VERSION* get_app_version( } return bavp; } - From a4f07a169d3091e7cedc4dd562c21f3953b46aec Mon Sep 17 00:00:00 2001 From: David Anderson Date: Sat, 14 Dec 2024 01:05:59 -0800 
Subject: [PATCH 5/9] client: enable it to handle BUDA GPU and multithread apps original: Info about resource usage (GPU usage, #cpus) is stored in APP_VERSION. When we need this info for a RESULT, we look at rp->avp new: For BUDA apps, the info about the actual app (not the docker wrapper) comes with the workunit, not the app version. So create a new structure, RESOURCE_USAGE. APP_VERSION has one, WORKUNIT has one. So does RESULT; when we create the result we copy the struct either from the app version or (for BUDA jobs) the workunit. Then the code can just reference rp->resource_usage. Nice. This enables BUDA/GPU functionality with almost no additional complexity. Add code to parse resource usage items in <workunit>. Note: info about missing GPUs (or GPUs without needed libraries) is also stored in RESOURCE_USAGE. --- client/app.cpp | 4 +- client/app_config.cpp | 12 +-- client/app_control.cpp | 6 +- client/app_start.cpp | 16 ++-- client/client_state.cpp | 12 +-- client/client_types.cpp | 177 +++++++++++++++++++++++++--------------- client/client_types.h | 51 ++++++++---- client/coproc_sched.cpp | 10 +-- client/coproc_sched.h | 14 ++-- client/cpu_sched.cpp | 48 ++++++----- client/cs_scheduler.cpp | 24 +++--- client/cs_statefile.cpp | 12 +-- client/log_flags.cpp | 9 +- client/project.cpp | 6 +- client/result.cpp | 47 ++++++----- client/result.h | 28 ++++--- client/rr_sim.cpp | 41 +++++----- client/work_fetch.cpp | 26 +++--- 18 files changed, 299 insertions(+), 244 deletions(-) diff --git a/client/app.cpp b/client/app.cpp index 284fb7d4e29..16bac18ffa1 100644 --- a/client/app.cpp +++ b/client/app.cpp @@ -285,7 +285,7 @@ int ACTIVE_TASK::init(RESULT* rp) { result = rp; wup = rp->wup; app_version = rp->avp; - max_elapsed_time = rp->wup->rsc_fpops_bound/rp->avp->flops; + max_elapsed_time = rp->wup->rsc_fpops_bound/rp->resource_usage.flops; if (max_elapsed_time < MIN_TIME_BOUND) { msg_printf(wup->project, MSG_INFO, "Elapsed time limit %f < %f; setting to %f", @@ -790,7 +790,7 @@ int
ACTIVE_TASK::write_gui(MIOFILE& fout) { // double fd = fraction_done; if (((fd<=0)||(fd>1)) && elapsed_time > 60) { - double est_time = wup->rsc_fpops_est/app_version->flops; + double est_time = wup->rsc_fpops_est/result->resource_usage.flops; double x = elapsed_time/est_time; fd = 1 - exp(-x); } diff --git a/client/app_config.cpp b/client/app_config.cpp index 0f57af97608..4f843557021 100644 --- a/client/app_config.cpp +++ b/client/app_config.cpp @@ -56,9 +56,9 @@ int APP_CONFIGS::config_app_versions(PROJECT* p, bool show_warnings) { for (unsigned int j=0; japp != app) continue; - if (!avp->gpu_usage.rsc_type) continue; - avp->gpu_usage.usage = ac.gpu_gpu_usage; - avp->avg_ncpus = ac.gpu_cpu_usage; + if (!avp->resource_usage.rsc_type) continue; + avp->resource_usage.coproc_usage = ac.gpu_gpu_usage; + avp->resource_usage.avg_ncpus = ac.gpu_cpu_usage; } } for (i=0; iplan_class, avc.plan_class)) continue; found = true; if (cmdline_len) { - safe_strcpy(avp->cmdline, avc.cmdline); + safe_strcpy(avp->resource_usage.cmdline, avc.cmdline); } if (avc.avg_ncpus) { - avp->avg_ncpus = avc.avg_ncpus; + avp->resource_usage.avg_ncpus = avc.avg_ncpus; } if (avc.ngpus) { - avp->gpu_usage.usage = avc.ngpus; + avp->resource_usage.coproc_usage = avc.ngpus; } } if (!found) { diff --git a/client/app_control.cpp b/client/app_control.cpp index a86c4c597c4..7319a27ea97 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -351,11 +351,11 @@ static void limbo_message(ACTIVE_TASK& at) { // that use the GPU type, in case they're waiting for GPU RAM // static void clear_schedule_backoffs(ACTIVE_TASK* atp) { - int rt = atp->result->avp->rsc_type(); + int rt = atp->result->resource_usage.rsc_type; if (rt == RSC_TYPE_CPU) return; for (unsigned int i=0; iavp->rsc_type() == rt) { + if (rp->resource_usage.rsc_type == rt) { rp->schedule_backoff = 0; } } @@ -895,7 +895,7 @@ bool ACTIVE_TASK_SET::check_rsc_limits_exceeded() { snprintf(buf, sizeof(buf), "exceeded elapsed time limit %.2f 
(%.2fG/%.2fG)", atp->max_elapsed_time, atp->result->wup->rsc_fpops_bound/1e9, - atp->result->avp->flops/1e9 + atp->result->resource_usage.flops/1e9 ); msg_printf(atp->result->project, MSG_INFO, "Aborting task %s: %s", atp->result->name, buf diff --git a/client/app_start.cpp b/client/app_start.cpp index 328862a5f91..832a6719d9c 100644 --- a/client/app_start.cpp +++ b/client/app_start.cpp @@ -233,7 +233,7 @@ void ACTIVE_TASK::init_app_init_data(APP_INIT_DATA& aid) { aid.rsc_memory_bound = wup->rsc_memory_bound; aid.rsc_disk_bound = wup->rsc_disk_bound; aid.computation_deadline = result->computation_deadline(); - int rt = app_version->gpu_usage.rsc_type; + int rt = result->resource_usage.rsc_type; if (rt) { COPROC& cp = coprocs.coprocs[rt]; if (coproc_type_name_to_num(cp.type) >= 0) { @@ -252,14 +252,14 @@ void ACTIVE_TASK::init_app_init_data(APP_INIT_DATA& aid) { } aid.gpu_device_num = cp.device_nums[k]; aid.gpu_opencl_dev_index = cp.opencl_device_indexes[k]; - aid.gpu_usage = app_version->gpu_usage.usage; + aid.gpu_usage = result->resource_usage.coproc_usage; } else { safe_strcpy(aid.gpu_type, ""); aid.gpu_device_num = -1; aid.gpu_opencl_dev_index = -1; aid.gpu_usage = 0; } - aid.ncpus = app_version->avg_ncpus; + aid.ncpus = result->resource_usage.avg_ncpus; aid.vbox_window = cc_config.vbox_window; aid.checkpoint_period = gstate.global_prefs.disk_interval; aid.fraction_done_start = 0; @@ -671,8 +671,8 @@ int ACTIVE_TASK::start() { // - is a wrapper // high_priority = false; - if (app_version->rsc_type()) high_priority = true; - if (app_version->avg_ncpus < 1) high_priority = true; + if (result->resource_usage.rsc_type) high_priority = true; + if (result->resource_usage.avg_ncpus < 1) high_priority = true; if (app_version->is_wrapper) high_priority = true; current_cpu_time = checkpoint_cpu_time; @@ -968,13 +968,13 @@ int ACTIVE_TASK::start() { snprintf(cmdline, sizeof(cmdline), "%s %s", - wup->command_line.c_str(), app_version->cmdline + wup->command_line.c_str(), 
result->resource_usage.cmdline ); if (!app_version->api_version_at_least(7, 5)) { - int rt = app_version->gpu_usage.rsc_type; + int rt = result->resource_usage.rsc_type; if (rt) { - coproc_cmdline(rt, result, app_version->gpu_usage.usage, cmdline, sizeof(cmdline)); + coproc_cmdline(rt, result, result->resource_usage.coproc_usage, cmdline, sizeof(cmdline)); } } diff --git a/client/client_state.cpp b/client/client_state.cpp index f14e48a0704..7204365a906 100644 --- a/client/client_state.cpp +++ b/client/client_state.cpp @@ -713,17 +713,17 @@ int CLIENT_STATE::init() { // for (i=0; iflops) { - if (!avp->avg_ncpus) { - avp->avg_ncpus = 1; + if (!avp->resource_usage.flops) { + if (!avp->resource_usage.avg_ncpus) { + avp->resource_usage.avg_ncpus = 1; } - avp->flops = avp->avg_ncpus * host_info.p_fpops; + avp->resource_usage.flops = avp->resource_usage.avg_ncpus * host_info.p_fpops; // for GPU apps, use conservative estimate: // assume GPU runs at 10X peak CPU speed // - if (avp->gpu_usage.rsc_type) { - avp->flops += avp->gpu_usage.usage * 10 * host_info.p_fpops; + if (avp->resource_usage.rsc_type) { + avp->resource_usage.flops += avp->resource_usage.coproc_usage * 10 * host_info.p_fpops; } } } diff --git a/client/client_types.cpp b/client/client_types.cpp index c636a1a7da5..ff1900c5d00 100644 --- a/client/client_types.cpp +++ b/client/client_types.cpp @@ -782,18 +782,54 @@ int FILE_INFO::gunzip(char* md5_buf) { } #endif // SIM +void RESOURCE_USAGE::clear() { + avg_ncpus = 1; + rsc_type = 0; + coproc_usage = 0; + gpu_ram = 0; + flops = gstate.host_info.p_fpops; + cmdline[0] = 0; + missing_coproc = false; + missing_coproc_name[0] = 0; +} + +void RESOURCE_USAGE::check_gpu(char* plan_class) { + int rt = rsc_type; + if (!rt) return; + if (strstr(plan_class, "opencl")) { + if (!coprocs.coprocs[rt].have_opencl) { + msg_printf(0, MSG_INFO, + "App version needs OpenCL but GPU doesn't support it" + ); + missing_coproc = true; + safe_strcpy(missing_coproc_name, 
coprocs.coprocs[rt].type); + } + } else if (strstr(plan_class, "cuda")) { + if (!coprocs.coprocs[rt].have_cuda) { + msg_printf(0, MSG_INFO, + "App version needs CUDA but GPU doesn't support it" + ); + missing_coproc = true; + safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); + } + } else if (strstr(plan_class, "ati")) { + if (!coprocs.coprocs[rt].have_cal) { + msg_printf(0, MSG_INFO, + "App version needs CAL but GPU doesn't support it" + ); + missing_coproc = true; + safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); + } + } +} + void APP_VERSION::init() { safe_strcpy(app_name, ""); version_num = 0; platform[0] = 0; plan_class[0] = 0; api_version[0] = 0; - avg_ncpus = 1; - gpu_usage.rsc_type = 0; - gpu_usage.usage = 0; - gpu_ram = 0; - flops = gstate.host_info.p_fpops; - cmdline[0] = 0; + resource_usage.clear(); file_prefix[0] = 0; needs_network = false; app = NULL; @@ -803,10 +839,6 @@ void APP_VERSION::init() { graphics_exec_path[0] = 0; graphics_exec_file[0] = 0; max_working_set_size = 0; - missing_coproc = false; - missing_coproc_usage = 0.0; - missing_coproc_name[0] = 0; - dont_throttle = false; is_vm_app = false; is_wrapper = false; index = 0; @@ -818,42 +850,13 @@ void APP_VERSION::init() { int APP_VERSION::parse(XML_PARSER& xp) { FILE_REF file_ref; double dtemp; - int rt; init(); while (!xp.get_tag()) { if (xp.match_tag("/app_version")) { - rt = gpu_usage.rsc_type; - if (rt) { - dont_throttle = true; // don't throttle GPU apps - if (strstr(plan_class, "opencl")) { - if (!coprocs.coprocs[rt].have_opencl) { - msg_printf(0, MSG_INFO, - "App version needs OpenCL but GPU doesn't support it" - ); - missing_coproc = true; - missing_coproc_usage = gpu_usage.usage; - safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); - } - } else if (strstr(plan_class, "cuda")) { - if (!coprocs.coprocs[rt].have_cuda) { - msg_printf(0, MSG_INFO, - "App version needs CUDA but GPU doesn't support it" - ); - missing_coproc = true; - missing_coproc_usage = 
gpu_usage.usage; - safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); - } - } else if (strstr(plan_class, "ati")) { - if (!coprocs.coprocs[rt].have_cal) { - msg_printf(0, MSG_INFO, - "App version needs CAL but GPU doesn't support it" - ); - missing_coproc = true; - missing_coproc_usage = gpu_usage.usage; - safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); - } - } + resource_usage.check_gpu(plan_class); + if (resource_usage.rsc_type || is_wrapper) { + dont_throttle = true; } if (strstr(plan_class, "vbox")) { is_vm_app = true; @@ -879,7 +882,7 @@ int APP_VERSION::parse(XML_PARSER& xp) { if (xp.parse_str("api_version", api_version, sizeof(api_version))) continue; if (xp.parse_str("platform", platform, sizeof(platform))) continue; if (xp.parse_str("plan_class", plan_class, sizeof(plan_class))) continue; - if (xp.parse_double("avg_ncpus", avg_ncpus)) continue; + if (xp.parse_double("avg_ncpus", resource_usage.avg_ncpus)) continue; if (xp.parse_double("max_ncpus", dtemp)) continue; if (xp.parse_double("flops", dtemp)) { if (dtemp <= 0) { @@ -887,29 +890,29 @@ int APP_VERSION::parse(XML_PARSER& xp) { "non-positive FLOPS in app version" ); } else { - flops = dtemp; + resource_usage.flops = dtemp; } continue; } - if (xp.parse_str("cmdline", cmdline, sizeof(cmdline))) continue; + if (xp.parse_str("cmdline", resource_usage.cmdline, sizeof(resource_usage.cmdline))) continue; if (xp.parse_str("file_prefix", file_prefix, sizeof(file_prefix))) continue; - if (xp.parse_double("gpu_ram", gpu_ram)) continue; + if (xp.parse_double("resource_usage.gpu_ram", resource_usage.gpu_ram)) continue; if (xp.match_tag("coproc")) { COPROC_REQ cp; int retval = cp.parse(xp); if (!retval) { - rt = rsc_index(cp.type); + int rt = rsc_index(cp.type); if (rt <= 0) { msg_printf(0, MSG_INFO, "app version refers to missing GPU type %s", cp.type ); - missing_coproc = true; - missing_coproc_usage = cp.count; - safe_strcpy(missing_coproc_name, cp.type); + resource_usage.missing_coproc = 
true; + resource_usage.coproc_usage = cp.count; + safe_strcpy(resource_usage.missing_coproc_name, cp.type); continue; } - gpu_usage.rsc_type = rt; - gpu_usage.usage = cp.count; + resource_usage.rsc_type = rt; + resource_usage.coproc_usage = cp.count; } else { msg_printf(0, MSG_INTERNAL_ERROR, "Error parsing "); } @@ -943,8 +946,8 @@ int APP_VERSION::write(MIOFILE& out, bool write_file_info) { app_name, version_num, platform, - avg_ncpus, - flops + resource_usage.avg_ncpus, + resource_usage.flops ); if (strlen(plan_class)) { out.printf(" %s\n", plan_class); @@ -952,8 +955,8 @@ int APP_VERSION::write(MIOFILE& out, bool write_file_info) { if (strlen(api_version)) { out.printf(" %s\n", api_version); } - if (strlen(cmdline)) { - out.printf(" %s\n", cmdline); + if (strlen(resource_usage.cmdline)) { + out.printf(" %s\n", resource_usage.cmdline); } if (strlen(file_prefix)) { out.printf(" %s\n", file_prefix); @@ -964,30 +967,30 @@ int APP_VERSION::write(MIOFILE& out, bool write_file_info) { if (retval) return retval; } } - if (gpu_usage.rsc_type) { + if (resource_usage.rsc_type) { out.printf( " \n" " %s\n" " %f\n" " \n", - rsc_name(gpu_usage.rsc_type), - gpu_usage.usage + rsc_name(resource_usage.rsc_type), + resource_usage.coproc_usage ); } - if (missing_coproc && strlen(missing_coproc_name)) { + if (resource_usage.missing_coproc && strlen(resource_usage.missing_coproc_name)) { out.printf( " \n" " %s\n" " %f\n" " \n", - missing_coproc_name, - missing_coproc_usage + resource_usage.missing_coproc_name, + resource_usage.coproc_usage ); } - if (gpu_ram) { + if (resource_usage.gpu_ram) { out.printf( " %f\n", - gpu_ram + resource_usage.gpu_ram ); } if (dont_throttle) { @@ -1150,9 +1153,9 @@ int WORKUNIT::parse(XML_PARSER& xp) { safe_strcpy(app_name, ""); version_num = 0; command_line.clear(); - //strcpy(env_vars, ""); app = NULL; project = NULL; + has_resource_usage = false; // Default these to very large values (1 week on a 1 cobblestone machine) // so we don't keep asking the 
server for more work rsc_fpops_est = 1e9*SECONDS_PER_DAY*7; @@ -1160,7 +1163,15 @@ int WORKUNIT::parse(XML_PARSER& xp) { rsc_memory_bound = 1e8; rsc_disk_bound = 1e9; while (!xp.get_tag()) { - if (xp.match_tag("/workunit")) return 0; + if (xp.match_tag("/workunit")) { + has_resource_usage = resource_usage.avg_ncpus>0 + || resource_usage.rsc_type!=0 + || resource_usage.missing_coproc; + if (has_resource_usage) { + resource_usage.check_gpu(plan_class); + } + return 0; + } if (xp.parse_str("name", name, sizeof(name))) continue; if (xp.parse_str("app_name", app_name, sizeof(app_name))) continue; if (xp.parse_int("version_num", version_num)) continue; @@ -1187,6 +1198,40 @@ int WORKUNIT::parse(XML_PARSER& xp) { #endif continue; } + if (xp.parse_str("plan_class", plan_class, sizeof(plan_class))) continue; + if (xp.parse_double("avg_ncpus", resource_usage.avg_ncpus)) continue; + if (xp.parse_double("flops", dtemp)) { + if (dtemp <= 0) { + msg_printf(0, MSG_INTERNAL_ERROR, "non-positive FLOPS in WU"); + } else { + resource_usage.flops = dtemp; + } + continue; + } + if (xp.parse_str("cmdline", resource_usage.cmdline, sizeof(resource_usage.cmdline))) continue; + if (xp.parse_double("resource_usage.gpu_ram", resource_usage.gpu_ram)) continue; + if (xp.match_tag("coproc")) { + COPROC_REQ cp; + retval = cp.parse(xp); + if (!retval) { + int rt = rsc_index(cp.type); + if (rt <= 0) { + msg_printf(0, MSG_INFO, + "WU refers to missing GPU type %s", cp.type + ); + resource_usage.missing_coproc = true; + resource_usage.coproc_usage = cp.count; + safe_strcpy(resource_usage.missing_coproc_name, cp.type); + continue; + } + resource_usage.rsc_type = rt; + resource_usage.coproc_usage = cp.count; + } else { + msg_printf(0, MSG_INTERNAL_ERROR, "Error parsing "); + } + continue; + } + if (xp.parse_str("job_keyword_ids", buf, sizeof(buf))) { job_keyword_ids.parse_str(buf ); continue; diff --git a/client/client_types.h b/client/client_types.h index d804b04d8ed..c10bd31773b 100644 --- 
a/client/client_types.h +++ b/client/client_types.h @@ -311,9 +311,26 @@ struct APP { int write(MIOFILE&); }; -struct GPU_USAGE { +// items returned by a plan class function +// +struct RESOURCE_USAGE { + double avg_ncpus; int rsc_type; // index into COPROCS array - double usage; + double coproc_usage; + double gpu_ram; + double flops; + char cmdline[256]; + // additional cmdline args + + // an app version or WU may refer to a missing GPU + // e.g. the GPU board was plugged in before but was removed. + // We don't discard them, since the board may be plugged in later. + // Instead we flag it as missing, and don't run those jobs + bool missing_coproc; + char missing_coproc_name[256]; + + void clear(); + void check_gpu(char* plan_class); }; // if you add anything, initialize it in init() @@ -324,16 +341,14 @@ struct APP_VERSION { char platform[256]; char plan_class[64]; char api_version[16]; - double avg_ncpus; - GPU_USAGE gpu_usage; // can only use 1 GPU type - double gpu_ram; - double flops; - char cmdline[256]; - // additional cmdline args + RESOURCE_USAGE resource_usage; char file_prefix[256]; // prepend this to input/output file logical names // (e.g. 
"share" for VM apps) bool needs_network; + bool dont_throttle; + // jobs with this app version are exempt from CPU throttling + // Set for coprocessor apps and wrapper apps APP* app; PROJECT* project; @@ -353,12 +368,6 @@ struct APP_VERSION { // to use this much RAM, // so that we don't run a long sequence of jobs, // each of which turns out not to fit in available RAM - bool missing_coproc; - double missing_coproc_usage; - char missing_coproc_name[256]; - bool dont_throttle; - // jobs of this app version are exempt from CPU throttling - // Set for coprocessor apps bool is_vm_app; // currently this set if plan class includes "vbox" (kludge) bool is_wrapper; @@ -381,11 +390,11 @@ struct APP_VERSION { void clear_errors(); bool api_version_at_least(int major, int minor); inline bool uses_coproc(int rt) { - return (gpu_usage.rsc_type == rt); - } - inline int rsc_type() { - return gpu_usage.rsc_type; + return (resource_usage.rsc_type == rt); } + //inline int rsc_type() { + // return resource_usage.rsc_type; + //} inline bool is_opencl() { return (strstr(plan_class, "opencl") != NULL); } @@ -398,6 +407,9 @@ struct WORKUNIT { int version_num; // Deprecated, but need to keep around to let people revert // to versions before multi-platform support + bool has_resource_usage; + char plan_class[256]; + RESOURCE_USAGE resource_usage; std::string command_line; std::vector input_files; PROJECT* project; @@ -413,6 +425,9 @@ struct WORKUNIT { safe_strcpy(name, ""); safe_strcpy(app_name, ""); version_num = 0; + has_resource_usage = false; + plan_class[0] = 0; + resource_usage.clear(); command_line.clear(); input_files.clear(); job_keyword_ids.clear(); diff --git a/client/coproc_sched.cpp b/client/coproc_sched.cpp index 1f1c65356bf..0c72ffb7c95 100644 --- a/client/coproc_sched.cpp +++ b/client/coproc_sched.cpp @@ -292,10 +292,9 @@ void assign_coprocs(vector& jobs) { // for (i=0; iavp; - int rt = avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt) { - usage = 
avp->gpu_usage.usage; + usage = rp->resource_usage.coproc_usage; cp = &coprocs.coprocs[rt]; } else { continue; @@ -311,10 +310,9 @@ void assign_coprocs(vector& jobs) { job_iter = jobs.begin(); while (job_iter != jobs.end()) { RESULT* rp = *job_iter; - APP_VERSION* avp = rp->avp; - int rt = avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt) { - usage = avp->gpu_usage.usage; + usage = rp->resource_usage.coproc_usage; cp = &coprocs.coprocs[rt]; } else { ++job_iter; diff --git a/client/coproc_sched.h b/client/coproc_sched.h index d15a7ca54a3..bcdd4291efd 100644 --- a/client/coproc_sched.h +++ b/client/coproc_sched.h @@ -54,14 +54,13 @@ struct SPORADIC_RESOURCES { return false; } RESULT *rp = atp->result; - APP_VERSION *avp = rp->avp; - if (ncpus_used + avp->avg_ncpus > ncpus_max) { + if (ncpus_used + rp->resource_usage.avg_ncpus > ncpus_max) { return false; } - int rt = avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; bool found = false; if (rt) { - double u = avp->gpu_usage.usage; + double u = rp->resource_usage.coproc_usage; COPROC& cp = sr_coprocs.coprocs[rt]; for (int i=0; iapp, cp, i)) continue; @@ -78,12 +77,11 @@ struct SPORADIC_RESOURCES { // reserve resources for the task void reserve(ACTIVE_TASK *atp) { RESULT *rp = atp->result; - APP_VERSION *avp = rp->avp; mem_used += atp->procinfo.working_set_size_smoothed; - ncpus_used+= avp->avg_ncpus; - int rt = avp->gpu_usage.rsc_type; + ncpus_used+= rp->resource_usage.avg_ncpus; + int rt = rp->resource_usage.rsc_type; if (rt) { - double u = avp->gpu_usage.usage; + double u = rp->resource_usage.coproc_usage; COPROC& cp = sr_coprocs.coprocs[rt]; for (int i=0; iapp, cp, i)) continue; diff --git a/client/cpu_sched.cpp b/client/cpu_sched.cpp index 1a93ec72ffb..403eb211b47 100644 --- a/client/cpu_sched.cpp +++ b/client/cpu_sched.cpp @@ -156,9 +156,9 @@ struct PROC_RESOURCES { } else { return false; } - } else if (rp->avp->avg_ncpus > 1) { + } else if (rp->resource_usage.avg_ncpus > 1) { 
if (ncpus_used_mt == 0) return true; - return (ncpus_used_mt + rp->avp->avg_ncpus <= ncpus); + return (ncpus_used_mt + rp->resource_usage.avg_ncpus <= ncpus); } else { return (ncpus_used_st < ncpus); } @@ -167,7 +167,7 @@ struct PROC_RESOURCES { // we've decided to add this to the runnable list; update bookkeeping // void schedule(RESULT* rp, ACTIVE_TASK* atp, bool is_edf) { - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; // see if it's possible this job will be ruled out // when we try to actually run it @@ -195,10 +195,10 @@ struct PROC_RESOURCES { // - we end up running the uncheckpointed job // - this causes all or part of a CPU to be idle // - } else if (rp->avp->avg_ncpus > 1) { - ncpus_used_mt += rp->avp->avg_ncpus; + } else if (rp->resource_usage.avg_ncpus > 1) { + ncpus_used_mt += rp->resource_usage.avg_ncpus; } else { - ncpus_used_st += rp->avp->avg_ncpus; + ncpus_used_st += rp->resource_usage.avg_ncpus; } } if (log_flags.cpu_sched_debug) { @@ -215,10 +215,9 @@ struct PROC_RESOURCES { } bool sufficient_coprocs(RESULT& r) { - APP_VERSION& av = *r.avp; - int rt = av.gpu_usage.rsc_type; + int rt = r.resource_usage.rsc_type; if (!rt) return true; - double x = av.gpu_usage.usage; + double x = r.resource_usage.coproc_usage; COPROC& cp = pr_coprocs.coprocs[rt]; for (int i=0; iavp->gpu_usage.rsc_type) { + if (rp->resource_usage.rsc_type) { rp->coproc_missing = true; } } @@ -291,7 +289,7 @@ bool check_coprocs_usable() { gpus_usable = true; for (i=0; iavp->gpu_usage.rsc_type) { + if (rp->resource_usage.rsc_type) { rp->coproc_missing = false; } } @@ -614,12 +612,12 @@ static void update_rec() { } } -static double peak_flops(APP_VERSION* avp) { +static double peak_flops(RESULT *rp) { double f = gstate.host_info.p_fpops; - double x = f * avp->avg_ncpus; - int rt = avp->gpu_usage.rsc_type; + double x = f * rp->resource_usage.avg_ncpus; + int rt = rp->resource_usage.rsc_type; if (rt) { - x += f * avp->gpu_usage.usage * 
rsc_work_fetch[rt].relative_speed; + x += f * rp->resource_usage.coproc_usage * rsc_work_fetch[rt].relative_speed; } return x; } @@ -698,7 +696,7 @@ void PROJECT::compute_sched_priority() { // void adjust_rec_sched(RESULT* rp) { PROJECT* p = rp->project; - p->pwf.rec_temp += peak_flops(rp->avp)/total_peak_flops() * rec_sum/24; + p->pwf.rec_temp += peak_flops(rp)/total_peak_flops() * rec_sum/24; p->compute_sched_priority(); } @@ -803,7 +801,7 @@ static void promote_once_ran_edf() { if (atp->once_ran_edf) { RESULT* rp = atp->result; PROJECT* p = rp->project; - if (p->deadlines_missed(rp->avp->rsc_type())) { + if (p->deadlines_missed(rp->resource_usage.rsc_type)) { if (log_flags.cpu_sched_debug) { msg_printf(p, MSG_INFO, "[cpu_sched_debug] domino prevention: mark %s as deadline miss", @@ -1085,8 +1083,8 @@ static inline bool more_important(RESULT* r0, RESULT* r1) { // for CPU jobs, favor jobs that use more CPUs // if (!cp0) { - if (r0->avp->avg_ncpus > r1->avp->avg_ncpus) return true; - if (r1->avp->avg_ncpus > r0->avp->avg_ncpus) return false; + if (r0->resource_usage.avg_ncpus > r1->resource_usage.avg_ncpus) return true; + if (r1->resource_usage.avg_ncpus > r0->resource_usage.avg_ncpus) return false; } // favor jobs selected first by schedule_cpus() @@ -1277,7 +1275,7 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { // if (ncpus_used >= n_usable_cpus) { if (rp->uses_coprocs()) { - if (ncpus_used + rp->avp->avg_ncpus > n_usable_cpus+1) { + if (ncpus_used + rp->resource_usage.avg_ncpus > n_usable_cpus+1) { if (log_flags.cpu_sched_debug) { msg_printf(rp->project, MSG_INFO, "[cpu_sched_debug] skipping GPU job %s; CPU committed", @@ -1370,7 +1368,7 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { continue; } - ncpus_used += rp->avp->avg_ncpus; + ncpus_used += rp->resource_usage.avg_ncpus; atp->next_scheduler_state = CPU_SCHED_SCHEDULED; ram_left -= ewss; if (have_max_concurrent) { diff --git a/client/cs_scheduler.cpp b/client/cs_scheduler.cpp index 
b36dfe1d453..808807e7441 100644 --- a/client/cs_scheduler.cpp +++ b/client/cs_scheduler.cpp @@ -305,22 +305,22 @@ int CLIENT_STATE::make_scheduler_request(PROJECT* p) { double x = rp->estimated_runtime_remaining(); if (x == 0) continue; safe_strcpy(buf, ""); - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt) { if (rt == rsc_index(GPU_TYPE_NVIDIA)) { snprintf(buf, sizeof(buf), " %f\n", - rp->avp->gpu_usage.usage + rp->resource_usage.coproc_usage ); } else if (rt == rsc_index(GPU_TYPE_ATI)) { snprintf(buf, sizeof(buf), " %f\n", - rp->avp->gpu_usage.usage + rp->resource_usage.coproc_usage ); } else if (rt == rsc_index(GPU_TYPE_INTEL)) { snprintf(buf, sizeof(buf), " %f\n", - rp->avp->gpu_usage.usage + rp->resource_usage.coproc_usage ); } } @@ -335,7 +335,7 @@ int CLIENT_STATE::make_scheduler_request(PROJECT* p) { rp->name, rp->report_deadline, x, - rp->avp->avg_ncpus, + rp->resource_usage.avg_ncpus, buf ); } @@ -912,10 +912,10 @@ int CLIENT_STATE::handle_scheduler_reply( continue; } } - if (avpp.missing_coproc) { + if (avpp.resource_usage.missing_coproc) { msg_printf(project, MSG_INTERNAL_ERROR, "App version uses non-existent %s GPU", - avpp.missing_coproc_name + avpp.resource_usage.missing_coproc_name ); } APP* app = lookup_app(project, avpp.app_name); @@ -931,10 +931,7 @@ int CLIENT_STATE::handle_scheduler_reply( if (avp) { // update app version attributes in case they changed on server // - avp->avg_ncpus = avpp.avg_ncpus; - avp->flops = avpp.flops; - safe_strcpy(avp->cmdline, avpp.cmdline); - avp->gpu_usage = avpp.gpu_usage; + avp->resource_usage = avpp.resource_usage; strlcpy(avp->api_version, avpp.api_version, sizeof(avp->api_version)); avp->dont_throttle = avpp.dont_throttle; avp->needs_network = avpp.needs_network; @@ -1016,7 +1013,8 @@ int CLIENT_STATE::handle_scheduler_reply( delete rp; continue; } - if (rp->avp->missing_coproc) { + rp->init_resource_usage(); + if (rp->resource_usage.missing_coproc) { msg_printf(project, 
MSG_INTERNAL_ERROR, "Missing coprocessor for task %s; aborting", rp->name ); @@ -1024,7 +1022,7 @@ int CLIENT_STATE::handle_scheduler_reply( } else { rp->set_state(RESULT_NEW, "handle_scheduler_reply"); got_work_for_rsc[0] = true; - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt > 0) { est_rsc_runtime[rt] += rp->estimated_runtime(); got_work_for_rsc[rt] = true; diff --git a/client/cs_statefile.cpp b/client/cs_statefile.cpp index 09a34d17aea..d3e1b53cf30 100644 --- a/client/cs_statefile.cpp +++ b/client/cs_statefile.cpp @@ -292,18 +292,18 @@ int CLIENT_STATE::parse_state_file_aux(const char* fname) { safe_strcpy(avp->platform, get_primary_platform()); } } - if (avp->missing_coproc) { - if (strstr(avp->missing_coproc_name, "Apple ")) { + if (avp->resource_usage.missing_coproc) { + if (strstr(avp->resource_usage.missing_coproc_name, "Apple ")) { msg_printf(project, MSG_INFO, "App version uses deprecated GPU type '%s' - discarding", - avp->missing_coproc_name + avp->resource_usage.missing_coproc_name ); delete avp; continue; } else { msg_printf(project, MSG_INFO, "App version uses missing GPU '%s'", - avp->missing_coproc_name + avp->resource_usage.missing_coproc_name ); } } @@ -394,11 +394,11 @@ int CLIENT_STATE::parse_state_file_aux(const char* fname) { delete rp; continue; } - if (rp->avp->missing_coproc) { + rp->init_resource_usage(); + if (rp->resource_usage.missing_coproc) { msg_printf(project, MSG_INFO, "Missing coprocessor for task %s", rp->name ); - rp->coproc_missing = true; } rp->wup->version_num = rp->version_num; results.push_back(rp); diff --git a/client/log_flags.cpp b/client/log_flags.cpp index 74c11ca5253..a08943857ad 100644 --- a/client/log_flags.cpp +++ b/client/log_flags.cpp @@ -770,8 +770,8 @@ void process_gpu_exclusions() { for (i=0; imissing_coproc) continue; - int rt = avp->gpu_usage.rsc_type; + if (avp->resource_usage.missing_coproc) continue; + int rt = avp->resource_usage.rsc_type; if (!rt) continue; 
COPROC& cp = coprocs.coprocs[rt]; bool found = false; @@ -782,12 +782,11 @@ void process_gpu_exclusions() { } } if (found) continue; - avp->missing_coproc = true; - safe_strcpy(avp->missing_coproc_name, ""); + avp->resource_usage.missing_coproc = true; + safe_strcpy(avp->resource_usage.missing_coproc_name, ""); for (j=0; javp != avp) continue; - rp->coproc_missing = true; msg_printf(avp->project, MSG_INFO, "marking %s as coproc missing", rp->name diff --git a/client/project.cpp b/client/project.cpp index 591da195f47..7116ef2021b 100644 --- a/client/project.cpp +++ b/client/project.cpp @@ -698,7 +698,7 @@ void PROJECT::get_task_durs(double& not_started_dur, double& in_progress_dur) { RESULT* rp = gstate.results[i]; if (rp->project != this) continue; double d = rp->estimated_runtime_remaining(); - d /= gstate.time_stats.availability_frac(rp->avp->gpu_usage.rsc_type); + d /= gstate.time_stats.availability_frac(rp->resource_usage.rsc_type); if (rp->is_not_started()) { not_started_dur += d; } else { @@ -827,7 +827,7 @@ bool PROJECT::runnable(int rsc_type) { RESULT* rp = gstate.results[i]; if (rp->project != this) continue; if (rsc_type != RSC_TYPE_ANY) { - if (rp->avp->gpu_usage.rsc_type != rsc_type) { + if (rp->resource_usage.rsc_type != rsc_type) { continue; } } @@ -981,7 +981,7 @@ void PROJECT::check_no_apps() { for (unsigned int i=0; iproject != this) continue; - no_rsc_apps[avp->gpu_usage.rsc_type] = false; + no_rsc_apps[avp->resource_usage.rsc_type] = false; } } diff --git a/client/result.cpp b/client/result.cpp index 0a780872d4f..eb68f109564 100644 --- a/client/result.cpp +++ b/client/result.cpp @@ -78,7 +78,6 @@ void RESULT::clear() { exit_status = 0; stderr_out.clear(); suspended_via_gui = false; - coproc_missing = false; report_immediately = false; not_started = false; name_md5.clear(); @@ -389,7 +388,7 @@ int RESULT::write_gui(MIOFILE& out, bool check_resources) { if (project->suspended_via_gui) out.printf(" \n"); if (report_immediately) out.printf(" \n"); if 
(edf_scheduled) out.printf(" \n"); - if (coproc_missing) out.printf(" \n"); + if (resource_usage.missing_coproc) out.printf(" \n"); if (schedule_backoff > gstate.now) { out.printf(" \n"); if (strlen(schedule_backoff_reason)) { @@ -405,35 +404,35 @@ int RESULT::write_gui(MIOFILE& out, bool check_resources) { atp->write_gui(out); } if (!strlen(resources) || check_resources) { // update resource string only when zero or when app_config is updated. - if (avp->gpu_usage.rsc_type) { - if (avp->gpu_usage.usage == 1) { + if (resource_usage.rsc_type) { + if (resource_usage.coproc_usage == 1) { snprintf(resources, sizeof(resources), "%.3g %s + 1 %s", - avp->avg_ncpus, - cpu_string(avp->avg_ncpus), - rsc_name_long(avp->gpu_usage.rsc_type) + resource_usage.avg_ncpus, + cpu_string(resource_usage.avg_ncpus), + rsc_name_long(resource_usage.rsc_type) ); } else { snprintf(resources, sizeof(resources), "%.3g %s + %.3g %ss", - avp->avg_ncpus, - cpu_string(avp->avg_ncpus), - avp->gpu_usage.usage, - rsc_name_long(avp->gpu_usage.rsc_type) + resource_usage.avg_ncpus, + cpu_string(resource_usage.avg_ncpus), + resource_usage.coproc_usage, + rsc_name_long(resource_usage.rsc_type) ); } - } else if (avp->missing_coproc) { + } else if (resource_usage.missing_coproc) { snprintf(resources, sizeof(resources), "%.3g %s + %.12s GPU (missing)", - avp->avg_ncpus, - cpu_string(avp->avg_ncpus), - avp->missing_coproc_name + resource_usage.avg_ncpus, + cpu_string(resource_usage.avg_ncpus), + resource_usage.missing_coproc_name ); - } else if (!project->non_cpu_intensive && (avp->avg_ncpus != 1)) { + } else if (!project->non_cpu_intensive && (resource_usage.avg_ncpus != 1)) { snprintf(resources, sizeof(resources), "%.3g %s", - avp->avg_ncpus, - cpu_string(avp->avg_ncpus) + resource_usage.avg_ncpus, + cpu_string(resource_usage.avg_ncpus) ); } else { safe_strcpy(resources, " "); @@ -444,13 +443,13 @@ int RESULT::write_gui(MIOFILE& out, bool check_resources) { char buf[256]; safe_strcpy(buf, ""); if (atp && 
atp->scheduler_state == CPU_SCHED_SCHEDULED) { - if (avp->gpu_usage.rsc_type) { - COPROC& cp = coprocs.coprocs[avp->gpu_usage.rsc_type]; + if (resource_usage.rsc_type) { + COPROC& cp = coprocs.coprocs[resource_usage.rsc_type]; if (cp.count > 1) { // if there are multiple GPUs of this type, // show the user which one(s) are being used // - int n = (int)ceil(avp->gpu_usage.usage); + int n = (int)ceil(resource_usage.coproc_usage); safe_strcpy(buf, n>1?" (devices ":" (device "); for (int i=0; isuspended_via_gui) return false; if (state() != RESULT_FILES_DOWNLOADED) return false; - if (coproc_missing) return false; + if (resource_usage.missing_coproc) return false; if (schedule_backoff > gstate.now) return false; if (avp->needs_network && gstate.file_xfers_suspended) { // check file_xfers_suspended rather than network_suspended; @@ -618,7 +617,7 @@ bool RESULT::nearly_runnable() { default: return false; } - if (coproc_missing) return false; + if (resource_usage.missing_coproc) return false; if (schedule_backoff > gstate.now) return false; return true; } @@ -635,7 +634,7 @@ bool RESULT::downloading() { } double RESULT::estimated_runtime_uncorrected() { - return wup->rsc_fpops_est/avp->flops; + return wup->rsc_fpops_est/resource_usage.flops; } // estimate how long a result will take on this host diff --git a/client/result.h b/client/result.h index 555e735eb34..22b4e5a4a05 100644 --- a/client/result.h +++ b/client/result.h @@ -81,9 +81,6 @@ struct RESULT { // // - X, where X is the app's stderr output bool suspended_via_gui; - bool coproc_missing; - // a coproc needed by this job is missing - // (e.g. because user removed their GPU board). 
bool report_immediately; bool not_started; // temp for CPU sched @@ -92,6 +89,8 @@ struct RESULT { APP* app; WORKUNIT* wup; + RESOURCE_USAGE resource_usage; + // copied from either app version or workunit PROJECT* project; RESULT(){ @@ -124,10 +123,17 @@ struct RESULT { #ifdef SIM return sim_flops_left; #else - return estimated_runtime_remaining()*avp->flops; + return estimated_runtime_remaining()*resource_usage.flops; #endif } + inline void init_resource_usage() { + if (wup->has_resource_usage) { + resource_usage = wup->resource_usage; + } else { + resource_usage = avp->resource_usage; + } + } inline bool computing_done() { if (state() >= RESULT_COMPUTE_ERROR) return true; if (ready_to_report) return true; @@ -144,16 +150,16 @@ struct RESULT { // some input or app file is downloading, and backed off // i.e. it may be a long time before we can run this result inline bool uses_coprocs() { - return (avp->gpu_usage.rsc_type != 0); + return (resource_usage.rsc_type != 0); } inline bool uses_gpu() { - int rt = avp->gpu_usage.rsc_type; + int rt = resource_usage.rsc_type; if (!rt) return false; if (coprocs.coprocs[rt].non_gpu) return false; return true; } inline int resource_type() { - return avp->gpu_usage.rsc_type; + return resource_usage.rsc_type; } inline bool non_cpu_intensive() { if (project->non_cpu_intensive) return true; @@ -172,14 +178,14 @@ struct RESULT { } // make a string describing resource usage inline void rsc_string(char* buf, int len) { - if (avp->gpu_usage.rsc_type) { + if (resource_usage.rsc_type) { snprintf(buf, len, "%.2f CPU + %.2f %s", - avp->avg_ncpus, avp->gpu_usage.usage, - rsc_name_long(avp->gpu_usage.rsc_type) + resource_usage.avg_ncpus, resource_usage.coproc_usage, + rsc_name_long(resource_usage.rsc_type) ); } else { - snprintf(buf, len, "%.2f CPU", avp->avg_ncpus); + snprintf(buf, len, "%.2f CPU", resource_usage.avg_ncpus); } } diff --git a/client/rr_sim.cpp b/client/rr_sim.cpp index c9a4edc9a5f..f9d57090408 100644 --- a/client/rr_sim.cpp 
+++ b/client/rr_sim.cpp @@ -80,20 +80,20 @@ struct RR_SIM { inline void activate(RESULT* rp) { PROJECT* p = rp->project; active_jobs.push_back(rp); - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; // if this is a GPU app and GPU computing is suspended, // don't count its CPU usage. // That way we'll fetch more CPU work if needed. // if (!rt || !gpu_suspend_reason) { - rsc_work_fetch[0].sim_nused += rp->avp->avg_ncpus; - p->rsc_pwf[0].sim_nused += rp->avp->avg_ncpus; + rsc_work_fetch[0].sim_nused += rp->resource_usage.avg_ncpus; + p->rsc_pwf[0].sim_nused += rp->resource_usage.avg_ncpus; } if (rt) { - rsc_work_fetch[rt].sim_nused += rp->avp->gpu_usage.usage; - p->rsc_pwf[rt].sim_nused += rp->avp->gpu_usage.usage; + rsc_work_fetch[rt].sim_nused += rp->resource_usage.coproc_usage; + p->rsc_pwf[rt].sim_nused += rp->resource_usage.coproc_usage; if (rsc_work_fetch[rt].has_exclusions) { set_bits( rp->app->non_excluded_instances[rt], @@ -130,11 +130,11 @@ void set_rrsim_flops(RESULT* rp) { // For coproc jobs, use app version estimate // if (rp->uses_gpu()) { - rp->rrsim_flops = rp->avp->flops * gstate.overall_gpu_frac(); + rp->rrsim_flops = rp->resource_usage.flops * gstate.overall_gpu_frac(); } else if (rp->avp->needs_network) { - rp->rrsim_flops = rp->avp->flops * gstate.overall_cpu_and_network_frac(); + rp->rrsim_flops = rp->resource_usage.flops * gstate.overall_cpu_and_network_frac(); } else { - rp->rrsim_flops = rp->avp->flops * gstate.overall_cpu_frac(); + rp->rrsim_flops = rp->resource_usage.flops * gstate.overall_cpu_frac(); } if (rp->rrsim_flops == 0) { rp->rrsim_flops = 1e6; // just in case @@ -195,11 +195,11 @@ void RR_SIM::init_pending_lists() { PROJECT* p = rp->project; p->pwf.n_runnable_jobs++; - p->rsc_pwf[0].nused_total += rp->avp->avg_ncpus; + p->rsc_pwf[0].nused_total += rp->resource_usage.avg_ncpus; set_rrsim_flops(rp); - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt) { - 
p->rsc_pwf[rt].nused_total += rp->avp->gpu_usage.usage; + p->rsc_pwf[rt].nused_total += rp->resource_usage.coproc_usage; p->rsc_pwf[rt].n_runnable_jobs++; p->rsc_pwf[rt].queue_est += rp->rrsim_flops_left/rp->rrsim_flops; } @@ -407,13 +407,13 @@ static void handle_missed_deadline(RESULT* rpbest, double diff, double ar) { } } else { rpbest->rr_sim_misses_deadline = true; - int rt = rpbest->avp->gpu_usage.rsc_type; + int rt = rpbest->resource_usage.rsc_type; if (rt) { pbest->rsc_pwf[rt].deadlines_missed++; - rsc_work_fetch[rt].deadline_missed_instances += rpbest->avp->gpu_usage.usage; + rsc_work_fetch[rt].deadline_missed_instances += rpbest->resource_usage.coproc_usage; } else { pbest->rsc_pwf[0].deadlines_missed++; - rsc_work_fetch[0].deadline_missed_instances += rpbest->avp->avg_ncpus; + rsc_work_fetch[0].deadline_missed_instances += rpbest->resource_usage.avg_ncpus; } if (log_flags.rr_simulation) { msg_printf(pbest, MSG_INFO, @@ -561,10 +561,10 @@ void RR_SIM::simulate() { // double frac = rpbest->uses_gpu()?gstate.overall_gpu_frac():gstate.overall_cpu_frac(); double dur = rpbest->estimated_runtime_remaining() / frac; - rsc_work_fetch[0].update_busy_time(dur, rpbest->avp->avg_ncpus); - int rt = rpbest->avp->gpu_usage.rsc_type; + rsc_work_fetch[0].update_busy_time(dur, rpbest->resource_usage.avg_ncpus); + int rt = rpbest->resource_usage.rsc_type; if (rt) { - rsc_work_fetch[rt].update_busy_time(dur, rpbest->avp->gpu_usage.usage); + rsc_work_fetch[rt].update_busy_time(dur, rpbest->resource_usage.coproc_usage); } } } @@ -698,20 +698,19 @@ int n_idle_resources() { RESULT* rp = gstate.results[i]; if (!rp->nearly_runnable()) continue; if (rp->some_download_stalled()) continue; - APP_VERSION* avp = rp->avp; if (rsc_work_fetch[0].nidle_now) { - rsc_work_fetch[0].nidle_now -= avp->avg_ncpus; + rsc_work_fetch[0].nidle_now -= rp->resource_usage.avg_ncpus; if (rsc_work_fetch[0].nidle_now <= 0) { nidle_rsc--; rsc_work_fetch[0].nidle_now = 0; } } - int j = 
avp->gpu_usage.rsc_type; + int j = rp->resource_usage.rsc_type; if (!j) { continue; } if (rsc_work_fetch[j].nidle_now) { - rsc_work_fetch[j].nidle_now -= avp->gpu_usage.usage; + rsc_work_fetch[j].nidle_now -= rp->resource_usage.coproc_usage; if (rsc_work_fetch[j].nidle_now <= 0) { nidle_rsc--; rsc_work_fetch[j].nidle_now = 0; diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp index 01b5ea59c51..9c12dcf2743 100644 --- a/client/work_fetch.cpp +++ b/client/work_fetch.cpp @@ -60,7 +60,7 @@ inline bool has_coproc_app(PROJECT* p, int rsc_type) { for (i=0; iproject != p) continue; - if (avp->gpu_usage.rsc_type == rsc_type) return true; + if (avp->resource_usage.rsc_type == rsc_type) return true; } return false; } @@ -82,10 +82,10 @@ void RSC_PROJECT_WORK_FETCH::rr_init(PROJECT *p) { for (i=0; iproject != p) continue; - if (rsc_type && (avp->gpu_usage.rsc_type == rsc_type)) { - if (avp->gpu_usage.usage > x) x = avp->gpu_usage.usage; + if (rsc_type && (avp->resource_usage.rsc_type == rsc_type)) { + if (avp->resource_usage.coproc_usage > x) x = avp->resource_usage.coproc_usage; } else { - if (avp->avg_ncpus > x) x = avp->avg_ncpus; + if (avp->resource_usage.avg_ncpus > x) x = avp->resource_usage.avg_ncpus; } } @@ -442,7 +442,7 @@ void WORK_FETCH::rr_init() { RESULT* rp = gstate.results[i]; if (rp->schedule_backoff) { if (rp->schedule_backoff > gstate.now) { - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; rp->project->rsc_pwf[rt].has_deferred_job = true; } else { rp->schedule_backoff = 0; @@ -947,14 +947,14 @@ PROJECT* WORK_FETCH::choose_project() { // in last dt sec, and add to project totals // void WORK_FETCH::accumulate_inst_sec(ACTIVE_TASK* atp, double dt) { - APP_VERSION* avp = atp->result->avp; - PROJECT* p = atp->result->project; - double x = dt*avp->avg_ncpus; + RESULT *rp = atp->result; + PROJECT* p = rp->project; + double x = dt*rp->resource_usage.avg_ncpus; p->rsc_pwf[0].secs_this_rec_interval += x; 
rsc_work_fetch[0].secs_this_rec_interval += x; - int rt = avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt) { - x = dt*avp->gpu_usage.usage; + x = dt*rp->resource_usage.coproc_usage; p->rsc_pwf[rt].secs_this_rec_interval += x; rsc_work_fetch[rt].secs_this_rec_interval += x; } @@ -1049,7 +1049,7 @@ void WORK_FETCH::handle_reply( } for (unsigned int i=0; iavp->gpu_usage.rsc_type] = true; + got_work[rp->resource_usage.rsc_type] = true; } for (int i=0; iproject != p) continue; - p->rsc_pwf[avp->gpu_usage.rsc_type].anonymous_platform_no_apps = false; + p->rsc_pwf[avp->resource_usage.rsc_type].anonymous_platform_no_apps = false; } } } @@ -1135,7 +1135,7 @@ void WORK_FETCH::init() { // clear backoff for app's resource // void WORK_FETCH::clear_backoffs(APP_VERSION& av) { - av.project->rsc_pwf[av.gpu_usage.rsc_type].clear_backoff(); + av.project->rsc_pwf[av.resource_usage.rsc_type].clear_backoff(); } //////////////////////// From f5fc8b960b146d2a8f126c2f431636668179c77a Mon Sep 17 00:00:00 2001 From: davidpanderson Date: Sat, 14 Dec 2024 11:52:23 -0800 Subject: [PATCH 6/9] win build fixes --- client/app_start.cpp | 6 +++--- client/cpu_sched.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/client/app_start.cpp b/client/app_start.cpp index 832a6719d9c..842391fceab 100644 --- a/client/app_start.cpp +++ b/client/app_start.cpp @@ -767,12 +767,12 @@ int ACTIVE_TASK::start() { snprintf(cmdline, sizeof(cmdline), "%s %s %s", - exec_path, wup->command_line.c_str(), app_version->cmdline + exec_path, wup->command_line.c_str(), result->resource_usage.cmdline ); if (!app_version->api_version_at_least(7, 5)) { - int rt = app_version->gpu_usage.rsc_type; + int rt = result->resource_usage.rsc_type; if (rt) { - coproc_cmdline(rt, result, app_version->gpu_usage.usage, cmdline, sizeof(cmdline)); + coproc_cmdline(rt, result, result->resource_usage.coproc_usage, cmdline, sizeof(cmdline)); } } diff --git a/client/cpu_sched.cpp 
b/client/cpu_sched.cpp index 403eb211b47..e512a4c31e0 100644 --- a/client/cpu_sched.cpp +++ b/client/cpu_sched.cpp @@ -276,7 +276,7 @@ bool check_coprocs_usable() { for (i=0; iresource_usage.rsc_type) { - rp->coproc_missing = true; + rp->resource_usage.missing_coproc = true; } } msg_printf(NULL, MSG_INFO, @@ -290,7 +290,7 @@ bool check_coprocs_usable() { for (i=0; iresource_usage.rsc_type) { - rp->coproc_missing = false; + rp->resource_usage.missing_coproc = false; } } msg_printf(NULL, MSG_INFO, From 1e77aeb9e21fce310affaa7e7c9f64a2cccf22f3 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Sun, 15 Dec 2024 12:22:26 -0800 Subject: [PATCH 7/9] Fix client simulator build --- client/result.cpp | 2 +- client/sim.cpp | 65 ++++++++++++++++++++++++--------------------- client/sim_util.cpp | 2 +- 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/client/result.cpp b/client/result.cpp index eb68f109564..64d25c14080 100644 --- a/client/result.cpp +++ b/client/result.cpp @@ -664,7 +664,7 @@ double RESULT::estimated_runtime_remaining() { if (atp) { #ifdef SIM - return sim_flops_left/avp->flops; + return sim_flops_left/resource_usage.flops; #else return atp->est_dur() - atp->elapsed_time; #endif diff --git a/client/sim.cpp b/client/sim.cpp index 7f8d5a528d6..6258fb73f78 100644 --- a/client/sim.cpp +++ b/client/sim.cpp @@ -133,13 +133,13 @@ void usage(char* prog) { exit(1); } -// peak flops of an app version +// peak flops of a result // -double app_peak_flops(APP_VERSION* avp, double cpu_scale) { - double x = avp->avg_ncpus*cpu_scale; - int rt = avp->gpu_usage.rsc_type; +double app_peak_flops(RESULT* rp, double cpu_scale) { + double x = rp->resource_usage.avg_ncpus*cpu_scale; + int rt = rp->resource_usage.rsc_type; if (rt) { - x += avp->gpu_usage.usage * rsc_work_fetch[rt].relative_speed; + x += rp->resource_usage.coproc_usage * rsc_work_fetch[rt].relative_speed; } x *= gstate.host_info.p_fpops; return x; @@ -184,7 +184,7 @@ APP* choose_app(vector& apps) { bool 
app_version_needs_work(APP_VERSION* avp) { if (avp->dont_use) return false; - int rt = avp->gpu_usage.rsc_type; + int rt = avp->resource_usage.rsc_type; if (rt) { return (rsc_work_fetch[rt].req_secs>0 || rsc_work_fetch[rt].req_instances>0); } @@ -210,7 +210,7 @@ APP_VERSION* choose_app_version(APP* app) { if (!app_version_needs_work(avp)) continue; if (!best_avp) { best_avp = avp; - } else if (avp->flops > best_avp->flops) { + } else if (avp->resource_usage.flops > best_avp->resource_usage.flops) { best_avp = avp; } } @@ -325,18 +325,21 @@ void decrement_request_rsc( } void decrement_request(RESULT* rp) { - APP_VERSION* avp = rp->avp; - double est_runtime = rp->wup->rsc_fpops_est/avp->flops; + double est_runtime = rp->wup->rsc_fpops_est/rp->resource_usage.flops; est_runtime /= (gstate.time_stats.on_frac*gstate.time_stats.active_frac); - decrement_request_rsc(rsc_work_fetch[0], avp->avg_ncpus, est_runtime); - int rt = avp->gpu_usage.rsc_type; + decrement_request_rsc( + rsc_work_fetch[0], rp->resource_usage.avg_ncpus, est_runtime + ); + int rt = rp->resource_usage.rsc_type; if (rt) { - decrement_request_rsc(rsc_work_fetch[rt], avp->gpu_usage.usage, est_runtime); + decrement_request_rsc( + rsc_work_fetch[rt], rp->resource_usage.coproc_usage, est_runtime + ); } } double get_estimated_delay(RESULT* rp) { - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; return rsc_work_fetch[rt].estimated_delay; } @@ -415,7 +418,7 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) { WORKUNIT* wup = new WORKUNIT; make_job(p, wup, rp, wapps); - double et = wup->rsc_fpops_est / rp->avp->flops; + double et = wup->rsc_fpops_est / rp->resource_usage.flops; if (server_uses_workload) { IP_RESULT c(rp->name, rp->report_deadline-now, et); if (check_candidate(c, n_usable_cpus, ip_results)) { @@ -596,10 +599,10 @@ bool ACTIVE_TASK_SET::poll() { RESULT* rp = atp->result; if (rp->uses_gpu()) { if (gpu_active) { - cpu_usage_gpu += rp->avp->avg_ncpus; + cpu_usage_gpu += 
rp->resource_usage.avg_ncpus; } } else { - cpu_usage_cpu += rp->avp->avg_ncpus; + cpu_usage_cpu += rp->resource_usage.avg_ncpus; } } double cpu_usage = cpu_usage_cpu + cpu_usage_gpu; @@ -620,7 +623,7 @@ bool ACTIVE_TASK_SET::poll() { continue; } atp->elapsed_time += diff; - double flops = rp->avp->flops; + double flops = rp->resource_usage.flops; if (!rp->uses_gpu()) { flops *= cpu_scale; } @@ -641,7 +644,7 @@ bool ACTIVE_TASK_SET::poll() { html_msg += buf; action = true; } - double pf = diff * app_peak_flops(rp->avp, cpu_scale); + double pf = diff * app_peak_flops(rp, cpu_scale); rp->project->project_results.flops_used += pf; rp->peak_flop_count += pf; sim_results.flops_used += pf; @@ -852,10 +855,10 @@ void show_resource(int rsc_type) { PROJECT* p = rp->project; double ninst=0; if (rsc_type) { - if (rp->avp->gpu_usage.rsc_type != rsc_type) continue; - ninst = rp->avp->gpu_usage.usage; + if (rp->resource_usage.rsc_type != rsc_type) continue; + ninst = rp->resource_usage.coproc_usage; } else { - ninst = rp->avp->avg_ncpus; + ninst = rp->resource_usage.avg_ncpus; } if (!found) { @@ -1127,8 +1130,8 @@ void simulate() { " %s %s (%s)\n time left %s deadline %s\n", rp->project->project_name, rp->name, - rsc_name_long(rp->avp->gpu_usage.rsc_type), - timediff_format(rp->sim_flops_left/rp->avp->flops).c_str(), + rsc_name_long(rp->resource_usage.rsc_type), + timediff_format(rp->sim_flops_left/rp->resource_usage.flops).c_str(), timediff_format(rp->report_deadline - START_TIME).c_str() ); } @@ -1209,23 +1212,23 @@ void show_app(APP* app) { for (unsigned int i=0; iapp != app) continue; - if (avp->gpu_usage.rsc_type) { + if (avp->resource_usage.rsc_type) { fprintf(summary_file, " app version %d (%s)\n" " %.2f CPUs, %.2f %s GPUs, %.0f GFLOPS\n", avp->version_num, avp->plan_class, - avp->avg_ncpus, - avp->gpu_usage.usage, - rsc_name(avp->gpu_usage.rsc_type), - avp->flops/1e9 + avp->resource_usage.avg_ncpus, + avp->resource_usage.coproc_usage, + 
rsc_name(avp->resource_usage.rsc_type), + avp->resource_usage.flops/1e9 ); } else { fprintf(summary_file, " app version %d (%s)\n" " %.2f CPUs, %.0f GFLOPS\n", avp->version_num, avp->plan_class, - avp->avg_ncpus, - avp->flops/1e9 + avp->resource_usage.avg_ncpus, + avp->resource_usage.flops/1e9 ); } } @@ -1266,7 +1269,7 @@ void get_app_params() { } for (i=0; imissing_coproc) continue; + if (avp->resource_usage.missing_coproc) continue; avp->app->ignore = false; } fprintf(summary_file, "Applications and version\n"); diff --git a/client/sim_util.cpp b/client/sim_util.cpp index 03a693cc399..c589dd52c0d 100644 --- a/client/sim_util.cpp +++ b/client/sim_util.cpp @@ -141,7 +141,7 @@ int ACTIVE_TASK::init(RESULT* rp) { result = rp; wup = rp->wup; app_version = rp->avp; - max_elapsed_time = rp->wup->rsc_fpops_bound/result->avp->flops; + max_elapsed_time = rp->wup->rsc_fpops_bound/result->resource_usage.flops; max_disk_usage = rp->wup->rsc_disk_bound; max_mem_usage = rp->wup->rsc_memory_bound; _task_state = PROCESS_UNINITIALIZED; From 280c838b2992cf6e4e0243b3df89d97e5db880e5 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Sun, 15 Dec 2024 13:22:02 -0800 Subject: [PATCH 8/9] scheduler: fix FCGI build --- sched/sched_shmem.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sched/sched_shmem.cpp b/sched/sched_shmem.cpp index 3ea6e0070ad..d5403f4bd6a 100644 --- a/sched/sched_shmem.cpp +++ b/sched/sched_shmem.cpp @@ -113,7 +113,7 @@ void get_buda_plan_classes(vector &pcs) { FILE *f = boinc::fopen("../buda_plan_classes", "r"); if (!f) return; char buf[256]; - while (fgets(buf, 256, f)) { + while (boinc::fgets(buf, 256, f)) { strip_whitespace(buf); pcs.push_back(buf); } From 8e8ccf8b98ba73fff9001d29787846bde2b8c59a Mon Sep 17 00:00:00 2001 From: David Anderson Date: Sun, 15 Dec 2024 13:23:49 -0800 Subject: [PATCH 9/9] trailing white space --- html/user/buda.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html/user/buda.php 
b/html/user/buda.php index 0af8322c16d..0068293ebdc 100644 --- a/html/user/buda.php +++ b/html/user/buda.php @@ -29,7 +29,7 @@ $buda_root = "../../buda_apps"; -// scan BUDA apps and variants, and write a file 'buda_plan_classes' +// scan BUDA apps and variants, and write a file 'buda_plan_classes' // in the project dir with list of plan classes // function write_plan_class_file() {