diff --git a/client/app.cpp b/client/app.cpp index 284fb7d4e29..16bac18ffa1 100644 --- a/client/app.cpp +++ b/client/app.cpp @@ -285,7 +285,7 @@ int ACTIVE_TASK::init(RESULT* rp) { result = rp; wup = rp->wup; app_version = rp->avp; - max_elapsed_time = rp->wup->rsc_fpops_bound/rp->avp->flops; + max_elapsed_time = rp->wup->rsc_fpops_bound/rp->resource_usage.flops; if (max_elapsed_time < MIN_TIME_BOUND) { msg_printf(wup->project, MSG_INFO, "Elapsed time limit %f < %f; setting to %f", @@ -790,7 +790,7 @@ int ACTIVE_TASK::write_gui(MIOFILE& fout) { // double fd = fraction_done; if (((fd<=0)||(fd>1)) && elapsed_time > 60) { - double est_time = wup->rsc_fpops_est/app_version->flops; + double est_time = wup->rsc_fpops_est/result->resource_usage.flops; double x = elapsed_time/est_time; fd = 1 - exp(-x); } diff --git a/client/app_config.cpp b/client/app_config.cpp index 0f57af97608..4f843557021 100644 --- a/client/app_config.cpp +++ b/client/app_config.cpp @@ -56,9 +56,9 @@ int APP_CONFIGS::config_app_versions(PROJECT* p, bool show_warnings) { for (unsigned int j=0; japp != app) continue; - if (!avp->gpu_usage.rsc_type) continue; - avp->gpu_usage.usage = ac.gpu_gpu_usage; - avp->avg_ncpus = ac.gpu_cpu_usage; + if (!avp->resource_usage.rsc_type) continue; + avp->resource_usage.coproc_usage = ac.gpu_gpu_usage; + avp->resource_usage.avg_ncpus = ac.gpu_cpu_usage; } } for (i=0; iplan_class, avc.plan_class)) continue; found = true; if (cmdline_len) { - safe_strcpy(avp->cmdline, avc.cmdline); + safe_strcpy(avp->resource_usage.cmdline, avc.cmdline); } if (avc.avg_ncpus) { - avp->avg_ncpus = avc.avg_ncpus; + avp->resource_usage.avg_ncpus = avc.avg_ncpus; } if (avc.ngpus) { - avp->gpu_usage.usage = avc.ngpus; + avp->resource_usage.coproc_usage = avc.ngpus; } } if (!found) { diff --git a/client/app_control.cpp b/client/app_control.cpp index a86c4c597c4..7319a27ea97 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -351,11 +351,11 @@ static void 
limbo_message(ACTIVE_TASK& at) { // that use the GPU type, in case they're waiting for GPU RAM // static void clear_schedule_backoffs(ACTIVE_TASK* atp) { - int rt = atp->result->avp->rsc_type(); + int rt = atp->result->resource_usage.rsc_type; if (rt == RSC_TYPE_CPU) return; for (unsigned int i=0; iavp->rsc_type() == rt) { + if (rp->resource_usage.rsc_type == rt) { rp->schedule_backoff = 0; } } @@ -895,7 +895,7 @@ bool ACTIVE_TASK_SET::check_rsc_limits_exceeded() { snprintf(buf, sizeof(buf), "exceeded elapsed time limit %.2f (%.2fG/%.2fG)", atp->max_elapsed_time, atp->result->wup->rsc_fpops_bound/1e9, - atp->result->avp->flops/1e9 + atp->result->resource_usage.flops/1e9 ); msg_printf(atp->result->project, MSG_INFO, "Aborting task %s: %s", atp->result->name, buf diff --git a/client/app_start.cpp b/client/app_start.cpp index 328862a5f91..842391fceab 100644 --- a/client/app_start.cpp +++ b/client/app_start.cpp @@ -233,7 +233,7 @@ void ACTIVE_TASK::init_app_init_data(APP_INIT_DATA& aid) { aid.rsc_memory_bound = wup->rsc_memory_bound; aid.rsc_disk_bound = wup->rsc_disk_bound; aid.computation_deadline = result->computation_deadline(); - int rt = app_version->gpu_usage.rsc_type; + int rt = result->resource_usage.rsc_type; if (rt) { COPROC& cp = coprocs.coprocs[rt]; if (coproc_type_name_to_num(cp.type) >= 0) { @@ -252,14 +252,14 @@ void ACTIVE_TASK::init_app_init_data(APP_INIT_DATA& aid) { } aid.gpu_device_num = cp.device_nums[k]; aid.gpu_opencl_dev_index = cp.opencl_device_indexes[k]; - aid.gpu_usage = app_version->gpu_usage.usage; + aid.gpu_usage = result->resource_usage.coproc_usage; } else { safe_strcpy(aid.gpu_type, ""); aid.gpu_device_num = -1; aid.gpu_opencl_dev_index = -1; aid.gpu_usage = 0; } - aid.ncpus = app_version->avg_ncpus; + aid.ncpus = result->resource_usage.avg_ncpus; aid.vbox_window = cc_config.vbox_window; aid.checkpoint_period = gstate.global_prefs.disk_interval; aid.fraction_done_start = 0; @@ -671,8 +671,8 @@ int ACTIVE_TASK::start() { // - is a 
wrapper // high_priority = false; - if (app_version->rsc_type()) high_priority = true; - if (app_version->avg_ncpus < 1) high_priority = true; + if (result->resource_usage.rsc_type) high_priority = true; + if (result->resource_usage.avg_ncpus < 1) high_priority = true; if (app_version->is_wrapper) high_priority = true; current_cpu_time = checkpoint_cpu_time; @@ -767,12 +767,12 @@ int ACTIVE_TASK::start() { snprintf(cmdline, sizeof(cmdline), "%s %s %s", - exec_path, wup->command_line.c_str(), app_version->cmdline + exec_path, wup->command_line.c_str(), result->resource_usage.cmdline ); if (!app_version->api_version_at_least(7, 5)) { - int rt = app_version->gpu_usage.rsc_type; + int rt = result->resource_usage.rsc_type; if (rt) { - coproc_cmdline(rt, result, app_version->gpu_usage.usage, cmdline, sizeof(cmdline)); + coproc_cmdline(rt, result, result->resource_usage.coproc_usage, cmdline, sizeof(cmdline)); } } @@ -968,13 +968,13 @@ int ACTIVE_TASK::start() { snprintf(cmdline, sizeof(cmdline), "%s %s", - wup->command_line.c_str(), app_version->cmdline + wup->command_line.c_str(), result->resource_usage.cmdline ); if (!app_version->api_version_at_least(7, 5)) { - int rt = app_version->gpu_usage.rsc_type; + int rt = result->resource_usage.rsc_type; if (rt) { - coproc_cmdline(rt, result, app_version->gpu_usage.usage, cmdline, sizeof(cmdline)); + coproc_cmdline(rt, result, result->resource_usage.coproc_usage, cmdline, sizeof(cmdline)); } } diff --git a/client/client_state.cpp b/client/client_state.cpp index f14e48a0704..7204365a906 100644 --- a/client/client_state.cpp +++ b/client/client_state.cpp @@ -713,17 +713,17 @@ int CLIENT_STATE::init() { // for (i=0; iflops) { - if (!avp->avg_ncpus) { - avp->avg_ncpus = 1; + if (!avp->resource_usage.flops) { + if (!avp->resource_usage.avg_ncpus) { + avp->resource_usage.avg_ncpus = 1; } - avp->flops = avp->avg_ncpus * host_info.p_fpops; + avp->resource_usage.flops = avp->resource_usage.avg_ncpus * host_info.p_fpops; // for GPU apps, 
use conservative estimate: // assume GPU runs at 10X peak CPU speed // - if (avp->gpu_usage.rsc_type) { - avp->flops += avp->gpu_usage.usage * 10 * host_info.p_fpops; + if (avp->resource_usage.rsc_type) { + avp->resource_usage.flops += avp->resource_usage.coproc_usage * 10 * host_info.p_fpops; } } } diff --git a/client/client_types.cpp b/client/client_types.cpp index c636a1a7da5..ff1900c5d00 100644 --- a/client/client_types.cpp +++ b/client/client_types.cpp @@ -782,18 +782,54 @@ int FILE_INFO::gunzip(char* md5_buf) { } #endif // SIM +void RESOURCE_USAGE::clear() { + avg_ncpus = 1; + rsc_type = 0; + coproc_usage = 0; + gpu_ram = 0; + flops = gstate.host_info.p_fpops; + cmdline[0] = 0; + missing_coproc = false; + missing_coproc_name[0] = 0; +} + +void RESOURCE_USAGE::check_gpu(char* plan_class) { + int rt = rsc_type; + if (!rt) return; + if (strstr(plan_class, "opencl")) { + if (!coprocs.coprocs[rt].have_opencl) { + msg_printf(0, MSG_INFO, + "App version needs OpenCL but GPU doesn't support it" + ); + missing_coproc = true; + safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); + } + } else if (strstr(plan_class, "cuda")) { + if (!coprocs.coprocs[rt].have_cuda) { + msg_printf(0, MSG_INFO, + "App version needs CUDA but GPU doesn't support it" + ); + missing_coproc = true; + safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); + } + } else if (strstr(plan_class, "ati")) { + if (!coprocs.coprocs[rt].have_cal) { + msg_printf(0, MSG_INFO, + "App version needs CAL but GPU doesn't support it" + ); + missing_coproc = true; + safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); + } + } +} + void APP_VERSION::init() { safe_strcpy(app_name, ""); version_num = 0; platform[0] = 0; plan_class[0] = 0; api_version[0] = 0; - avg_ncpus = 1; - gpu_usage.rsc_type = 0; - gpu_usage.usage = 0; - gpu_ram = 0; - flops = gstate.host_info.p_fpops; - cmdline[0] = 0; + resource_usage.clear(); file_prefix[0] = 0; needs_network = false; app = NULL; @@ -803,10 +839,6 @@ void 
APP_VERSION::init() { graphics_exec_path[0] = 0; graphics_exec_file[0] = 0; max_working_set_size = 0; - missing_coproc = false; - missing_coproc_usage = 0.0; - missing_coproc_name[0] = 0; - dont_throttle = false; is_vm_app = false; is_wrapper = false; index = 0; @@ -818,42 +850,13 @@ void APP_VERSION::init() { int APP_VERSION::parse(XML_PARSER& xp) { FILE_REF file_ref; double dtemp; - int rt; init(); while (!xp.get_tag()) { if (xp.match_tag("/app_version")) { - rt = gpu_usage.rsc_type; - if (rt) { - dont_throttle = true; // don't throttle GPU apps - if (strstr(plan_class, "opencl")) { - if (!coprocs.coprocs[rt].have_opencl) { - msg_printf(0, MSG_INFO, - "App version needs OpenCL but GPU doesn't support it" - ); - missing_coproc = true; - missing_coproc_usage = gpu_usage.usage; - safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); - } - } else if (strstr(plan_class, "cuda")) { - if (!coprocs.coprocs[rt].have_cuda) { - msg_printf(0, MSG_INFO, - "App version needs CUDA but GPU doesn't support it" - ); - missing_coproc = true; - missing_coproc_usage = gpu_usage.usage; - safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); - } - } else if (strstr(plan_class, "ati")) { - if (!coprocs.coprocs[rt].have_cal) { - msg_printf(0, MSG_INFO, - "App version needs CAL but GPU doesn't support it" - ); - missing_coproc = true; - missing_coproc_usage = gpu_usage.usage; - safe_strcpy(missing_coproc_name, coprocs.coprocs[rt].type); - } - } + resource_usage.check_gpu(plan_class); + if (resource_usage.rsc_type || is_wrapper) { + dont_throttle = true; } if (strstr(plan_class, "vbox")) { is_vm_app = true; @@ -879,7 +882,7 @@ int APP_VERSION::parse(XML_PARSER& xp) { if (xp.parse_str("api_version", api_version, sizeof(api_version))) continue; if (xp.parse_str("platform", platform, sizeof(platform))) continue; if (xp.parse_str("plan_class", plan_class, sizeof(plan_class))) continue; - if (xp.parse_double("avg_ncpus", avg_ncpus)) continue; + if (xp.parse_double("avg_ncpus", 
resource_usage.avg_ncpus)) continue; if (xp.parse_double("max_ncpus", dtemp)) continue; if (xp.parse_double("flops", dtemp)) { if (dtemp <= 0) { msg_printf(0, MSG_INTERNAL_ERROR, @@ -887,29 +890,29 @@ int APP_VERSION::parse(XML_PARSER& xp) { "non-positive FLOPS in app version" ); } else { - flops = dtemp; + resource_usage.flops = dtemp; } continue; } - if (xp.parse_str("cmdline", cmdline, sizeof(cmdline))) continue; + if (xp.parse_str("cmdline", resource_usage.cmdline, sizeof(resource_usage.cmdline))) continue; if (xp.parse_str("file_prefix", file_prefix, sizeof(file_prefix))) continue; - if (xp.parse_double("gpu_ram", gpu_ram)) continue; + if (xp.parse_double("gpu_ram", resource_usage.gpu_ram)) continue; if (xp.match_tag("coproc")) { COPROC_REQ cp; int retval = cp.parse(xp); if (!retval) { - rt = rsc_index(cp.type); + int rt = rsc_index(cp.type); if (rt <= 0) { msg_printf(0, MSG_INFO, "app version refers to missing GPU type %s", cp.type ); - missing_coproc = true; - missing_coproc_usage = cp.count; - safe_strcpy(missing_coproc_name, cp.type); + resource_usage.missing_coproc = true; + resource_usage.coproc_usage = cp.count; + safe_strcpy(resource_usage.missing_coproc_name, cp.type); continue; } - gpu_usage.rsc_type = rt; - gpu_usage.usage = cp.count; + resource_usage.rsc_type = rt; + resource_usage.coproc_usage = cp.count; } else { msg_printf(0, MSG_INTERNAL_ERROR, "Error parsing "); } @@ -943,8 +946,8 @@ int APP_VERSION::write(MIOFILE& out, bool write_file_info) { app_name, version_num, platform, - avg_ncpus, - flops + resource_usage.avg_ncpus, + resource_usage.flops ); if (strlen(plan_class)) { out.printf(" %s\n", plan_class); @@ -952,8 +955,8 @@ int APP_VERSION::write(MIOFILE& out, bool write_file_info) { if (strlen(api_version)) { out.printf(" %s\n", api_version); } - if (strlen(cmdline)) { - out.printf(" %s\n", cmdline); + if (strlen(resource_usage.cmdline)) { + out.printf(" %s\n", resource_usage.cmdline); } if (strlen(file_prefix)) { out.printf(" %s\n", file_prefix); } @@ -964,30 
+967,30 @@ int APP_VERSION::write(MIOFILE& out, bool write_file_info) { if (retval) return retval; } } - if (gpu_usage.rsc_type) { + if (resource_usage.rsc_type) { out.printf( " \n" " %s\n" " %f\n" " \n", - rsc_name(gpu_usage.rsc_type), - gpu_usage.usage + rsc_name(resource_usage.rsc_type), + resource_usage.coproc_usage ); } - if (missing_coproc && strlen(missing_coproc_name)) { + if (resource_usage.missing_coproc && strlen(resource_usage.missing_coproc_name)) { out.printf( " \n" " %s\n" " %f\n" " \n", - missing_coproc_name, - missing_coproc_usage + resource_usage.missing_coproc_name, + resource_usage.coproc_usage ); } - if (gpu_ram) { + if (resource_usage.gpu_ram) { out.printf( " %f\n", - gpu_ram + resource_usage.gpu_ram ); } if (dont_throttle) { @@ -1150,9 +1153,9 @@ int WORKUNIT::parse(XML_PARSER& xp) { safe_strcpy(app_name, ""); version_num = 0; command_line.clear(); - //strcpy(env_vars, ""); app = NULL; project = NULL; + has_resource_usage = false; // Default these to very large values (1 week on a 1 cobblestone machine) // so we don't keep asking the server for more work rsc_fpops_est = 1e9*SECONDS_PER_DAY*7; @@ -1160,7 +1163,15 @@ rsc_memory_bound = 1e8; rsc_disk_bound = 1e9; while (!xp.get_tag()) { - if (xp.match_tag("/workunit")) return 0; + if (xp.match_tag("/workunit")) { + has_resource_usage = has_resource_usage + || resource_usage.rsc_type!=0 + || resource_usage.missing_coproc; + if (has_resource_usage) { + resource_usage.check_gpu(plan_class); + } + return 0; + } if (xp.parse_str("name", name, sizeof(name))) continue; if (xp.parse_str("app_name", app_name, sizeof(app_name))) continue; if (xp.parse_int("version_num", version_num)) continue; @@ -1187,6 +1198,40 @@ int WORKUNIT::parse(XML_PARSER& xp) { #endif continue; } + if (xp.parse_str("plan_class", plan_class, sizeof(plan_class))) continue; + if (xp.parse_double("avg_ncpus", resource_usage.avg_ncpus)) { has_resource_usage = true; continue; } + if (xp.parse_double("flops", dtemp)) { + 
if (dtemp <= 0) { + msg_printf(0, MSG_INTERNAL_ERROR, "non-positive FLOPS in WU"); + } else { + resource_usage.flops = dtemp; + } + continue; + } + if (xp.parse_str("cmdline", resource_usage.cmdline, sizeof(resource_usage.cmdline))) continue; + if (xp.parse_double("gpu_ram", resource_usage.gpu_ram)) continue; + if (xp.match_tag("coproc")) { + COPROC_REQ cp; + retval = cp.parse(xp); + if (!retval) { + int rt = rsc_index(cp.type); + if (rt <= 0) { + msg_printf(0, MSG_INFO, + "WU refers to missing GPU type %s", cp.type + ); + resource_usage.missing_coproc = true; + resource_usage.coproc_usage = cp.count; + safe_strcpy(resource_usage.missing_coproc_name, cp.type); + continue; + } + resource_usage.rsc_type = rt; + resource_usage.coproc_usage = cp.count; + } else { + msg_printf(0, MSG_INTERNAL_ERROR, "Error parsing "); + } + continue; + } + if (xp.parse_str("job_keyword_ids", buf, sizeof(buf))) { job_keyword_ids.parse_str(buf ); continue; diff --git a/client/client_types.h b/client/client_types.h index d804b04d8ed..c10bd31773b 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -311,9 +311,26 @@ struct APP { int write(MIOFILE&); }; -struct GPU_USAGE { +// items returned by a plan class function +// +struct RESOURCE_USAGE { + double avg_ncpus; int rsc_type; // index into COPROCS array - double usage; + double coproc_usage; + double gpu_ram; + double flops; + char cmdline[256]; + // additional cmdline args + + // an app version or WU may refer to a missing GPU + // e.g. the GPU board was plugged in before but was removed. + // We don't discard them, since the board may be plugged in later. 
+ // Instead we flag it as missing, and don't run those jobs + bool missing_coproc; + char missing_coproc_name[256]; + + void clear(); + void check_gpu(char* plan_class); }; // if you add anything, initialize it in init() @@ -324,16 +341,14 @@ struct APP_VERSION { char platform[256]; char plan_class[64]; char api_version[16]; - double avg_ncpus; - GPU_USAGE gpu_usage; // can only use 1 GPU type - double gpu_ram; - double flops; - char cmdline[256]; - // additional cmdline args + RESOURCE_USAGE resource_usage; char file_prefix[256]; // prepend this to input/output file logical names // (e.g. "share" for VM apps) bool needs_network; + bool dont_throttle; + // jobs with this app version are exempt from CPU throttling + // Set for coprocessor apps and wrapper apps APP* app; PROJECT* project; @@ -353,12 +368,6 @@ struct APP_VERSION { // to use this much RAM, // so that we don't run a long sequence of jobs, // each of which turns out not to fit in available RAM - bool missing_coproc; - double missing_coproc_usage; - char missing_coproc_name[256]; - bool dont_throttle; - // jobs of this app version are exempt from CPU throttling - // Set for coprocessor apps bool is_vm_app; // currently this set if plan class includes "vbox" (kludge) bool is_wrapper; @@ -381,11 +390,11 @@ struct APP_VERSION { void clear_errors(); bool api_version_at_least(int major, int minor); inline bool uses_coproc(int rt) { - return (gpu_usage.rsc_type == rt); - } - inline int rsc_type() { - return gpu_usage.rsc_type; + return (resource_usage.rsc_type == rt); } + //inline int rsc_type() { + // return resource_usage.rsc_type; + //} inline bool is_opencl() { return (strstr(plan_class, "opencl") != NULL); } @@ -398,6 +407,9 @@ struct WORKUNIT { int version_num; // Deprecated, but need to keep around to let people revert // to versions before multi-platform support + bool has_resource_usage; + char plan_class[256]; + RESOURCE_USAGE resource_usage; std::string command_line; std::vector input_files; 
PROJECT* project; @@ -413,6 +425,9 @@ struct WORKUNIT { safe_strcpy(name, ""); safe_strcpy(app_name, ""); version_num = 0; + has_resource_usage = false; + plan_class[0] = 0; + resource_usage.clear(); command_line.clear(); input_files.clear(); job_keyword_ids.clear(); diff --git a/client/coproc_sched.cpp b/client/coproc_sched.cpp index 1f1c65356bf..0c72ffb7c95 100644 --- a/client/coproc_sched.cpp +++ b/client/coproc_sched.cpp @@ -292,10 +292,9 @@ void assign_coprocs(vector& jobs) { // for (i=0; iavp; - int rt = avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt) { - usage = avp->gpu_usage.usage; + usage = rp->resource_usage.coproc_usage; cp = &coprocs.coprocs[rt]; } else { continue; @@ -311,10 +310,9 @@ void assign_coprocs(vector& jobs) { job_iter = jobs.begin(); while (job_iter != jobs.end()) { RESULT* rp = *job_iter; - APP_VERSION* avp = rp->avp; - int rt = avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt) { - usage = avp->gpu_usage.usage; + usage = rp->resource_usage.coproc_usage; cp = &coprocs.coprocs[rt]; } else { ++job_iter; diff --git a/client/coproc_sched.h b/client/coproc_sched.h index d15a7ca54a3..bcdd4291efd 100644 --- a/client/coproc_sched.h +++ b/client/coproc_sched.h @@ -54,14 +54,13 @@ struct SPORADIC_RESOURCES { return false; } RESULT *rp = atp->result; - APP_VERSION *avp = rp->avp; - if (ncpus_used + avp->avg_ncpus > ncpus_max) { + if (ncpus_used + rp->resource_usage.avg_ncpus > ncpus_max) { return false; } - int rt = avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; bool found = false; if (rt) { - double u = avp->gpu_usage.usage; + double u = rp->resource_usage.coproc_usage; COPROC& cp = sr_coprocs.coprocs[rt]; for (int i=0; iapp, cp, i)) continue; @@ -78,12 +77,11 @@ struct SPORADIC_RESOURCES { // reserve resources for the task void reserve(ACTIVE_TASK *atp) { RESULT *rp = atp->result; - APP_VERSION *avp = rp->avp; mem_used += atp->procinfo.working_set_size_smoothed; - ncpus_used+= 
avp->avg_ncpus; - int rt = avp->gpu_usage.rsc_type; + ncpus_used+= rp->resource_usage.avg_ncpus; + int rt = rp->resource_usage.rsc_type; if (rt) { - double u = avp->gpu_usage.usage; + double u = rp->resource_usage.coproc_usage; COPROC& cp = sr_coprocs.coprocs[rt]; for (int i=0; iapp, cp, i)) continue; diff --git a/client/cpu_sched.cpp b/client/cpu_sched.cpp index 1a93ec72ffb..e512a4c31e0 100644 --- a/client/cpu_sched.cpp +++ b/client/cpu_sched.cpp @@ -156,9 +156,9 @@ struct PROC_RESOURCES { } else { return false; } - } else if (rp->avp->avg_ncpus > 1) { + } else if (rp->resource_usage.avg_ncpus > 1) { if (ncpus_used_mt == 0) return true; - return (ncpus_used_mt + rp->avp->avg_ncpus <= ncpus); + return (ncpus_used_mt + rp->resource_usage.avg_ncpus <= ncpus); } else { return (ncpus_used_st < ncpus); } @@ -167,7 +167,7 @@ struct PROC_RESOURCES { // we've decided to add this to the runnable list; update bookkeeping // void schedule(RESULT* rp, ACTIVE_TASK* atp, bool is_edf) { - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; // see if it's possible this job will be ruled out // when we try to actually run it @@ -195,10 +195,10 @@ struct PROC_RESOURCES { // - we end up running the uncheckpointed job // - this causes all or part of a CPU to be idle // - } else if (rp->avp->avg_ncpus > 1) { - ncpus_used_mt += rp->avp->avg_ncpus; + } else if (rp->resource_usage.avg_ncpus > 1) { + ncpus_used_mt += rp->resource_usage.avg_ncpus; } else { - ncpus_used_st += rp->avp->avg_ncpus; + ncpus_used_st += rp->resource_usage.avg_ncpus; } } if (log_flags.cpu_sched_debug) { @@ -215,10 +215,9 @@ struct PROC_RESOURCES { } bool sufficient_coprocs(RESULT& r) { - APP_VERSION& av = *r.avp; - int rt = av.gpu_usage.rsc_type; + int rt = r.resource_usage.rsc_type; if (!rt) return true; - double x = av.gpu_usage.usage; + double x = r.resource_usage.coproc_usage; COPROC& cp = pr_coprocs.coprocs[rt]; for (int i=0; iavp->gpu_usage.rsc_type) { - rp->coproc_missing = true; + 
if (rp->resource_usage.rsc_type) { + rp->resource_usage.missing_coproc = true; } } msg_printf(NULL, MSG_INFO, @@ -291,8 +289,8 @@ bool check_coprocs_usable() { gpus_usable = true; for (i=0; iavp->gpu_usage.rsc_type) { - rp->coproc_missing = false; + if (rp->resource_usage.rsc_type) { + rp->resource_usage.missing_coproc = false; } } msg_printf(NULL, MSG_INFO, @@ -614,12 +612,12 @@ static void update_rec() { } } -static double peak_flops(APP_VERSION* avp) { +static double peak_flops(RESULT *rp) { double f = gstate.host_info.p_fpops; - double x = f * avp->avg_ncpus; - int rt = avp->gpu_usage.rsc_type; + double x = f * rp->resource_usage.avg_ncpus; + int rt = rp->resource_usage.rsc_type; if (rt) { - x += f * avp->gpu_usage.usage * rsc_work_fetch[rt].relative_speed; + x += f * rp->resource_usage.coproc_usage * rsc_work_fetch[rt].relative_speed; } return x; } @@ -698,7 +696,7 @@ void PROJECT::compute_sched_priority() { // void adjust_rec_sched(RESULT* rp) { PROJECT* p = rp->project; - p->pwf.rec_temp += peak_flops(rp->avp)/total_peak_flops() * rec_sum/24; + p->pwf.rec_temp += peak_flops(rp)/total_peak_flops() * rec_sum/24; p->compute_sched_priority(); } @@ -803,7 +801,7 @@ static void promote_once_ran_edf() { if (atp->once_ran_edf) { RESULT* rp = atp->result; PROJECT* p = rp->project; - if (p->deadlines_missed(rp->avp->rsc_type())) { + if (p->deadlines_missed(rp->resource_usage.rsc_type)) { if (log_flags.cpu_sched_debug) { msg_printf(p, MSG_INFO, "[cpu_sched_debug] domino prevention: mark %s as deadline miss", @@ -1085,8 +1083,8 @@ static inline bool more_important(RESULT* r0, RESULT* r1) { // for CPU jobs, favor jobs that use more CPUs // if (!cp0) { - if (r0->avp->avg_ncpus > r1->avp->avg_ncpus) return true; - if (r1->avp->avg_ncpus > r0->avp->avg_ncpus) return false; + if (r0->resource_usage.avg_ncpus > r1->resource_usage.avg_ncpus) return true; + if (r1->resource_usage.avg_ncpus > r0->resource_usage.avg_ncpus) return false; } // favor jobs selected first by 
schedule_cpus() @@ -1277,7 +1275,7 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { // if (ncpus_used >= n_usable_cpus) { if (rp->uses_coprocs()) { - if (ncpus_used + rp->avp->avg_ncpus > n_usable_cpus+1) { + if (ncpus_used + rp->resource_usage.avg_ncpus > n_usable_cpus+1) { if (log_flags.cpu_sched_debug) { msg_printf(rp->project, MSG_INFO, "[cpu_sched_debug] skipping GPU job %s; CPU committed", @@ -1370,7 +1368,7 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { continue; } - ncpus_used += rp->avp->avg_ncpus; + ncpus_used += rp->resource_usage.avg_ncpus; atp->next_scheduler_state = CPU_SCHED_SCHEDULED; ram_left -= ewss; if (have_max_concurrent) { diff --git a/client/cs_scheduler.cpp b/client/cs_scheduler.cpp index b36dfe1d453..808807e7441 100644 --- a/client/cs_scheduler.cpp +++ b/client/cs_scheduler.cpp @@ -305,22 +305,22 @@ int CLIENT_STATE::make_scheduler_request(PROJECT* p) { double x = rp->estimated_runtime_remaining(); if (x == 0) continue; safe_strcpy(buf, ""); - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt) { if (rt == rsc_index(GPU_TYPE_NVIDIA)) { snprintf(buf, sizeof(buf), " %f\n", - rp->avp->gpu_usage.usage + rp->resource_usage.coproc_usage ); } else if (rt == rsc_index(GPU_TYPE_ATI)) { snprintf(buf, sizeof(buf), " %f\n", - rp->avp->gpu_usage.usage + rp->resource_usage.coproc_usage ); } else if (rt == rsc_index(GPU_TYPE_INTEL)) { snprintf(buf, sizeof(buf), " %f\n", - rp->avp->gpu_usage.usage + rp->resource_usage.coproc_usage ); } } @@ -335,7 +335,7 @@ int CLIENT_STATE::make_scheduler_request(PROJECT* p) { rp->name, rp->report_deadline, x, - rp->avp->avg_ncpus, + rp->resource_usage.avg_ncpus, buf ); } @@ -912,10 +912,10 @@ int CLIENT_STATE::handle_scheduler_reply( continue; } } - if (avpp.missing_coproc) { + if (avpp.resource_usage.missing_coproc) { msg_printf(project, MSG_INTERNAL_ERROR, "App version uses non-existent %s GPU", - avpp.missing_coproc_name + avpp.resource_usage.missing_coproc_name 
); } APP* app = lookup_app(project, avpp.app_name); @@ -931,10 +931,7 @@ int CLIENT_STATE::handle_scheduler_reply( if (avp) { // update app version attributes in case they changed on server // - avp->avg_ncpus = avpp.avg_ncpus; - avp->flops = avpp.flops; - safe_strcpy(avp->cmdline, avpp.cmdline); - avp->gpu_usage = avpp.gpu_usage; + avp->resource_usage = avpp.resource_usage; strlcpy(avp->api_version, avpp.api_version, sizeof(avp->api_version)); avp->dont_throttle = avpp.dont_throttle; avp->needs_network = avpp.needs_network; @@ -1016,7 +1013,8 @@ int CLIENT_STATE::handle_scheduler_reply( delete rp; continue; } - if (rp->avp->missing_coproc) { + rp->init_resource_usage(); + if (rp->resource_usage.missing_coproc) { msg_printf(project, MSG_INTERNAL_ERROR, "Missing coprocessor for task %s; aborting", rp->name ); @@ -1024,7 +1022,7 @@ int CLIENT_STATE::handle_scheduler_reply( } else { rp->set_state(RESULT_NEW, "handle_scheduler_reply"); got_work_for_rsc[0] = true; - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt > 0) { est_rsc_runtime[rt] += rp->estimated_runtime(); got_work_for_rsc[rt] = true; diff --git a/client/cs_statefile.cpp b/client/cs_statefile.cpp index 09a34d17aea..d3e1b53cf30 100644 --- a/client/cs_statefile.cpp +++ b/client/cs_statefile.cpp @@ -292,18 +292,18 @@ int CLIENT_STATE::parse_state_file_aux(const char* fname) { safe_strcpy(avp->platform, get_primary_platform()); } } - if (avp->missing_coproc) { - if (strstr(avp->missing_coproc_name, "Apple ")) { + if (avp->resource_usage.missing_coproc) { + if (strstr(avp->resource_usage.missing_coproc_name, "Apple ")) { msg_printf(project, MSG_INFO, "App version uses deprecated GPU type '%s' - discarding", - avp->missing_coproc_name + avp->resource_usage.missing_coproc_name ); delete avp; continue; } else { msg_printf(project, MSG_INFO, "App version uses missing GPU '%s'", - avp->missing_coproc_name + avp->resource_usage.missing_coproc_name ); } } @@ -394,11 +394,11 @@ int 
CLIENT_STATE::parse_state_file_aux(const char* fname) { delete rp; continue; } - if (rp->avp->missing_coproc) { + rp->init_resource_usage(); + if (rp->resource_usage.missing_coproc) { msg_printf(project, MSG_INFO, "Missing coprocessor for task %s", rp->name ); - rp->coproc_missing = true; } rp->wup->version_num = rp->version_num; results.push_back(rp); diff --git a/client/log_flags.cpp b/client/log_flags.cpp index 74c11ca5253..a08943857ad 100644 --- a/client/log_flags.cpp +++ b/client/log_flags.cpp @@ -770,8 +770,8 @@ void process_gpu_exclusions() { for (i=0; imissing_coproc) continue; - int rt = avp->gpu_usage.rsc_type; + if (avp->resource_usage.missing_coproc) continue; + int rt = avp->resource_usage.rsc_type; if (!rt) continue; COPROC& cp = coprocs.coprocs[rt]; bool found = false; @@ -782,12 +782,11 @@ void process_gpu_exclusions() { } } if (found) continue; - avp->missing_coproc = true; - safe_strcpy(avp->missing_coproc_name, ""); + avp->resource_usage.missing_coproc = true; + safe_strcpy(avp->resource_usage.missing_coproc_name, ""); for (j=0; javp != avp) continue; - rp->coproc_missing = true; msg_printf(avp->project, MSG_INFO, "marking %s as coproc missing", rp->name diff --git a/client/project.cpp b/client/project.cpp index 591da195f47..7116ef2021b 100644 --- a/client/project.cpp +++ b/client/project.cpp @@ -698,7 +698,7 @@ void PROJECT::get_task_durs(double& not_started_dur, double& in_progress_dur) { RESULT* rp = gstate.results[i]; if (rp->project != this) continue; double d = rp->estimated_runtime_remaining(); - d /= gstate.time_stats.availability_frac(rp->avp->gpu_usage.rsc_type); + d /= gstate.time_stats.availability_frac(rp->resource_usage.rsc_type); if (rp->is_not_started()) { not_started_dur += d; } else { @@ -827,7 +827,7 @@ bool PROJECT::runnable(int rsc_type) { RESULT* rp = gstate.results[i]; if (rp->project != this) continue; if (rsc_type != RSC_TYPE_ANY) { - if (rp->avp->gpu_usage.rsc_type != rsc_type) { + if (rp->resource_usage.rsc_type != 
rsc_type) { continue; } } @@ -981,7 +981,7 @@ void PROJECT::check_no_apps() { for (unsigned int i=0; iproject != this) continue; - no_rsc_apps[avp->gpu_usage.rsc_type] = false; + no_rsc_apps[avp->resource_usage.rsc_type] = false; } } diff --git a/client/result.cpp b/client/result.cpp index 0a780872d4f..64d25c14080 100644 --- a/client/result.cpp +++ b/client/result.cpp @@ -78,7 +78,6 @@ void RESULT::clear() { exit_status = 0; stderr_out.clear(); suspended_via_gui = false; - coproc_missing = false; report_immediately = false; not_started = false; name_md5.clear(); @@ -389,7 +388,7 @@ int RESULT::write_gui(MIOFILE& out, bool check_resources) { if (project->suspended_via_gui) out.printf(" \n"); if (report_immediately) out.printf(" \n"); if (edf_scheduled) out.printf(" \n"); - if (coproc_missing) out.printf(" \n"); + if (resource_usage.missing_coproc) out.printf(" \n"); if (schedule_backoff > gstate.now) { out.printf(" \n"); if (strlen(schedule_backoff_reason)) { @@ -405,35 +404,35 @@ int RESULT::write_gui(MIOFILE& out, bool check_resources) { atp->write_gui(out); } if (!strlen(resources) || check_resources) { // update resource string only when zero or when app_config is updated. 
- if (avp->gpu_usage.rsc_type) { - if (avp->gpu_usage.usage == 1) { + if (resource_usage.rsc_type) { + if (resource_usage.coproc_usage == 1) { snprintf(resources, sizeof(resources), "%.3g %s + 1 %s", - avp->avg_ncpus, - cpu_string(avp->avg_ncpus), - rsc_name_long(avp->gpu_usage.rsc_type) + resource_usage.avg_ncpus, + cpu_string(resource_usage.avg_ncpus), + rsc_name_long(resource_usage.rsc_type) ); } else { snprintf(resources, sizeof(resources), "%.3g %s + %.3g %ss", - avp->avg_ncpus, - cpu_string(avp->avg_ncpus), - avp->gpu_usage.usage, - rsc_name_long(avp->gpu_usage.rsc_type) + resource_usage.avg_ncpus, + cpu_string(resource_usage.avg_ncpus), + resource_usage.coproc_usage, + rsc_name_long(resource_usage.rsc_type) ); } - } else if (avp->missing_coproc) { + } else if (resource_usage.missing_coproc) { snprintf(resources, sizeof(resources), "%.3g %s + %.12s GPU (missing)", - avp->avg_ncpus, - cpu_string(avp->avg_ncpus), - avp->missing_coproc_name + resource_usage.avg_ncpus, + cpu_string(resource_usage.avg_ncpus), + resource_usage.missing_coproc_name ); - } else if (!project->non_cpu_intensive && (avp->avg_ncpus != 1)) { + } else if (!project->non_cpu_intensive && (resource_usage.avg_ncpus != 1)) { snprintf(resources, sizeof(resources), "%.3g %s", - avp->avg_ncpus, - cpu_string(avp->avg_ncpus) + resource_usage.avg_ncpus, + cpu_string(resource_usage.avg_ncpus) ); } else { safe_strcpy(resources, " "); @@ -444,13 +443,13 @@ int RESULT::write_gui(MIOFILE& out, bool check_resources) { char buf[256]; safe_strcpy(buf, ""); if (atp && atp->scheduler_state == CPU_SCHED_SCHEDULED) { - if (avp->gpu_usage.rsc_type) { - COPROC& cp = coprocs.coprocs[avp->gpu_usage.rsc_type]; + if (resource_usage.rsc_type) { + COPROC& cp = coprocs.coprocs[resource_usage.rsc_type]; if (cp.count > 1) { // if there are multiple GPUs of this type, // show the user which one(s) are being used // - int n = (int)ceil(avp->gpu_usage.usage); + int n = (int)ceil(resource_usage.coproc_usage); safe_strcpy(buf, 
n>1?" (devices ":" (device "); for (int i=0; isuspended_via_gui) return false; if (state() != RESULT_FILES_DOWNLOADED) return false; - if (coproc_missing) return false; + if (resource_usage.missing_coproc) return false; if (schedule_backoff > gstate.now) return false; if (avp->needs_network && gstate.file_xfers_suspended) { // check file_xfers_suspended rather than network_suspended; @@ -618,7 +617,7 @@ bool RESULT::nearly_runnable() { default: return false; } - if (coproc_missing) return false; + if (resource_usage.missing_coproc) return false; if (schedule_backoff > gstate.now) return false; return true; } @@ -635,7 +634,7 @@ bool RESULT::downloading() { } double RESULT::estimated_runtime_uncorrected() { - return wup->rsc_fpops_est/avp->flops; + return wup->rsc_fpops_est/resource_usage.flops; } // estimate how long a result will take on this host @@ -665,7 +664,7 @@ double RESULT::estimated_runtime_remaining() { if (atp) { #ifdef SIM - return sim_flops_left/avp->flops; + return sim_flops_left/resource_usage.flops; #else return atp->est_dur() - atp->elapsed_time; #endif diff --git a/client/result.h b/client/result.h index 555e735eb34..22b4e5a4a05 100644 --- a/client/result.h +++ b/client/result.h @@ -81,9 +81,6 @@ struct RESULT { // // - X, where X is the app's stderr output bool suspended_via_gui; - bool coproc_missing; - // a coproc needed by this job is missing - // (e.g. because user removed their GPU board). 
bool report_immediately; bool not_started; // temp for CPU sched @@ -92,6 +89,8 @@ struct RESULT { APP* app; WORKUNIT* wup; + RESOURCE_USAGE resource_usage; + // copied from either app version or workunit PROJECT* project; RESULT(){ @@ -124,10 +123,17 @@ struct RESULT { #ifdef SIM return sim_flops_left; #else - return estimated_runtime_remaining()*avp->flops; + return estimated_runtime_remaining()*resource_usage.flops; #endif } + inline void init_resource_usage() { + if (wup->has_resource_usage) { + resource_usage = wup->resource_usage; + } else { + resource_usage = avp->resource_usage; + } + } inline bool computing_done() { if (state() >= RESULT_COMPUTE_ERROR) return true; if (ready_to_report) return true; @@ -144,16 +150,16 @@ struct RESULT { // some input or app file is downloading, and backed off // i.e. it may be a long time before we can run this result inline bool uses_coprocs() { - return (avp->gpu_usage.rsc_type != 0); + return (resource_usage.rsc_type != 0); } inline bool uses_gpu() { - int rt = avp->gpu_usage.rsc_type; + int rt = resource_usage.rsc_type; if (!rt) return false; if (coprocs.coprocs[rt].non_gpu) return false; return true; } inline int resource_type() { - return avp->gpu_usage.rsc_type; + return resource_usage.rsc_type; } inline bool non_cpu_intensive() { if (project->non_cpu_intensive) return true; @@ -172,14 +178,14 @@ struct RESULT { } // make a string describing resource usage inline void rsc_string(char* buf, int len) { - if (avp->gpu_usage.rsc_type) { + if (resource_usage.rsc_type) { snprintf(buf, len, "%.2f CPU + %.2f %s", - avp->avg_ncpus, avp->gpu_usage.usage, - rsc_name_long(avp->gpu_usage.rsc_type) + resource_usage.avg_ncpus, resource_usage.coproc_usage, + rsc_name_long(resource_usage.rsc_type) ); } else { - snprintf(buf, len, "%.2f CPU", avp->avg_ncpus); + snprintf(buf, len, "%.2f CPU", resource_usage.avg_ncpus); } } diff --git a/client/rr_sim.cpp b/client/rr_sim.cpp index c9a4edc9a5f..f9d57090408 100644 --- a/client/rr_sim.cpp 
+++ b/client/rr_sim.cpp @@ -80,20 +80,20 @@ struct RR_SIM { inline void activate(RESULT* rp) { PROJECT* p = rp->project; active_jobs.push_back(rp); - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; // if this is a GPU app and GPU computing is suspended, // don't count its CPU usage. // That way we'll fetch more CPU work if needed. // if (!rt || !gpu_suspend_reason) { - rsc_work_fetch[0].sim_nused += rp->avp->avg_ncpus; - p->rsc_pwf[0].sim_nused += rp->avp->avg_ncpus; + rsc_work_fetch[0].sim_nused += rp->resource_usage.avg_ncpus; + p->rsc_pwf[0].sim_nused += rp->resource_usage.avg_ncpus; } if (rt) { - rsc_work_fetch[rt].sim_nused += rp->avp->gpu_usage.usage; - p->rsc_pwf[rt].sim_nused += rp->avp->gpu_usage.usage; + rsc_work_fetch[rt].sim_nused += rp->resource_usage.coproc_usage; + p->rsc_pwf[rt].sim_nused += rp->resource_usage.coproc_usage; if (rsc_work_fetch[rt].has_exclusions) { set_bits( rp->app->non_excluded_instances[rt], @@ -130,11 +130,11 @@ void set_rrsim_flops(RESULT* rp) { // For coproc jobs, use app version estimate // if (rp->uses_gpu()) { - rp->rrsim_flops = rp->avp->flops * gstate.overall_gpu_frac(); + rp->rrsim_flops = rp->resource_usage.flops * gstate.overall_gpu_frac(); } else if (rp->avp->needs_network) { - rp->rrsim_flops = rp->avp->flops * gstate.overall_cpu_and_network_frac(); + rp->rrsim_flops = rp->resource_usage.flops * gstate.overall_cpu_and_network_frac(); } else { - rp->rrsim_flops = rp->avp->flops * gstate.overall_cpu_frac(); + rp->rrsim_flops = rp->resource_usage.flops * gstate.overall_cpu_frac(); } if (rp->rrsim_flops == 0) { rp->rrsim_flops = 1e6; // just in case @@ -195,11 +195,11 @@ void RR_SIM::init_pending_lists() { PROJECT* p = rp->project; p->pwf.n_runnable_jobs++; - p->rsc_pwf[0].nused_total += rp->avp->avg_ncpus; + p->rsc_pwf[0].nused_total += rp->resource_usage.avg_ncpus; set_rrsim_flops(rp); - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt) { - 
p->rsc_pwf[rt].nused_total += rp->avp->gpu_usage.usage; + p->rsc_pwf[rt].nused_total += rp->resource_usage.coproc_usage; p->rsc_pwf[rt].n_runnable_jobs++; p->rsc_pwf[rt].queue_est += rp->rrsim_flops_left/rp->rrsim_flops; } @@ -407,13 +407,13 @@ static void handle_missed_deadline(RESULT* rpbest, double diff, double ar) { } } else { rpbest->rr_sim_misses_deadline = true; - int rt = rpbest->avp->gpu_usage.rsc_type; + int rt = rpbest->resource_usage.rsc_type; if (rt) { pbest->rsc_pwf[rt].deadlines_missed++; - rsc_work_fetch[rt].deadline_missed_instances += rpbest->avp->gpu_usage.usage; + rsc_work_fetch[rt].deadline_missed_instances += rpbest->resource_usage.coproc_usage; } else { pbest->rsc_pwf[0].deadlines_missed++; - rsc_work_fetch[0].deadline_missed_instances += rpbest->avp->avg_ncpus; + rsc_work_fetch[0].deadline_missed_instances += rpbest->resource_usage.avg_ncpus; } if (log_flags.rr_simulation) { msg_printf(pbest, MSG_INFO, @@ -561,10 +561,10 @@ void RR_SIM::simulate() { // double frac = rpbest->uses_gpu()?gstate.overall_gpu_frac():gstate.overall_cpu_frac(); double dur = rpbest->estimated_runtime_remaining() / frac; - rsc_work_fetch[0].update_busy_time(dur, rpbest->avp->avg_ncpus); - int rt = rpbest->avp->gpu_usage.rsc_type; + rsc_work_fetch[0].update_busy_time(dur, rpbest->resource_usage.avg_ncpus); + int rt = rpbest->resource_usage.rsc_type; if (rt) { - rsc_work_fetch[rt].update_busy_time(dur, rpbest->avp->gpu_usage.usage); + rsc_work_fetch[rt].update_busy_time(dur, rpbest->resource_usage.coproc_usage); } } } @@ -698,20 +698,19 @@ int n_idle_resources() { RESULT* rp = gstate.results[i]; if (!rp->nearly_runnable()) continue; if (rp->some_download_stalled()) continue; - APP_VERSION* avp = rp->avp; if (rsc_work_fetch[0].nidle_now) { - rsc_work_fetch[0].nidle_now -= avp->avg_ncpus; + rsc_work_fetch[0].nidle_now -= rp->resource_usage.avg_ncpus; if (rsc_work_fetch[0].nidle_now <= 0) { nidle_rsc--; rsc_work_fetch[0].nidle_now = 0; } } - int j = 
avp->gpu_usage.rsc_type; + int j = rp->resource_usage.rsc_type; if (!j) { continue; } if (rsc_work_fetch[j].nidle_now) { - rsc_work_fetch[j].nidle_now -= avp->gpu_usage.usage; + rsc_work_fetch[j].nidle_now -= rp->resource_usage.coproc_usage; if (rsc_work_fetch[j].nidle_now <= 0) { nidle_rsc--; rsc_work_fetch[j].nidle_now = 0; diff --git a/client/sim.cpp b/client/sim.cpp index 7f8d5a528d6..6258fb73f78 100644 --- a/client/sim.cpp +++ b/client/sim.cpp @@ -133,13 +133,13 @@ void usage(char* prog) { exit(1); } -// peak flops of an app version +// peak flops of a result // -double app_peak_flops(APP_VERSION* avp, double cpu_scale) { - double x = avp->avg_ncpus*cpu_scale; - int rt = avp->gpu_usage.rsc_type; +double app_peak_flops(RESULT* rp, double cpu_scale) { + double x = rp->resource_usage.avg_ncpus*cpu_scale; + int rt = rp->resource_usage.rsc_type; if (rt) { - x += avp->gpu_usage.usage * rsc_work_fetch[rt].relative_speed; + x += rp->resource_usage.coproc_usage * rsc_work_fetch[rt].relative_speed; } x *= gstate.host_info.p_fpops; return x; @@ -184,7 +184,7 @@ APP* choose_app(vector& apps) { bool app_version_needs_work(APP_VERSION* avp) { if (avp->dont_use) return false; - int rt = avp->gpu_usage.rsc_type; + int rt = avp->resource_usage.rsc_type; if (rt) { return (rsc_work_fetch[rt].req_secs>0 || rsc_work_fetch[rt].req_instances>0); } @@ -210,7 +210,7 @@ APP_VERSION* choose_app_version(APP* app) { if (!app_version_needs_work(avp)) continue; if (!best_avp) { best_avp = avp; - } else if (avp->flops > best_avp->flops) { + } else if (avp->resource_usage.flops > best_avp->resource_usage.flops) { best_avp = avp; } } @@ -325,18 +325,21 @@ void decrement_request_rsc( } void decrement_request(RESULT* rp) { - APP_VERSION* avp = rp->avp; - double est_runtime = rp->wup->rsc_fpops_est/avp->flops; + double est_runtime = rp->wup->rsc_fpops_est/rp->resource_usage.flops; est_runtime /= (gstate.time_stats.on_frac*gstate.time_stats.active_frac); - decrement_request_rsc(rsc_work_fetch[0], 
avp->avg_ncpus, est_runtime); - int rt = avp->gpu_usage.rsc_type; + decrement_request_rsc( + rsc_work_fetch[0], rp->resource_usage.avg_ncpus, est_runtime + ); + int rt = rp->resource_usage.rsc_type; if (rt) { - decrement_request_rsc(rsc_work_fetch[rt], avp->gpu_usage.usage, est_runtime); + decrement_request_rsc( + rsc_work_fetch[rt], rp->resource_usage.coproc_usage, est_runtime + ); } } double get_estimated_delay(RESULT* rp) { - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; return rsc_work_fetch[rt].estimated_delay; } @@ -415,7 +418,7 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) { WORKUNIT* wup = new WORKUNIT; make_job(p, wup, rp, wapps); - double et = wup->rsc_fpops_est / rp->avp->flops; + double et = wup->rsc_fpops_est / rp->resource_usage.flops; if (server_uses_workload) { IP_RESULT c(rp->name, rp->report_deadline-now, et); if (check_candidate(c, n_usable_cpus, ip_results)) { @@ -596,10 +599,10 @@ bool ACTIVE_TASK_SET::poll() { RESULT* rp = atp->result; if (rp->uses_gpu()) { if (gpu_active) { - cpu_usage_gpu += rp->avp->avg_ncpus; + cpu_usage_gpu += rp->resource_usage.avg_ncpus; } } else { - cpu_usage_cpu += rp->avp->avg_ncpus; + cpu_usage_cpu += rp->resource_usage.avg_ncpus; } } double cpu_usage = cpu_usage_cpu + cpu_usage_gpu; @@ -620,7 +623,7 @@ bool ACTIVE_TASK_SET::poll() { continue; } atp->elapsed_time += diff; - double flops = rp->avp->flops; + double flops = rp->resource_usage.flops; if (!rp->uses_gpu()) { flops *= cpu_scale; } @@ -641,7 +644,7 @@ bool ACTIVE_TASK_SET::poll() { html_msg += buf; action = true; } - double pf = diff * app_peak_flops(rp->avp, cpu_scale); + double pf = diff * app_peak_flops(rp, cpu_scale); rp->project->project_results.flops_used += pf; rp->peak_flop_count += pf; sim_results.flops_used += pf; @@ -852,10 +855,10 @@ void show_resource(int rsc_type) { PROJECT* p = rp->project; double ninst=0; if (rsc_type) { - if (rp->avp->gpu_usage.rsc_type != rsc_type) continue; - ninst = 
rp->avp->gpu_usage.usage; + if (rp->resource_usage.rsc_type != rsc_type) continue; + ninst = rp->resource_usage.coproc_usage; } else { - ninst = rp->avp->avg_ncpus; + ninst = rp->resource_usage.avg_ncpus; } if (!found) { @@ -1127,8 +1130,8 @@ void simulate() { " %s %s (%s)\n time left %s deadline %s\n", rp->project->project_name, rp->name, - rsc_name_long(rp->avp->gpu_usage.rsc_type), - timediff_format(rp->sim_flops_left/rp->avp->flops).c_str(), + rsc_name_long(rp->resource_usage.rsc_type), + timediff_format(rp->sim_flops_left/rp->resource_usage.flops).c_str(), timediff_format(rp->report_deadline - START_TIME).c_str() ); } @@ -1209,23 +1212,23 @@ void show_app(APP* app) { for (unsigned int i=0; iapp != app) continue; - if (avp->gpu_usage.rsc_type) { + if (avp->resource_usage.rsc_type) { fprintf(summary_file, " app version %d (%s)\n" " %.2f CPUs, %.2f %s GPUs, %.0f GFLOPS\n", avp->version_num, avp->plan_class, - avp->avg_ncpus, - avp->gpu_usage.usage, - rsc_name(avp->gpu_usage.rsc_type), - avp->flops/1e9 + avp->resource_usage.avg_ncpus, + avp->resource_usage.coproc_usage, + rsc_name(avp->resource_usage.rsc_type), + avp->resource_usage.flops/1e9 ); } else { fprintf(summary_file, " app version %d (%s)\n" " %.2f CPUs, %.0f GFLOPS\n", avp->version_num, avp->plan_class, - avp->avg_ncpus, - avp->flops/1e9 + avp->resource_usage.avg_ncpus, + avp->resource_usage.flops/1e9 ); } } @@ -1266,7 +1269,7 @@ void get_app_params() { } for (i=0; imissing_coproc) continue; + if (avp->resource_usage.missing_coproc) continue; avp->app->ignore = false; } fprintf(summary_file, "Applications and version\n"); diff --git a/client/sim_util.cpp b/client/sim_util.cpp index 03a693cc399..c589dd52c0d 100644 --- a/client/sim_util.cpp +++ b/client/sim_util.cpp @@ -141,7 +141,7 @@ int ACTIVE_TASK::init(RESULT* rp) { result = rp; wup = rp->wup; app_version = rp->avp; - max_elapsed_time = rp->wup->rsc_fpops_bound/result->avp->flops; + max_elapsed_time = 
rp->wup->rsc_fpops_bound/result->resource_usage.flops; max_disk_usage = rp->wup->rsc_disk_bound; max_mem_usage = rp->wup->rsc_memory_bound; _task_state = PROCESS_UNINITIALIZED; diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp index 01b5ea59c51..9c12dcf2743 100644 --- a/client/work_fetch.cpp +++ b/client/work_fetch.cpp @@ -60,7 +60,7 @@ inline bool has_coproc_app(PROJECT* p, int rsc_type) { for (i=0; iproject != p) continue; - if (avp->gpu_usage.rsc_type == rsc_type) return true; + if (avp->resource_usage.rsc_type == rsc_type) return true; } return false; } @@ -82,10 +82,10 @@ void RSC_PROJECT_WORK_FETCH::rr_init(PROJECT *p) { for (i=0; iproject != p) continue; - if (rsc_type && (avp->gpu_usage.rsc_type == rsc_type)) { - if (avp->gpu_usage.usage > x) x = avp->gpu_usage.usage; + if (rsc_type && (avp->resource_usage.rsc_type == rsc_type)) { + if (avp->resource_usage.coproc_usage > x) x = avp->resource_usage.coproc_usage; } else { - if (avp->avg_ncpus > x) x = avp->avg_ncpus; + if (avp->resource_usage.avg_ncpus > x) x = avp->resource_usage.avg_ncpus; } } @@ -442,7 +442,7 @@ void WORK_FETCH::rr_init() { RESULT* rp = gstate.results[i]; if (rp->schedule_backoff) { if (rp->schedule_backoff > gstate.now) { - int rt = rp->avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; rp->project->rsc_pwf[rt].has_deferred_job = true; } else { rp->schedule_backoff = 0; @@ -947,14 +947,14 @@ PROJECT* WORK_FETCH::choose_project() { // in last dt sec, and add to project totals // void WORK_FETCH::accumulate_inst_sec(ACTIVE_TASK* atp, double dt) { - APP_VERSION* avp = atp->result->avp; - PROJECT* p = atp->result->project; - double x = dt*avp->avg_ncpus; + RESULT *rp = atp->result; + PROJECT* p = rp->project; + double x = dt*rp->resource_usage.avg_ncpus; p->rsc_pwf[0].secs_this_rec_interval += x; rsc_work_fetch[0].secs_this_rec_interval += x; - int rt = avp->gpu_usage.rsc_type; + int rt = rp->resource_usage.rsc_type; if (rt) { - x = dt*avp->gpu_usage.usage; + x = 
dt*rp->resource_usage.coproc_usage; p->rsc_pwf[rt].secs_this_rec_interval += x; rsc_work_fetch[rt].secs_this_rec_interval += x; } @@ -1049,7 +1049,7 @@ void WORK_FETCH::handle_reply( } for (unsigned int i=0; iavp->gpu_usage.rsc_type] = true; + got_work[rp->resource_usage.rsc_type] = true; } for (int i=0; iproject != p) continue; - p->rsc_pwf[avp->gpu_usage.rsc_type].anonymous_platform_no_apps = false; + p->rsc_pwf[avp->resource_usage.rsc_type].anonymous_platform_no_apps = false; } } } @@ -1135,7 +1135,7 @@ void WORK_FETCH::init() { // clear backoff for app's resource // void WORK_FETCH::clear_backoffs(APP_VERSION& av) { - av.project->rsc_pwf[av.gpu_usage.rsc_type].clear_backoff(); + av.project->rsc_pwf[av.resource_usage.rsc_type].clear_backoff(); } //////////////////////// diff --git a/db/boinc_db_types.h b/db/boinc_db_types.h index 3586820dd85..c798764a54e 100644 --- a/db/boinc_db_types.h +++ b/db/boinc_db_types.h @@ -15,6 +15,10 @@ // You should have received a copy of the GNU Lesser General Public License // along with BOINC. If not, see . +// structures corresponding to various DB tables. +// In some cases the structures have extra fields, +// used by the server code but not stored in the DB + #ifndef _BOINC_DB_TYPES_ #define _BOINC_DB_TYPES_ diff --git a/html/inc/app_types.inc b/html/inc/app_types.inc new file mode 100644 index 00000000000..de599dd0a01 --- /dev/null +++ b/html/inc/app_types.inc @@ -0,0 +1,73 @@ +. + +// code to get list of app types (CPU/GPU) supported by project + +require_once("../inc/boinc_db.inc"); + +// return a structure indicating whether project can use +// various resource types, and a count of apps. +// Include both non-deprecated app versions and BUDA app variants. 
+// +function get_app_types() { + $t = new StdClass; + $t->cpu = false; + $t->cuda = false; + $t->ati = false; + $t->intel_gpu = false; + $t->apple_gpu = false; + $t->count = 0; + + $avs = BoincAppVersion::enum("deprecated=0"); + foreach ($avs as $av) { + do_plan_class($av->plan_class, $t); + } + + $pcs = file('../../buda_plan_classes'); + foreach ($pcs as $pc) { + do_plan_class($pc, $t); + } + return $t; +} + +function do_plan_class($plan_class, &$t) { + if (strstr($plan_class, "ati")) { + $t->ati = true; + $t->count++; + } else if (strstr($plan_class, "amd")) { + $t->ati = true; + $t->count++; + } else if (strstr($plan_class, "cuda")) { + $t->cuda = true; + $t->count++; + } else if (strstr($plan_class, "nvidia")) { + $t->cuda = true; + $t->count++; + } else if (strstr($plan_class, "intel_gpu")) { + $t->intel_gpu = true; + $t->count++; + } else if (strstr($plan_class, "apple_gpu")) { + $t->apple_gpu = true; + $t->count++; + } else { + $t->cpu = true; + $t->count++; + } +} + +?> diff --git a/html/inc/prefs_project.inc b/html/inc/prefs_project.inc index e0009735d14..c9bb4d68e12 100644 --- a/html/inc/prefs_project.inc +++ b/html/inc/prefs_project.inc @@ -44,6 +44,7 @@ // (send_email and show_hosts) that are treated as project preferences include_once("../inc/prefs_util.inc"); +include_once("../inc/app_types.inc"); include_once("../project/project_specific_prefs.inc"); global $app_types; @@ -70,7 +71,7 @@ if (!empty($accelerate_gpu_apps_pref)) { if ($app_types->cpu) { $project_pref_descs[] = new PREF_BOOL ( - tra("Use CPU"), + tra("Allow CPU-only tasks"), "Request CPU-only tasks from this project.", "no_cpu", false, diff --git a/html/inc/submit_util.inc b/html/inc/submit_util.inc index a3731be6f79..4d5ec9e726a 100644 --- a/html/inc/submit_util.inc +++ b/html/inc/submit_util.inc @@ -17,7 +17,7 @@ // You should have received a copy of the GNU Lesser General Public License // along with BOINC. If not, see . 
-// server-side utility functions for remote job submissions and control +// server-side utility functions for remote job submission and control require_once("../inc/submit_db.inc"); @@ -370,4 +370,41 @@ function parse_info_file($path) { return [$md5, $size]; } +///////////////// TEMPLATE CREATION ////////////// + +function file_ref_in($fname) { + return(sprintf( +' + %s + + +', + $fname + )); +} +function file_info_out($i) { + return sprintf( +' + + + + 5000000 + + +', + $i + ); +} + +function file_ref_out($i, $fname) { + return sprintf( +' + + %s + + +', $i, $fname + ); +} + ?> diff --git a/html/inc/util.inc b/html/inc/util.inc index ea2fc0d50ef..b740c94cd48 100644 --- a/html/inc/util.inc +++ b/html/inc/util.inc @@ -1009,46 +1009,6 @@ function db_init($try_replica=false) { return 0; } -// return a structure indicating whether project has non-deprecated -// apps versions for various resource types, -// and with a count of app versions -// -function get_app_types() { - $t = new StdClass; - $t->cpu = false; - $t->cuda = false; - $t->ati = false; - $t->intel_gpu = false; - $t->apple_gpu = false; - $t->count = 0; - $avs = BoincAppVersion::enum("deprecated=0"); - foreach ($avs as $av) { - if (strstr($av->plan_class, "ati")) { - $t->ati = true; - $t->count++; - } else if (strstr($av->plan_class, "amd")) { - $t->ati = true; - $t->count++; - } else if (strstr($av->plan_class, "cuda")) { - $t->cuda = true; - $t->count++; - } else if (strstr($av->plan_class, "nvidia")) { - $t->cuda = true; - $t->count++; - } else if (strstr($av->plan_class, "intel_gpu")) { - $t->intel_gpu = true; - $t->count++; - } else if (strstr($av->plan_class, "apple_gpu")) { - $t->apple_gpu = true; - $t->count++; - } else { - $t->cpu = true; - $t->count++; - } - } - return $t; -} - // Functions to sanitize GET and POST args // "next_url" arguments (must be local, not full URLs) diff --git a/html/user/buda.php b/html/user/buda.php index aea8aeea7d6..0068293ebdc 100644 --- a/html/user/buda.php +++ 
b/html/user/buda.php @@ -29,6 +29,32 @@ $buda_root = "../../buda_apps"; +// scan BUDA apps and variants, and write a file 'buda_plan_classes' +// in the project dir with list of plan classes +// +function write_plan_class_file() { + $pcs = []; + global $buda_root; + if (is_dir($buda_root)) { + $apps = scandir($buda_root); + foreach ($apps as $app) { + if ($app[0] == '.') continue; + if (!is_dir("$buda_root/$app")) continue; + $vars = scandir("$buda_root/$app"); + foreach ($vars as $var) { + if ($var[0] == '.') continue; + if (!is_dir("$buda_root/$app/$var")) continue; + $pcs[] = $var; + } + } + } + $pcs = array_unique($pcs); + file_put_contents( + "../../buda_plan_classes", + implode("\n", $pcs)."\n" + ); +} + // show list of BUDA apps and variants, // w/ buttons for adding and deleting // @@ -165,6 +191,48 @@ function copy_and_stage_file($user, $fname, $dir, $app, $variant) { return $phys_name; } +// create templates and put them in variant dir +// +function create_templates($variant, $variant_desc, $dir) { + // input template + // + $x = "\n"; + $ninfiles = 1 + count($variant_desc->input_file_names) + count($variant_desc->app_files); + for ($i=0; $i<$ninfiles; $i++) { + $x .= " \n \n \n"; + } + $x .= " \n"; + $x .= file_ref_in($variant_desc->dockerfile); + foreach ($variant_desc->app_files as $fname) { + $x .= file_ref_in($fname); + } + foreach ($variant_desc->input_file_names as $fname) { + $x .= file_ref_in($fname); + } + if ($variant == 'cpu') { + $x .= " \n"; + } else { + $x .= " $variant\n"; + } + $x .= " \n\n"; + file_put_contents("$dir/template_in", $x); + + // output template + // + $x = "\n"; + $i = 0; + foreach ($variant_desc->output_file_names as $fname) { + $x .= file_info_out($i++); + } + $x .= " \n"; + $i = 0; + foreach ($variant_desc->output_file_names as $fname) { + $x .= file_ref_out($i++, $fname); + } + $x .= " \n\n"; + file_put_contents("$dir/template_out", $x); +} + // create variant // function variant_action($user) { @@ -219,6 +287,8 @@ 
function variant_action($user) { json_encode($desc, JSON_PRETTY_PRINT) ); + create_templates($variant, $desc, $dir); + // Note: we don't currently allow indirect file access. // If we did, we'd need to create job.toml to mount project dir @@ -350,9 +420,13 @@ function view_file() { case 'variant_form': variant_form($user); break; case 'variant_action': - variant_action($user); break; + variant_action($user); + write_plan_class_file(); + break; case 'variant_delete': - variant_delete(); break; + variant_delete(); + write_plan_class_file(); + break; case 'view_file': view_file(); break; case null: diff --git a/html/user/buda_submit.php b/html/user/buda_submit.php index 26c9d40fbd6..6b97a276d14 100644 --- a/html/user/buda_submit.php +++ b/html/user/buda_submit.php @@ -147,13 +147,13 @@ function parse_batch_dir($batch_dir, $variant_desc) { } function create_batch($user, $njobs, $app, $variant) { - global $buda_app; + global $buda_boinc_app; $now = time(); $batch_name = sprintf('buda_%d_%d', $user->id, $now); $description = "$app ($variant)"; $batch_id = BoincBatch::insert(sprintf( "(user_id, create_time, logical_start_time, logical_end_time, est_completion_time, njobs, fraction_done, nerror_jobs, state, completion_time, credit_estimate, credit_canonical, credit_total, name, app_id, project_state, description, expire_time) values (%d, %d, 0, 0, 0, %d, 0, 0, %d, 0, 0, 0, 0, '%s', %d, 0, '%s', 0)", - $user->id, $now, $njobs, BATCH_STATE_INIT, $batch_name, $buda_app->id, + $user->id, $now, $njobs, BATCH_STATE_INIT, $batch_name, $buda_boinc_app->id, $description )); return BoincBatch::lookup_id($batch_id); @@ -188,17 +188,21 @@ function stage_input_files($batch_dir, $batch_desc, $batch_id) { // Use --stdin, where each job is described by a line // function create_jobs( - $variant_desc, $batch_desc, $batch_id, $batch_dir_name, + $app, $variant, $variant_desc, + $batch_desc, $batch_id, $batch_dir_name, $wrapper_verbose, $cmdline ) { - global $buda_app; + global 
$buda_boinc_app; - // get list of names of app files + // get list of physical names of app files // $app_file_names = $variant_desc->dockerfile_phys; foreach ($variant_desc->app_files_phys as $pname) { $app_file_names .= " $pname"; } + + // make per-job lines to pass as stdin + // $job_cmds = ''; foreach ($batch_desc->jobs as $job) { $job_cmd = sprintf('--wu_name batch_%d__job_%s', $batch_id, $job->dir); @@ -221,10 +225,10 @@ function create_jobs( ); $cmd = sprintf( 'cd ../..; bin/create_work --appname %s --batch %d --stdin --command_line %s --wu_template %s --result_template %s', - $buda_app->name, $batch_id, + $buda_boinc_app->name, $batch_id, $cw_cmdline, - "buda_batches/$batch_dir_name/template_in", - "buda_batches/$batch_dir_name/template_out" + "buda_apps/$app/$variant/template_in", + "buda_apps/$app/$variant/template_out" ); $cmd .= sprintf(' > %s 2<&1', "buda_batches/errfile"); @@ -242,80 +246,6 @@ function create_jobs( } } -///////////////// TEMPLATE CREATION ////////////// - -function file_ref_in($fname) { - return(sprintf( -' - %s - - -', - $fname - )); -} -function file_info_out($i) { - return sprintf( -' - - - - 5000000 - - -', - $i - ); -} - -function file_ref_out($i, $fname) { - return sprintf( -' - - %s - - -', $i, $fname - ); -} - -// create templates and put them in batch dir -// -function create_templates($variant_desc, $batch_dir) { - // input template - // - $x = "\n"; - $ninfiles = 1 + count($variant_desc->input_file_names) + count($variant_desc->app_files); - for ($i=0; $i<$ninfiles; $i++) { - $x .= " \n \n \n"; - } - $x .= " \n"; - $x .= file_ref_in($variant_desc->dockerfile); - foreach ($variant_desc->app_files as $fname) { - $x .= file_ref_in($fname); - } - foreach ($variant_desc->input_file_names as $fname) { - $x .= file_ref_in($fname); - } - $x .= " \n\n"; - file_put_contents("$batch_dir/template_in", $x); - - // output template - // - $x = "\n"; - $i = 0; - foreach ($variant_desc->output_file_names as $fname) { - $x .= 
file_info_out($i++); - } - $x .= " \n"; - $i = 0; - foreach ($variant_desc->output_file_names as $fname) { - $x .= file_ref_out($i++, $fname); - } - $x .= " \n\n"; - file_put_contents("$batch_dir/template_out", $x); -} - function handle_submit($user) { $app = get_str('app'); if (!is_valid_filename($app)) die('bad arg'); @@ -338,8 +268,6 @@ function handle_submit($user) { // scan batch dir; validate and return struct $batch_desc = parse_batch_dir($batch_dir, $variant_desc); - create_templates($variant_desc, $batch_dir); - $batch = create_batch( $user, count($batch_desc->jobs), $app, $variant ); @@ -349,7 +277,8 @@ function handle_submit($user) { stage_input_files($batch_dir, $batch_desc, $batch->id); create_jobs( - $variant_desc, $batch_desc, $batch->id, $batch_dir_name, + $app, $variant, $variant_desc, + $batch_desc, $batch->id, $batch_dir_name, $wrapper_verbose, $cmdline ); @@ -365,9 +294,9 @@ function handle_submit($user) { } $user = get_logged_in_user(); -$buda_app = BoincApp::lookup("name='buda'"); -if (!$buda_app) error_page('no buda app'); -if (!has_submit_access($user, $buda_app->id)) { +$buda_boinc_app = BoincApp::lookup("name='buda'"); +if (!$buda_boinc_app) error_page('no buda app'); +if (!has_submit_access($user, $buda_boinc_app->id)) { error_page('no access'); } $action = get_str('action', true); diff --git a/html/user/get_output3.php b/html/user/get_output3.php index 2da95847926..b8f8fb7385b 100644 --- a/html/user/get_output3.php +++ b/html/user/get_output3.php @@ -47,7 +47,7 @@ function get_batch() { $batch_id = get_str('batch_id'); $dir = "../../results/$batch_id"; $name = "batch_$batch_id.zip"; - $cmd = "cd $dir; rm -f $name; zip $name *"; + $cmd = "cd $dir; rm -f $name; zip -q $name *"; system($cmd); do_download("$dir/$name"); } diff --git a/sched/plan_class_spec.xml.sample b/sched/plan_class_spec.xml.sample index 345b2a6b942..13cc6b8850e 100644 --- a/sched/plan_class_spec.xml.sample +++ b/sched/plan_class_spec.xml.sample @@ -29,7 +29,6 @@ nvidia 
100 - 200 3000 17700 254 diff --git a/sched/sched_array.cpp b/sched/sched_array.cpp index e28d19ceb52..86dfaf9034d 100644 --- a/sched/sched_array.cpp +++ b/sched/sched_array.cpp @@ -323,7 +323,15 @@ static bool scan_work_array() { // result.id = wu_result.resultid; if (result_still_sendable(result, wu)) { - add_result_to_reply(result, wu, bavp, false); + bool is_buda, is_ok; + HOST_USAGE hu; + check_buda_plan_class(wu, hu, is_buda, is_ok); + if (is_buda) { + if (!is_ok) continue; + } else { + hu = bavp->host_usage; + } + add_result_to_reply(result, wu, bavp, hu, is_buda, false); // add_result_to_reply() fails only in pathological cases - // e.g. we couldn't update the DB record or modify XML fields. diff --git a/sched/sched_assign.cpp b/sched/sched_assign.cpp index 260ed9a6db6..75115578d63 100644 --- a/sched/sched_assign.cpp +++ b/sched/sched_assign.cpp @@ -154,7 +154,15 @@ static int send_assigned_job(ASSIGNMENT& asg) { DB_ID_TYPE result_id = boinc_db.insert_id(); SCHED_DB_RESULT result; retval = result.lookup_id(result_id); - add_result_to_reply(result, wu, bavp, false); + bool is_buda, is_ok; + HOST_USAGE hu; + check_buda_plan_class(wu, hu, is_buda, is_ok); + if (is_buda) { + if (!is_ok) return -1; + } else { + hu = bavp->host_usage; + } + add_result_to_reply(result, wu, bavp, hu, is_buda, false); if (config.debug_assignment) { log_messages.printf(MSG_NORMAL, diff --git a/sched/sched_customize.cpp b/sched/sched_customize.cpp index a424b9bac99..4a315f3a1b5 100644 --- a/sched/sched_customize.cpp +++ b/sched/sched_customize.cpp @@ -959,15 +959,21 @@ static inline bool app_plan_vbox( return true; } -static inline bool app_plan_wsl(SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu) { +static inline bool app_plan_wsl( + SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu +) { // no additional checks at the moment, just return true return true; } -// app planning function. 
+// if host can handle the plan class, populate host usage and return true +// // See https://github.com/BOINC/boinc/wiki/AppPlan // -bool app_plan(SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu, const WORKUNIT* wu) { +bool app_plan( + SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu, + const WORKUNIT* wu +) { char buf[256]; static bool check_plan_class_spec = true; static bool have_plan_class_spec = false; @@ -975,7 +981,11 @@ bool app_plan(SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu, const W if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, - "[version] Checking plan class '%s'\n", plan_class + "[version] Checking plan class '%s' check %d have %d bad %d\n", + plan_class, + check_plan_class_spec, + have_plan_class_spec, + bad_plan_class_spec ); } diff --git a/sched/sched_locality.cpp b/sched/sched_locality.cpp index a780d1f2456..b8b93bd32d0 100644 --- a/sched/sched_locality.cpp +++ b/sched/sched_locality.cpp @@ -332,7 +332,15 @@ static int possibly_send_result(SCHED_DB_RESULT& result) { if (count > 0) return ERR_WU_USER_RULE; } - return add_result_to_reply(result, wu, bavp, true); + bool is_buda, is_ok; + HOST_USAGE hu; + check_buda_plan_class(wu, hu, is_buda, is_ok); + if (is_buda) { + if (!is_ok) return ERR_NO_APP_VERSION; + } else { + hu = bavp->host_usage; + } + return add_result_to_reply(result, wu, bavp, hu, is_buda, true); } // Retrieves and returns a trigger instance identified by the given diff --git a/sched/sched_nci.cpp b/sched/sched_nci.cpp index de3082efabe..b6f2fe7952d 100644 --- a/sched/sched_nci.cpp +++ b/sched/sched_nci.cpp @@ -94,7 +94,7 @@ static int send_job_for_app(APP& app) { "Sending non-CPU-intensive job: %s\n", wu.name ); } - add_result_to_reply(result, wu, bavp, false); + add_result_to_reply(result, wu, bavp, bavp->host_usage, false, false); return 0; } log_messages.printf(MSG_NORMAL, diff --git a/sched/sched_resend.cpp b/sched/sched_resend.cpp index 1eedcecd42b..d4ed3f12cce 100644 --- 
a/sched/sched_resend.cpp +++ b/sched/sched_resend.cpp @@ -249,7 +249,15 @@ bool resend_lost_work() { ); g_reply->insert_message(warning_msg, "low"); } else { - retval = add_result_to_reply(result, wu, bavp, false); + bool is_buda, is_ok; + HOST_USAGE hu; + check_buda_plan_class(wu, hu, is_buda, is_ok); + if (is_buda) { + if (!is_ok) continue; + } else { + hu = bavp->host_usage; + } + retval = add_result_to_reply(result, wu, bavp, hu, is_buda, false); if (retval) { log_messages.printf(MSG_CRITICAL, "[HOST#%lu] failed to send [RESULT#%lu]\n", diff --git a/sched/sched_score.cpp b/sched/sched_score.cpp index fc738f407d3..e97d45c55b6 100644 --- a/sched/sched_score.cpp +++ b/sched/sched_score.cpp @@ -50,6 +50,10 @@ static int get_size_class(APP& app, double es) { return app.n_size_classes - 1; } +JOB::JOB() { + memset(this, 0, sizeof(JOB)); +} + // Assign a score to this job, // representing the value of sending the job to this host. // Also do some initial screening, @@ -202,6 +206,7 @@ void send_work_score_type(int rt) { } WORKUNIT wu = wu_result.workunit; JOB job; + job.app = ssp->lookup_app(wu.appid); if (job.app->non_cpu_intensive) { if (config.debug_send_job) { @@ -223,6 +228,19 @@ void send_work_score_type(int rt) { continue; } + // check WU plan class (for BUDA jobs) + // + bool is_buda, is_ok; + HOST_USAGE hu; + check_buda_plan_class(wu, hu, is_buda, is_ok); + if (is_buda) { + if (!is_ok) continue; + job.host_usage = hu; + job.is_buda = true; + } else { + job.host_usage = job.bavp->host_usage; + } + job.index = i; job.result_id = wu_result.resultid; if (!job.get_score(i)) { @@ -350,7 +368,11 @@ void send_work_score_type(int rt) { SCHED_DB_RESULT result; result.id = wu_result.resultid; if (result_still_sendable(result, wu)) { - add_result_to_reply(result, wu, job.bavp, false); + add_result_to_reply( + result, wu, job.bavp, job.host_usage, + job.is_buda, + false // locality scheduling + ); // add_result_to_reply() fails only in pathological cases - // e.g. 
we couldn't update the DB record or modify XML fields. diff --git a/sched/sched_score.h b/sched/sched_score.h index 693a55ac5d3..10d4458a22a 100644 --- a/sched/sched_score.h +++ b/sched/sched_score.h @@ -23,7 +23,12 @@ struct JOB { double score; APP* app; BEST_APP_VERSION* bavp; + bool is_buda; + HOST_USAGE host_usage; + // if is_buda, usage returned by WU plan class + // else a copy of bavp->host_usage + JOB(); bool get_score(int); }; diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp index 46ea751bf00..0ce0f9f88e5 100644 --- a/sched/sched_send.cpp +++ b/sched/sched_send.cpp @@ -570,10 +570,58 @@ static int insert_wu_tags(WORKUNIT& wu, APP& app) { return insert_after(wu.xml_doc, "\n", buf); } +// add host usage into to WU's xml_doc (for BUDA jobs) +// +static int add_usage_to_wu(WORKUNIT &wu, HOST_USAGE &hu) { + char buf[2048], buf2[2048]; + snprintf(buf, sizeof(buf), + " %f\n" + " %f\n", + hu.avg_ncpus, + hu.projected_flops + ); + if (hu.proc_type != PROC_TYPE_CPU) { + snprintf(buf2, sizeof(buf2), + " \n" + " %s\n" + " %f\n" + " \n", + proc_type_name_xml(hu.proc_type), + hu.gpu_usage + ); + strcat(buf, buf2); + } + if (strlen(hu.cmdline)) { + snprintf(buf2, sizeof(buf2), + " %s\n", + hu.cmdline + ); + strcat(buf, buf2); + } + + char *p = wu.xml_doc; + if (strlen(p) + strlen(buf) + 10 > sizeof(wu.xml_doc)) { + log_messages.printf(MSG_CRITICAL, + "add_usage_to_wu(): field too small: %ld %ld %ld\n", + strlen(p), strlen(buf), sizeof(wu.xml_doc) + ); + return -1; + } + p = strstr(p, ""); + if (!p) { + log_messages.printf(MSG_CRITICAL, "add_usage_to_wu(): no end tag\n"); + return -1; + } + strcpy(p, buf); + strcat(p, ""); + return 0; +} + // Add the given workunit, app, and app version to a reply. 
// static int add_wu_to_reply( - WORKUNIT& wu, SCHEDULER_REPLY&, APP* app, BEST_APP_VERSION* bavp + WORKUNIT& wu, SCHEDULER_REPLY&, APP* app, BEST_APP_VERSION* bavp, + bool is_buda, HOST_USAGE &hu ) { int retval; WORKUNIT wu2, wu3; @@ -626,6 +674,11 @@ static int add_wu_to_reply( ); return retval; } + + if (is_buda) { + retval = add_usage_to_wu(wu2, hu); + if (retval) return retval; + } wu3 = wu2; if (strlen(config.replace_download_url_by_timezone)) { process_wu_timezone(wu2, wu3); @@ -885,10 +938,61 @@ inline static DB_ID_TYPE get_app_version_id(BEST_APP_VERSION* bavp) { } } +static bool wu_has_plan_class(WORKUNIT &wu, char* buf) { + char *p = strstr(wu.xml_doc, "<plan_class>"); + if (!p) return false; + p += strlen("<plan_class>"); + strncpy(buf, p, 256); + p = strstr(buf, "</plan_class>"); + if (!p) return false; + *p = 0; + return true; +} + +// If workunit has a plan class (e.g. BUDA) +// return false if host not capable +// plan class computes host usage +// is_buda = true +// else +// host usage is from app version +// is_buda = false +// +void check_buda_plan_class( + WORKUNIT &wu, HOST_USAGE &hu, bool &is_buda, bool &is_ok +) { + char plan_class[256]; + if (!wu_has_plan_class(wu, plan_class)) { + is_buda = false; + return; + } + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] plan class: %s\n", plan_class + ); + } + is_buda = true; + is_ok = true; + if (!strlen(plan_class)) { + hu.sequential_app(g_reply->host.p_fpops); + return; + } + if (!app_plan(*g_request, plan_class, hu, &wu)) { + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] app_plan(%s) returned false\n", plan_class + ); + } + // can't send this job + is_ok = false; + } +} + int add_result_to_reply( SCHED_DB_RESULT& result, WORKUNIT& wu, BEST_APP_VERSION* bavp, + HOST_USAGE &host_usage, + bool is_buda, bool locality_scheduling ) { int retval; @@ -899,7 +1003,7 @@ int add_result_to_reply( result.userid = g_reply->user.id; result.sent_time = time(0);
result.report_deadline = result.sent_time + wu.delay_bound; - result.flops_estimate = bavp->host_usage.peak_flops; + result.flops_estimate = host_usage.peak_flops; result.app_version_id = get_app_version_id(bavp); // update WU DB record. @@ -964,7 +1068,7 @@ int add_result_to_reply( // done with DB updates. // - retval = add_wu_to_reply(wu, *g_reply, app, bavp); + retval = add_wu_to_reply(wu, *g_reply, app, bavp, is_buda, host_usage); if (retval) return retval; // Adjust available disk space. @@ -978,7 +1082,7 @@ int add_result_to_reply( double est_dur = estimate_duration(wu, *bavp); if (config.debug_send) { - double max_time = wu.rsc_fpops_bound / bavp->host_usage.projected_flops; + double max_time = wu.rsc_fpops_bound / host_usage.projected_flops; char buf1[64],buf2[64]; secs_to_hmsf(est_dur, buf1); secs_to_hmsf(max_time, buf2); @@ -1017,11 +1121,11 @@ int add_result_to_reply( // because the scheduling of GPU jobs is constrained by the # of GPUs // if (g_wreq->rsc_spec_request) { - int pt = bavp->host_usage.proc_type; + int pt = host_usage.proc_type; if (pt == PROC_TYPE_CPU) { - double est_cpu_secs = est_dur*bavp->host_usage.avg_ncpus; + double est_cpu_secs = est_dur*host_usage.avg_ncpus; g_wreq->req_secs[PROC_TYPE_CPU] -= est_cpu_secs; - g_wreq->req_instances[PROC_TYPE_CPU] -= bavp->host_usage.avg_ncpus; + g_wreq->req_instances[PROC_TYPE_CPU] -= host_usage.avg_ncpus; if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] est_dur %f est_cpu_secs %f; new req_secs %f\n", @@ -1029,9 +1133,9 @@ int add_result_to_reply( ); } } else { - double est_gpu_secs = est_dur*bavp->host_usage.gpu_usage; + double est_gpu_secs = est_dur*host_usage.gpu_usage; g_wreq->req_secs[pt] -= est_gpu_secs; - g_wreq->req_instances[pt] -= bavp->host_usage.gpu_usage; + g_wreq->req_instances[pt] -= host_usage.gpu_usage; if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, "[send_job] est_dur %f est_gpu_secs %f; new req_secs %f\n", @@ -1045,7 +1149,7 @@ int 
add_result_to_reply( } update_estimated_delay(*bavp, est_dur); g_wreq->njobs_sent++; - config.max_jobs_in_progress.register_job(app, bavp->host_usage.proc_type); + config.max_jobs_in_progress.register_job(app, host_usage.proc_type); if (!resent_result) { DB_HOST_APP_VERSION* havp = bavp->host_app_version(); if (havp) { diff --git a/sched/sched_send.h b/sched/sched_send.h index 373533f0c8b..77e18449a6e 100644 --- a/sched/sched_send.h +++ b/sched/sched_send.h @@ -33,9 +33,17 @@ extern void send_work(); extern int add_result_to_reply( SCHED_DB_RESULT& result, WORKUNIT& wu, BEST_APP_VERSION* bavp, + HOST_USAGE&, + bool is_buda, bool locality_scheduling ); +// if WU has plan class, check host, and get corresponding host_usage +// +extern void check_buda_plan_class( + WORKUNIT &wu, HOST_USAGE &host_usage, bool &is_buda, bool &is_ok +); + inline bool is_anonymous(PLATFORM* platform) { return (!strcmp(platform->name, "anonymous")); } diff --git a/sched/sched_shmem.cpp b/sched/sched_shmem.cpp index 66faaa30c1d..d5403f4bd6a 100644 --- a/sched/sched_shmem.cpp +++ b/sched/sched_shmem.cpp @@ -28,6 +28,7 @@ #include <vector> using std::vector; +using std::string; #include "boinc_db.h" #include "error_numbers.h" @@ -107,6 +108,19 @@ static void overflow(const char* table, const char* param_name) { exit(1); } +void get_buda_plan_classes(vector<string> &pcs) { + pcs.clear(); + FILE *f = boinc::fopen("../buda_plan_classes", "r"); + if (!f) return; + char buf[256]; + while (boinc::fgets(buf, 256, f)) { + strip_whitespace(buf); + pcs.push_back(buf); + } +} + +// scan various DB tables and populate shared-memory arrays +// int SCHED_SHMEM::scan_tables() { DB_PLATFORM platform; DB_APP app; @@ -243,6 +257,12 @@ int SCHED_SHMEM::scan_tables() { int rt = plan_class_to_proc_type(av.plan_class); have_apps_for_proc_type[rt] = true; } + vector<string> buda_plan_classes; + get_buda_plan_classes(buda_plan_classes); + for (string pc: buda_plan_classes) { + int rt = plan_class_to_proc_type(pc.c_str()); + 
have_apps_for_proc_type[rt] = true; + } for (i=0; i