--[[ Local implementation of the cli_filter.lua interface for Pawsey machines.

  The cli_filter interface is provided by the functions
      slurm_cli_pre_submit(options, offset)
      slurm_cli_post_submit(offset, jobid, stepid)
      slurm_cli_setup_defaults(options, early)

  Debugging output is through the slurm logging interface, but because
  verbosity settings aren't propagated to the cli filter, we can only use the
  slurm.log_error and slurm.log_info lua API functions. Debug output is
  enabled only if the environment variable SLURM_CLI_FILTER_DEBUG is set to a
  number greater than zero.
]]

-- $Date$, $Revision$ substituted by keyword script in staging.

local git_date = '$Date$'
local git_revision = '$Revision$'

--[[ tokenize(str, pattern, max_tokens)

  Regard str as a string of tokens separated by separators that are described
  by the pattern string, and return the tokens as a table. Operates similarly
  to perl's split function.

  - If max_tokens is a positive number, only the first (max_tokens - 1)
    separators will be considered.
  - If max_tokens is zero, exclude any trailing empty tokens from the result.
  - If max_tokens is a negative number, return all tokens.
  - If the pattern matches a zero-length substring, it will only be considered
    to describe a separator if the preceding token would be non-empty.
]]

local function tokenize(str, pattern, max_tokens)
    if #str == 0 then return {} end

    pattern = pattern or '%s'
    max_tokens = max_tokens or 0

    local truncate_trailing_empty = max_tokens == 0
    local tokens = {}
    local tok_from = 1
    repeat
        if max_tokens == 1 then
            table.insert(tokens, str:sub(tok_from))
            break
        end
        max_tokens = max_tokens - 1

        local sep_from, sep_to = str:find(pattern, tok_from)
        -- Exclude zero-length tokens when the pattern gives a zero-length match.
        if sep_from == tok_from and sep_to < sep_from then
            sep_from, sep_to = str:find(pattern, tok_from + 1)
        end

        table.insert(tokens, str:sub(tok_from, (sep_from or 1 + #str) - 1))
        tok_from = (sep_to or #str) + 1
    until not sep_from

    if truncate_trailing_empty then
        while #tokens > 0 and tokens[#tokens] == '' do
            tokens[#tokens] = nil
        end
    end
    return tokens
end

-- Round float to nearest int (use string.format to avoid getting bogged down
-- in edge cases).
local function round_nearest(x)
    return math.tointeger(string.format('%.0f', x))
end

-- Test if a value exists in an array.
local function existsIn(array, value)
    for i = 1, #array do
        if array[i] == value then return true end
    end
    return false
end

-- Wrappers for slurm logging.
local function slurm_error(msg)
    slurm.log_error("cli_filter: %s", msg)
    return slurm.ERROR
end

local function slurm_errorf(fmt, ...)
    slurm.log_error("cli_filter: "..fmt, ...)
    return slurm.ERROR
end

-- Upping log verbosity in e.g. salloc for some reason does not apply to
-- cli_filter logging. For now, use log_info and rely upon an environment
-- variable to enable/disable debug output.
local function debug_lvl()
    local v = os.getenv('SLURM_CLI_FILTER_DEBUG')
    return (v and tonumber(v)) or 0
end

local function slurm_debug(msg)
    if debug_lvl() > 0 then slurm.log_info("cli_filter: %s", msg) end
end

local function slurm_debugf(fmt, ...)
    if debug_lvl() > 0 then slurm.log_info("cli_filter: "..fmt, ...) end
end

-- Execute command; return captured stdout and return code.
local function os_execute(cmd)
    local fileHandle = assert(io.popen(cmd, 'r'))
    local commandOutput = assert(fileHandle:read('*a'))
    local rc = {fileHandle:close()}
    return commandOutput, rc[3] -- rc[3] contains return code
end
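--[[ Illustrative examples for the helpers above (a sketch, not executed;
     results follow from the tokenize semantics described earlier):

       tokenize('a,,b,', ',')           --> { 'a', '', 'b' }     (trailing empty token dropped)
       tokenize('a,,b,', ',', -1)       --> { 'a', '', 'b', '' }
       tokenize('key=value=x', '=', 2)  --> { 'key', 'value=x' } (at most one separator used)
]]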
-- Run scontrol show partition; this function will be mocked in unit testing.
local function run_show_partition(partition)
    return os_execute('scontrol show partition --all --oneliner '..(partition or '')..' 2>/dev/null')
end

-- Run sinfo nodes -h -o "%Z %G"; this function will be mocked in unit testing.
local function run_sinfo_nodes_ZG(partition)
    return os_execute('sinfo nodes -h -o %Z\\ %G -p '..partition..' 2>/dev/null')
end

local function get_default_partition()
    local all_pinfo_str, rc = run_show_partition()
    if rc == 0 then
        for _, line in ipairs(tokenize(all_pinfo_str, '\n')) do
            if line:find("Default=YES") then
                return line:match('PartitionName=([^%s]+)')
            end
        end
    end
    return nil
end

local function get_default_partition_or_env()
    return os.getenv('SLURM_JOB_PARTITION') or get_default_partition()
end

-- Parse slurm strings of the form 'key=value,key=value,...' to a table.
-- Supply e.g. ':' as the second argument to parse strings of the form
-- 'key:value,...' etc. The third argument, key_pfx, is a prefix to be
-- stripped from every key on parsing. Why does Slurm turn
-- --gres=gpu:2,tmp:20G on the command line into
-- options['gres'] == 'gres/gpu:2,gres/tmp:20G'? Why does Slurm do anything?
local function parse_csv_tbl(csv, key_sep, key_pfx)
    key_sep = key_sep or '='
    key_pfx = key_pfx or ''

    local function strip_pfx(s)
        if key_pfx == '' then return s end
        if string.sub(s, 1, string.len(key_pfx)) == key_pfx then
            return string.sub(s, string.len(key_pfx) + 1)
        else
            return s
        end
    end

    local tbl = {}
    for _, field in ipairs(tokenize(csv, ',')) do
        local k, v = table.unpack(tokenize(field, key_sep, 2))
        if v == nil then
            table.insert(tbl, strip_pfx(k))
        else
            tbl[strip_pfx(k)] = v
        end
    end
    return tbl
end

-- Analogous to the above; add key_pfx to keys when reassembling a csv field.
local function collect_csv_tbl(tbl, key_sep, key_pfx)
    key_sep = key_sep or '='
    key_pfx = key_pfx or ''

    local csv = ''
    local field_sep = ''
    for k, v in pairs(tbl) do
        if type(k) == 'string' then
            csv = csv .. field_sep .. key_pfx .. k .. key_sep .. v
        else
            csv = csv .. field_sep .. key_pfx .. v
        end
        field_sep = ','
    end
    return csv
end
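--[[ Illustrative round trip through the two helpers above (a sketch, not
     executed):

       parse_csv_tbl('gres/gpu:2,gres/tmp:20G', ':', 'gres/')
         --> { gpu = '2', tmp = '20G' }
       collect_csv_tbl({ gpu = '2', tmp = '20G' }, ':', 'gres/')
         --> 'gres/gpu:2,gres/tmp:20G'  (field order depends on pairs traversal)
]]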
local function parse_partition_info_str(pinfo_str)
    if not pinfo_str then return nil end

    local pinfo = {}
    for _, field in ipairs(tokenize(pinfo_str, '%s+')) do
        local k, v = table.unpack(tokenize(field, '=', 2))
        -- Some fields are themselves expected to contain tables.
        if k == 'JobDefaults' or k == 'TRES' or k == 'TRESBillingWeights' then
            local rhs = {}
            if v ~= '(null)' then rhs = parse_csv_tbl(v, '=') end
            pinfo[k] = rhs
        else
            pinfo[k] = v
        end
    end
    return pinfo
end

local function get_partition_info(partition)
    if not partition or partition == '' then return nil end

    local pinfo_str, rc = run_show_partition(partition)
    if rc == 0 then
        return parse_partition_info_str(pinfo_str)
    end
    return nil
end

local function parse_node_gres_str(node_gres_str)
    if not node_gres_str then return nil end

    -- Expect output to be comma-separated key:value pairs. On the assumption
    -- that the partition is homogeneous, only look at the first line.
    local line = string.gsub(node_gres_str, '\n.*', '')
    if line == '(null)' then
        return {}
    else
        return parse_csv_tbl(line, ':')
    end
end

local function get_node_ppc_gres(partition)
    if not partition or partition == '' then return nil end

    local node_ZG_str, rc = run_sinfo_nodes_ZG(partition)
    if rc == 0 then
        local ZG = tokenize(node_ZG_str, ' ', 2)
        return tonumber(ZG[1]), parse_node_gres_str(ZG[2])
    end
    return nil
end

local function convert_MiB(memory)
    local scale_tbl = {
        t = 1024*1024, T = 1024*1024,
        g = 1024,      G = 1024,
        m = 1,         M = 1,
        k = 1/1024,    K = 1/1024
    }

    if tonumber(memory) then return tonumber(memory) end
    if type(memory) ~= 'string' then return nil end

    local scale = scale_tbl[string.sub(memory, -1)]
    memory = tonumber(string.sub(memory, 1, string.len(memory) - 1))

    if not scale or not memory then
        return nil
    else
        return memory * scale
    end
end

local function collate_spank_options(opt_tbl)
    local collated = {}
    if type(opt_tbl) == 'table' and type(opt_tbl.spank) == 'table' then
        for plugin, ptbl in pairs(opt_tbl.spank) do
            if type(ptbl) == 'table' then
                for k, v in pairs(ptbl) do
                    if v == nil then
                        table.insert(collated, k)
                    else
                        collated[k] = v
                    end
                end
            end
        end
    end
    return collated
end

-- When the partition is the acceptance partition, the effective partition for
-- the purposes of cli-filter processing is determined by the supplied
-- constraints.
--
-- The acceptance_mapping table maps a constraint or a pair of constraints (as
-- given by the first and second-level keys of the table) to a partition name.
-- That partition name is chosen as the effective partition if that constraint
-- or pair of constraints constitutes a subset of the supplied set of
-- constraints given on the command line.

local acceptance_mapping = {
    askaprt = { askaprt = 'askaprt' },
    copy = { copy = 'copy' },
    cpu = { debug = 'debug', highmem = 'highmem', work = 'work' },
    gpu = { debug = 'gpu-dev', highmem = 'gpu-highmem', work = 'gpu' },
    ['mwa-asvocopy'] = { ['mwa-asvocopy'] = 'mwa-asvocopy' },
    quantum = { quantum = 'quantum' }
}

local function effective_acceptance_partition(constraint_argument)
    if constraint_argument == nil or acceptance_mapping == nil then return nil end

    local constraints = tokenize(constraint_argument, ',')
    for tkey, tvalue in pairs(acceptance_mapping) do
        if existsIn(constraints, tkey) then
            for ckey, cvalue in pairs(tvalue) do
                if existsIn(constraints, ckey) then return cvalue end
            end
        end
    end
    return nil
end
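--[[ Illustrative examples (a sketch, not executed; the constraint strings are
     hypothetical command-line values):

       convert_MiB('20G')  --> 20480
       convert_MiB('1t')   --> 1048576
       -- the constraint pair ('gpu', 'highmem') maps to 'gpu-highmem':
       effective_acceptance_partition('gpu,highmem')  --> 'gpu-highmem'
       -- no first-level key of acceptance_mapping matches 'highmem' alone:
       effective_acceptance_partition('highmem')      --> nil
]]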
-- Slurm CLI filter interface functions:

function slurm_cli_setup_defaults(options, early)
    --[[ Rather than just have a default SLURM_HINT in the module, which is
         hard to override, this sets the same default in a "more elegant" way.
         See SchedMD Bug 10377.
    --]]
    options['threads-per-core'] = 1
    return slurm.SUCCESS
end

function slurm_cli_post_submit(offset, jobid, stepid)
    return slurm.SUCCESS
end

function slurm_cli_pre_submit(options, offset)
    --[[ Sets the memory request if not provided.

      Relies on output from scontrol, so large formatting changes will break
      this preprocessing. It also relies on mem=0 being a way of requesting
      all the memory on a node, and on that value being stored internally as
      "0?". Finally, the script relies on DefMemPerCPU being set to a
      meaningful value, such that DefMemPerCPU * (total number of cores on a
      node) is all the memory on a node.
    --]]

    slurm_debugf('revision %s', git_revision:gsub('[$]Revision: ([^$]*)[$]', '%1'))
    slurm_debugf('date %s', git_date:gsub('[$]Date: ([^$]*)[$]', '%1'))
    slurm_debugf('options on entry: %s', slurm.json_cli_options(options))
    slurm_debugf('SLURM_JOB_PARTITION=%s', os.getenv('SLURM_JOB_PARTITION') or '')

    local function is_gpu_partition(partition)
        return partition == 'gpu' or partition == 'gpu-dev' or
               partition == 'gpu-highmem' or partition == 'mwa-gpu' or
               partition == 'quantum'
    end

    local function is_excepted_partition(partition)
        return partition == 'acceptance'
        -- or partition == 'quantum'
    end

    -- An unset option can be represented by nil, the string "-2", the string
    -- "0", or the string "unset": check all of them. (Use explicit checks if
    -- any of the above is in fact an expected possible value. Note that the
    -- options exposed by the slurm cli_filter lua API differ between the json
    -- reporting and the C-backed lua 'options' table.)
    local function is_unset(x)
        return x == nil or x == '-2' or x == 'unset' or x == '0'
    end

    -- Any option handled by a spank plugin is special. Gather these up in
    -- their own table.
    local spank_options = collate_spank_options(options)

    -- A gres option, if present, may contain multiple comma-separated
    -- key:value fields that we will need to examine.
    local gres_options = {}
    if not is_unset(options['gres']) then
        gres_options = parse_csv_tbl(options['gres'], ':', 'gres/')
    end

    -- Are we in an srun that's being invoked inside an allocation?
    local is_srun_in_allocation = options['type'] == 'srun' and os.getenv('SLURM_JOB_PARTITION') ~= nil

    -- Have any cpu resource options been passed?
    local has_explicit_cpu_request =
        not is_unset(options['cpus-per-task']) or
        not is_unset(options['cpus-per-gpu']) or
        not is_unset(options['cores-per-socket'])

    -- Have any gpu resource options been passed?
    local has_explicit_gpu_request =
        not is_unset(gres_options.gpu) or
        not is_unset(options['gpus']) or
        not is_unset(options['gpus-per-node']) or
        not is_unset(options['gpus-per-task'])

    -- Have any mem resource options been passed, excluding a request for all
    -- node memory?
    local has_all_mem_request = options['mem'] == "0?"
    local has_explicit_mem_request =
        options['mem-per-cpu'] ~= nil or
        options['mem-per-gpu'] ~= nil or
        (options['mem'] ~= nil and not has_all_mem_request)

    -- Have any gpu power options been supplied?
    local has_gpu_power_request =
        spank_options['gpu-srange'] ~= nil or
        spank_options['gpu-power-cap'] ~= nil

    -- Disregard 'user', 'mcs' possibilities.
    local is_node_exclusive = options['exclusive'] == 'exclusive'

    local partition = options['partition'] or get_default_partition_or_env()
    if partition == 'acceptance' then
        partition = effective_acceptance_partition(options.constraint) or partition
    end

    -- Retrieve node configuration for the partition (PUs per core, gres).
    local node_pu_per_core, node_gres = get_node_ppc_gres(partition)
    if node_pu_per_core == nil or node_gres == nil then
        return slurm_error("unable to retrieve node information")
    end

    local threads_to_pu_ratio = 1
    local opt_threads_per_core = tonumber(options['threads-per-core'])
    if opt_threads_per_core and opt_threads_per_core > 0 then
        threads_to_pu_ratio = opt_threads_per_core / node_pu_per_core
    end
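    -- Worked example (illustrative, assumed figures): on nodes with 2 PUs per
    -- core, --threads-per-core=1 gives threads_to_pu_ratio = 1/2, so in the
    -- non-gpu path below a DefMemPerCPU of 1790 MiB becomes
    -- mem-per-cpu = floor(1790 / 0.5) = 3580 MiB.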
    if not is_gpu_partition(partition) and not is_excepted_partition(partition) then
        -- Non-gpu partition path:

        -- Abort if any gpu power options have been given.
        if has_gpu_power_request then
            return slurm_error("GPU power control options --gpu-srange and --gpu-power-cap may only be used with an exclusive GPU allocation request")
        end

        -- Compute the correct mem-per-cpu value from the available memory and
        -- the threads-per-core option if memory has not been requested
        -- explicitly. 'pu' (processing unit) is used as the term to describe
        -- both hardware threads and what Slurm calls CPUs.
        if not has_explicit_mem_request then
            local pinfo = get_partition_info(partition)
            if pinfo == nil then
                return slurm_error("unable to retrieve partition information")
            end

            local mem_per_pu = math.floor(tonumber(pinfo.DefMemPerCPU))
            if is_node_exclusive or has_all_mem_request then
                local pu_per_node = math.floor(tonumber(pinfo.TotalCPUs)/tonumber(pinfo.TotalNodes))
                options['mem'] = math.floor(mem_per_pu * pu_per_node)
            else
                options['mem-per-cpu'] = math.floor(mem_per_pu / threads_to_pu_ratio)
            end
        end
    elseif is_gpu_partition(partition) then
        -- Gpu partition path:

        -- Gpu power options are not permitted in non-exclusive allocations.
        -- Spank options are inherited by srun invocations inside an
        -- allocation; we need to ignore them in that instance.
        if has_gpu_power_request and not is_node_exclusive and not is_srun_in_allocation then
            return slurm_error("GPU power control options --gpu-srange and --gpu-power-cap may only be used with an exclusive GPU allocation request")
        end

        local pinfo = get_partition_info(partition)
        if pinfo == nil then
            return slurm_error("unable to retrieve partition information")
        end

        local gpus_per_node = 0
        if node_gres.gpu then
            gpus_per_node = string.gsub(node_gres.gpu, "%(.*", "")
            gpus_per_node = tonumber(gpus_per_node)
        end
        if not gpus_per_node then
            return slurm_error("unable to determine GPU count from partition node information")
        end

        local max_tmp_str = nil
        local max_tmp_MiB = 0
        if node_gres.tmp then
            max_tmp_str = node_gres.tmp
            max_tmp_MiB = convert_MiB(max_tmp_str)
            if max_tmp_MiB == nil then
                return slurm_error("unable to determine size of tmp gres")
            end
        end

        local tres = pinfo.TRES
        if not tres or not tres.cpu or not tres['gres/gpu'] or not tonumber(tres['gres/gpu']) then
            return slurm_error('unable to determine cpu to gpu ratio')
        end

        -- Note that the quantum partition has Grace Arm CPUs with one thread
        -- per core, unlike the x86 Milan CPUs.
        local cpus_per_gpu = round_nearest(tonumber(tres.cpu)/tonumber(tres['gres/gpu']) * threads_to_pu_ratio)

        if has_explicit_cpu_request and not is_srun_in_allocation then
            return slurm_errorf('cannot explicitly request CPU resources for a GPU allocation; each allocated GPU provides %d cores', cpus_per_gpu)
        end

        -- The default tmp allocation is hard-coded in the job_submit.lua
        -- script, and not directly accessible from the filter.
        local default_tmp_MiB = convert_MiB('128G')
        local non_exclusive_max_tmp_MiB = math.max(0, max_tmp_MiB - (gpus_per_node - 1) * default_tmp_MiB)
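        -- Worked example (illustrative, assumed figures): with partition TRES
        -- cpu=512 and gres/gpu=8 on nodes with 2 PUs per core and
        -- --threads-per-core=1, cpus_per_gpu = round(512/8 * 1/2) = 32; with
        -- gres tmp=28T on an 8-GPU node, a non-exclusive job may request at
        -- most 28*1024 - 7*128 = 27776 GiB of tmp.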
        -- Try to get mem-per-gpu from JobDefaults? Only used for
        -- informational purposes.
        local def_mem_per_gpu = pinfo.JobDefaults and pinfo.JobDefaults.DefMemPerGPU

        if has_explicit_mem_request then
            return slurm_errorf('\nYou cannot explicitly request memory for GPU allocations\nYou asked for (per-cpu %s, per-gpu %s, mem %s)\nEach allocated GPU is allocated %s MB of memory by the system',
                options['mem-per-cpu'], options['mem-per-gpu'], options['mem'], def_mem_per_gpu or "some")
        end

        -- Ensure there is some gpu request on a gpu partition.
        if not is_node_exclusive and not has_explicit_gpu_request and not is_srun_in_allocation then
            return slurm_error('non-exclusive GPU allocations require a request for one or more GPUs')
        end

        -- Reject if the tmp request is too high.
        local tmp_request_MiB = convert_MiB(gres_options.tmp) or 0
        if not is_node_exclusive and tmp_request_MiB > non_exclusive_max_tmp_MiB then
            return slurm_errorf('non-exclusive GPU allocations may request at most %d GiB of NVMe tmp', non_exclusive_max_tmp_MiB/1024)
        end

        options['cpus-per-gpu'] = cpus_per_gpu

        if is_node_exclusive and not is_srun_in_allocation then
            gres_options.gpu = gpus_per_node
            if max_tmp_str then gres_options.tmp = max_tmp_str end
            options['gres'] = collect_csv_tbl(gres_options, ':', 'gres/')
        end
    end

    slurm_debugf('options on exit: %s', slurm.json_cli_options(options))
    return slurm.SUCCESS
end

-- Return a table of local functions for unit testing.
return {
    tokenize = tokenize,
    existsIn = existsIn,
    parse_csv_tbl = parse_csv_tbl,
    collect_csv_tbl = collect_csv_tbl,
    collate_spank_options = collate_spank_options,
    convert_MiB = convert_MiB,
    slurm_error = slurm_error,
    slurm_errorf = slurm_errorf,
    slurm_debug = slurm_debug,
    slurm_debugf = slurm_debugf,
    get_node_ppc_gres = get_node_ppc_gres,
    get_default_partition_or_env = get_default_partition_or_env,
    effective_acceptance_partition = effective_acceptance_partition,
    parse_partition_info_str = parse_partition_info_str,
    get_partition_info = get_partition_info,
}
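--[[ Illustrative use from a unit test (a sketch; the file name and the test
     values are assumed):

       local cf = dofile('cli_filter.lua')
       assert(cf.convert_MiB('2G') == 2048)
       assert(cf.existsIn({ 'a', 'b' }, 'b'))
       assert(cf.effective_acceptance_partition('cpu,work') == 'work')
]]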