Skip to content

Commit

Permalink
Ready to test job_srun() (ref #194)
Browse files Browse the repository at this point in the history
  • Loading branch information
j-woz committed Oct 7, 2022
1 parent ffaab7d commit 16df822
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 19 deletions.
4 changes: 2 additions & 2 deletions turbine/code/export/job.swift
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
// Various system-level configurations for application jobs.

// Swift/T declaration of job_srun(), which yields an integer status.
// NOTE(review): this diff view shows both the previous signature
// (cores_per_job, procs_per_job, cmd_line) and the new one
// (cores_per_node, cores_per_job, procs_per_job, bind, cmd_line).
// The three trailing strings name "turbine", "0.0" and "job_srun_tcl" --
// presumably the Tcl package, its version, and the implementing Tcl proc;
// confirm against the Swift/T extension-function syntax.
@par @dispatch=WORKER
(int status) job_srun(int cores_per_job, int procs_per_job,
string cmd_line[])
(int status) job_srun(int cores_per_node, int cores_per_job, int procs_per_job,
boolean bind, string cmd_line[])
"turbine" "0.0" "job_srun_tcl";
30 changes: 29 additions & 1 deletion turbine/code/lib/functions.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -592,9 +592,37 @@ namespace eval turbine {
}
return $result
}

proc contig { start count { step 1 } } {
  # Return the list: start, start+step, start+2*step, ...
  # containing exactly count entries (empty when count <= 0).
  # step defaults to 1; values are advanced with incr, so start and
  # step must be integers whenever an advance actually happens.
  set sequence [ list ]
  for { set k 0 ; set current $start } \
      { $k < $count } \
      { incr k ; incr current $step } {
    lappend sequence $current
  }
  return $sequence
}

# Break list L into count equal-size chunks (of size s)
# where s = [llength $L] / count using integer division.
# Returns a list of count sub-lists, taken from L in order.
# If the length of L is not a multiple of count, the trailing
# (length % count) elements are not placed in any chunk.
# If count exceeds the length of L, s is 0 and every chunk is empty.
# count must be a non-zero integer: 0 raises a divide-by-zero error,
# and a negative count yields an empty result.
proc fragment { L count } {
  set result [ list ]
  set n [ llength $L ]
  # Braced so the expression is compiled and operands are not re-substituted
  set s [ expr { $n / $count } ]
  for { set c 0 } { $c < $count } { incr c } {
    # Chunk c covers indices [c*s, c*s + s - 1];
    # lrange returns an empty list when s is 0 (last < first)
    set first [ expr { $c * $s } ]
    lappend result [ lrange $L $first [ expr { $first + $s - 1 } ] ]
  }
  return $result
}
}

# Local Variables:
# mode: tcl
# tcl-indent-level: 4
# tcl-indent-level: 2
# End:
99 changes: 83 additions & 16 deletions turbine/code/lib/job.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,22 @@ namespace eval turbine {

# Entry point named by the Swift-level job_srun() declaration.
# outputs: list whose element 0 is the handle for the exit code.
# inputs:  list of input handles, unpacked positionally below.
# Registers a rule on the inputs that invokes
# turbine::job_srun_tcl_body with the unpacked handles, using rule type
# $turbine::WORK.
# NOTE(review): this text is a rendered diff -- both the older 3-input
# unpacking (cpj ppj cmd_line) and the newer 5-input unpacking
# (cpn cpj ppj bind cmd_line) appear; the newer lines are the later ones.
proc job_srun_tcl { outputs inputs } {
set exit_code [ lindex $outputs 0 ]
set cpj [ lindex $inputs 0 ]
set ppj [ lindex $inputs 1 ]
set cmd_line [ lindex $inputs 2 ]
rule $inputs "turbine::job_srun_tcl_body $exit_code $cpj $ppj $cmd_line" \
set cpn [ lindex $inputs 0 ]
set cpj [ lindex $inputs 1 ]
set ppj [ lindex $inputs 2 ]
set bind [ lindex $inputs 3 ]
set cmd_line [ lindex $inputs 4 ]
rule $inputs \
"turbine::job_srun_tcl_body $exit_code $cpn $cpj $ppj $bind $cmd_line" \
type $turbine::WORK
}

proc job_srun_tcl_body { exit_code cpj ppj cmd_line } {
proc job_srun_tcl_body { exit_code cpn cpj ppj bind cmd_line } {
# Retrieve data (decr?)
set cpj_value [ retrieve_integer $cpj ]
set ppj_value [ retrieve_integer $ppj ]
set cpn_value [ retrieve_integer $cpn ]
set cpj_value [ retrieve_integer $cpj ]
set ppj_value [ retrieve_integer $ppj ]
set bind_value [ retrieve_integer $bind ]
# Unpack command line
set D [ adlb::enumerate $cmd_line dict all 0 ]
set cmd_value [ list ]
Expand All @@ -25,27 +30,89 @@ namespace eval turbine {
lappend cmd_value [ dict get $D $k ]
}
# Run the user code
set exit_code_value [ job_srun_impl $cpj_value $ppj_value $cmd_value ]
set exit_code_value \
[ job_srun_impl $cpn_value $cpj_value $ppj_value $bind_value $cmd_value ]
# Store result
store_integer $exit_code $exit_code_value
}

# Launch the user command under srun and echo its output line by line.
# cpn, cpj, ppj: integers forwarded to bind_mask_cpu when binding is on;
#                ppj is also passed to srun as -n.
# bind:          used directly as the condition of an if (Tcl boolean).
# cmd:           the command line to run, interpolated into the pipeline.
# Returns 0 when the try body finishes without a Tcl error, 1 otherwise.
# NOTE(review): this text is a rendered diff -- two proc headers and both
# the older and newer bodies of the try/on-error block are interleaved.
proc job_srun_impl { cpj ppj cmd } {
proc job_srun_impl { cpn cpj ppj bind cmd } {
# Setup and run the job. Return a unix exit code.
# (The only values returned below are 0 and 1.)
global env
puts "turbine: srun: job_srun ..."

# Optional CPU binding: either a --cpu-bind option string or nothing
if $bind {
set cpu_bind [ bind_mask_cpu $cpn $cpj $ppj ]
} else {
set cpu_bind ""
}

# Log the command about to be run and the working directory
puts "turbine: srun: job_srun -n $ppj -N 1 $cpu_bind $cmd"
puts "turbine: srun: in PWD: $env(PWD)"
try {
puts "turbine: srun: exec: srun -n $ppj $cmd"
set fp [ open "|srun -n $ppj $cmd" "r" ]
show fp
# Run the user job! (with pipe to capture output)
# 2>@1 sends the child's stderr to the same pipe as its stdout
set fp [ open "|srun -n $ppj -N 1 $cpu_bind $cmd 2>@1" "r" ]
# Relay every output line, prefixed with "srun: "
while { [ gets $fp line ] >= 0 } {
puts "srun: $line"
}
# NOTE(review): close on a command pipeline is expected to raise a Tcl
# error when srun exits non-zero, which would reach the handler below
# -- confirm against the Tcl open/close documentation.
close $fp
} on error e {
# Any Tcl error from open, gets or close lands here
puts "turbine: srun failed!"
puts "turbine: srun error message begin:"
puts $e
puts "turbine: srun error message end."
job_srun_error $e
return 1
}
return 0
}

proc bind_mask_cpu { cpn cpj ppj } {
  # Set up the SLURM cpu binding
  # Returns the option string
  #   --cpu-bind=verbose,mask_cpu:<mask>,<mask>,...
  # with one mask (as produced by list2mask) per chunk from fragment,
  # i.e. ppj masks.
  # cpn, cpj, ppj: integers (cores per node, cores per job and
  #                processes per job, going by the caller's names).
  # Reads env(ADLB_RANK_OFFSET) and env(PPN); both must be set.
  # NOTE(review): presumably ADLB_RANK_OFFSET is this rank's index on
  # its node and PPN the number of ranks per node -- confirm.
  global env
  set cpu_bind "--cpu-bind=verbose,mask_cpu:"
  set offset $env(ADLB_RANK_OFFSET)
  set ppn $env(PPN)
  show offset ppn cpn cpj ppj

  # Size of the slice of cores given to each of the ppn ranks
  set cpj_max [ expr { $cpn / $ppn } ]
  show cpj_max
  # First core id of the slice selected by offset
  set start [ expr { $cpj_max * $offset } ]
  # Spread the job's cpj cores evenly over the slice.
  # NOTE(review): integer division -- step is 0 when cpj > cpj_max,
  # in which case every selected core id equals start.
  set step [ expr { $cpj_max / $cpj } ]
  set S2 [ contig $start $cpj $step ]
  show step S2
  # One group of core ids per process
  set K [ fragment $S2 $ppj ]
  show K

  # Convert each group of core ids to a mask string
  set masks [ list ]
  foreach chunk $K {
    set mask [ list2mask $chunk ]
    show mask
    lappend masks $mask
  }
  show masks
  append cpu_bind [ join $masks "," ]
  return $cpu_bind
}

proc job_srun_error { e } {
  # Print a failure banner for srun on stdout, with the error
  # message e bracketed by begin/end marker lines.
  set report [ list \
      "turbine: srun failed!" \
      "turbine: srun error message begin:" \
      $e \
      "turbine: srun error message end." ]
  foreach text $report {
    puts $text
  }
}

proc list2mask { L } {
  # Convert a list of bit positions (CPU ids) into a hexadecimal mask
  # string such as 0xF for {0 1 2 3}.  An empty list gives 0x0.
  # L: list of integers; bit i of the mask is set for each i in L.
  # Bits are combined with bitwise OR so that a repeated id sets its
  # bit once instead of carrying into the next higher bit.
  set A 0
  foreach i $L {
    set A [ expr { $A | (2 ** $i) } ]
  }
  # Trace output: the mask in decimal
  puts $A
  # The ll size modifier formats the value without truncation; a bare
  # %X is limited to the int() range in Tcl 8.5/8.6, which would
  # corrupt masks for ids at or beyond the machine word size.
  return [ format "0x%llX" $A ]
}
}

0 comments on commit 16df822

Please sign in to comment.