Skip to content

Commit

Permalink
Merge branch 'master' into mpi4py_dev
Browse files Browse the repository at this point in the history
  • Loading branch information
alexnick83 authored Aug 4, 2023
2 parents d8c00e9 + 7171ecc commit edbc3af
Show file tree
Hide file tree
Showing 87 changed files with 3,652 additions and 1,173 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,6 @@ Reid Wahl
Yihang Luo
Alexandru Calotoiu
Phillip Lane
Samuel Martin

and other contributors listed in https://github.com/spcl/dace/graphs/contributors
3 changes: 2 additions & 1 deletion dace/cli/daceprof.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,8 @@ def make_sequential(sdfg: dace.SDFG):
for n, _ in sdfg.all_nodes_recursive():
if isinstance(n, dace.nodes.EntryNode):
sched = getattr(n, 'schedule', False)
if sched == dace.ScheduleType.CPU_Multicore or sched == dace.ScheduleType.Default:
if sched in (dace.ScheduleType.CPU_Multicore, dace.ScheduleType.CPU_Persistent,
dace.ScheduleType.Default):
n.schedule = dace.ScheduleType.Sequential

registered.append(dace.hooks.register_sdfg_call_hook(before_hook=make_sequential))
Expand Down
2 changes: 1 addition & 1 deletion dace/codegen/instrumentation/likwid.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class LIKWIDInstrumentationCPU(InstrumentationProvider):
the Likwid tool.
"""

perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential]
perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential]

def __init__(self):
self._likwid_used = False
Expand Down
6 changes: 3 additions & 3 deletions dace/codegen/instrumentation/papi.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class PAPIInstrumentation(InstrumentationProvider):

_counters: Optional[Set[str]] = None

perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential]
perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential]

def __init__(self):
self._papi_used = False
Expand Down Expand Up @@ -350,7 +350,7 @@ def on_consume_entry(self, sdfg, state, node, outer_stream, inner_stream):

@staticmethod
def perf_get_supersection_start_string(node, dfg, unified_id):
if node.map.schedule == dtypes.ScheduleType.CPU_Multicore:
if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent):
# Nested SuperSections are not supported. Therefore, we mark the
# outermost section and disallow internal scopes from creating it.
if not hasattr(node.map, '_can_be_supersection_start'):
Expand All @@ -360,7 +360,7 @@ def perf_get_supersection_start_string(node, dfg, unified_id):
for x in children:
if not hasattr(x.map, '_can_be_supersection_start'):
x.map._can_be_supersection_start = True
if x.map.schedule == dtypes.ScheduleType.CPU_Multicore:
if x.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent):

x.map._can_be_supersection_start = False
elif x.map.schedule == dtypes.ScheduleType.Sequential:
Expand Down
15 changes: 12 additions & 3 deletions dace/codegen/targets/cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -1082,11 +1082,20 @@ def _subscript_expr(self, slicenode: ast.AST, target: str) -> symbolic.SymbolicT
]

if isinstance(visited_slice, ast.Tuple):
if len(strides) != len(visited_slice.elts):
# If slice is multi-dimensional and writes to array with more than 1 elements, then:
# - Assume this is indirection (?)
# - Soft-squeeze the slice (remove unit-modes) to match the treatment of the strides above.
if target not in self.constants:
desc = self.sdfg.arrays[dname]
if isinstance(desc, data.Array) and data._prod(desc.shape) != 1:
elts = [e for i, e in enumerate(visited_slice.elts) if desc.shape[i] != 1]
else:
elts = visited_slice.elts
if len(strides) != len(elts):
raise SyntaxError('Invalid number of dimensions in expression (expected %d, '
'got %d)' % (len(strides), len(visited_slice.elts)))
'got %d)' % (len(strides), len(elts)))

return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(visited_slice.elts, strides))
return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(elts, strides))

if len(strides) != 1:
raise SyntaxError('Missing dimensions in expression (expected %d, got one)' % len(strides))
Expand Down
140 changes: 83 additions & 57 deletions dace/codegen/targets/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from dace.sdfg import nodes, utils as sdutils
from dace.sdfg import (ScopeSubgraphView, SDFG, scope_contains_scope, is_array_stream_view, NodeNotExpandedError,
dynamic_map_inputs, local_transients)
from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga
from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga, is_in_scope
from typing import Union
from dace.codegen.targets import fpga

Expand Down Expand Up @@ -79,7 +79,9 @@ def __init__(self, frame_codegen, sdfg):

# Register dispatchers
dispatcher.register_node_dispatcher(self)
dispatcher.register_map_dispatcher([dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential], self)
dispatcher.register_map_dispatcher(
[dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential],
self)

cpu_storage = [dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.Register]
dispatcher.register_array_dispatcher(cpu_storage, self)
Expand Down Expand Up @@ -222,7 +224,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de
# We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if
# `nodedesc` is a View and `dfg` is None.
if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols):
raise NotImplementedError("The declare_array method should only be used for variables "
raise NotImplementedError("The declare_array method should only be used for variables "
"that must have their declaration and allocation separate.")

name = node.data
Expand Down Expand Up @@ -1714,66 +1716,87 @@ def _generate_MapEntry(

# TODO: Refactor to generate_scope_preamble once a general code
# generator (that CPU inherits from) is implemented
if node.map.schedule == dtypes.ScheduleType.CPU_Multicore:
map_header += "#pragma omp parallel for"
if node.map.omp_schedule != dtypes.OMPScheduleType.Default:
schedule = " schedule("
if node.map.omp_schedule == dtypes.OMPScheduleType.Static:
schedule += "static"
elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic:
schedule += "dynamic"
elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided:
schedule += "guided"
if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent):
# OpenMP header
in_persistent = False
if node.map.schedule == dtypes.ScheduleType.CPU_Multicore:
in_persistent = is_in_scope(sdfg, state_dfg, node, [dtypes.ScheduleType.CPU_Persistent])
if in_persistent:
# If already in a #pragma omp parallel, no need to use it twice
map_header += "#pragma omp for"
# TODO(later): barriers and map_header += " nowait"
else:
raise ValueError("Unknown OpenMP schedule type")
if node.map.omp_chunk_size > 0:
schedule += f", {node.map.omp_chunk_size}"
schedule += ")"
map_header += schedule
if node.map.omp_num_threads > 0:
map_header += f" num_threads({node.map.omp_num_threads})"
if node.map.collapse > 1:
map_header += "#pragma omp parallel for"

elif node.map.schedule == dtypes.ScheduleType.CPU_Persistent:
map_header += "#pragma omp parallel"

# OpenMP schedule properties
if not in_persistent:
if node.map.omp_schedule != dtypes.OMPScheduleType.Default:
schedule = " schedule("
if node.map.omp_schedule == dtypes.OMPScheduleType.Static:
schedule += "static"
elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic:
schedule += "dynamic"
elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided:
schedule += "guided"
else:
raise ValueError("Unknown OpenMP schedule type")
if node.map.omp_chunk_size > 0:
schedule += f", {node.map.omp_chunk_size}"
schedule += ")"
map_header += schedule

if node.map.omp_num_threads > 0:
map_header += f" num_threads({node.map.omp_num_threads})"

# OpenMP nested loop properties
if node.map.schedule == dtypes.ScheduleType.CPU_Multicore and node.map.collapse > 1:
map_header += ' collapse(%d)' % node.map.collapse
# Loop over outputs, add OpenMP reduction clauses to detected cases
# TODO: set up register outside loop
# exit_node = dfg.exit_node(node)
reduction_stmts = []
# for outedge in dfg.in_edges(exit_node):
# if (isinstance(outedge.src, nodes.CodeNode)
# and outedge.data.wcr is not None):
# redt = operations.detect_reduction_type(outedge.data.wcr)
# if redt != dtypes.ReductionType.Custom:
# reduction_stmts.append('reduction({typ}:{var})'.format(
# typ=_REDUCTION_TYPE_TO_OPENMP[redt],
# var=outedge.src_conn))
# reduced_variables.append(outedge)

map_header += " %s\n" % ", ".join(reduction_stmts)

# TODO: Explicit map unroller
if node.map.unroll:
if node.map.schedule == dtypes.ScheduleType.CPU_Multicore:
raise ValueError("A Multicore CPU map cannot be unrolled (" + node.map.label + ")")

constsize = all([not symbolic.issymbolic(v, sdfg.constants) for r in node.map.range for v in r])
if node.map.unroll:
if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent):
raise ValueError("An OpenMP map cannot be unrolled (" + node.map.label + ")")

# Nested loops
result.write(map_header, sdfg, state_id, node)
for i, r in enumerate(node.map.range):
# var = '__DACEMAP_%s_%d' % (node.map.label, i)
var = map_params[i]
begin, end, skip = r

if node.map.unroll:
result.write("#pragma unroll", sdfg, state_id, node)
if node.map.schedule == dtypes.ScheduleType.CPU_Persistent:
result.write('{\n', sdfg, state_id, node)

# Find if bounds are used within the scope
scope = state_dfg.scope_subgraph(node, False, False)
fsyms = scope.free_symbols
# Include external edges
for n in scope.nodes():
for e in state_dfg.all_edges(n):
fsyms |= e.data.free_symbols
fsyms = set(map(str, fsyms))

ntid_is_used = '__omp_num_threads' in fsyms
tid_is_used = node.map.params[0] in fsyms
if tid_is_used or ntid_is_used:
function_stream.write('#include <omp.h>', sdfg, state_id, node)
if tid_is_used:
result.write(f'auto {node.map.params[0]} = omp_get_thread_num();', sdfg, state_id, node)
if ntid_is_used:
result.write(f'auto __omp_num_threads = omp_get_num_threads();', sdfg, state_id, node)
else:
# Emit nested loops
for i, r in enumerate(node.map.range):
var = map_params[i]
begin, end, skip = r

result.write(
"for (auto %s = %s; %s < %s; %s += %s) {\n" %
(var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)),
sdfg,
state_id,
node,
)
if node.map.unroll:
result.write("#pragma unroll", sdfg, state_id, node)

result.write(
"for (auto %s = %s; %s < %s; %s += %s) {\n" %
(var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)),
sdfg,
state_id,
node,
)

callsite_stream.write(inner_stream.getvalue())

Expand Down Expand Up @@ -1803,8 +1826,11 @@ def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite

self.generate_scope_postamble(sdfg, dfg, state_id, function_stream, outer_stream, callsite_stream)

for _ in map_node.map.range:
if map_node.map.schedule == dtypes.ScheduleType.CPU_Persistent:
result.write("}", sdfg, state_id, node)
else:
for _ in map_node.map.range:
result.write("}", sdfg, state_id, node)

result.write(outer_stream.getvalue())

Expand Down
27 changes: 24 additions & 3 deletions dace/codegen/targets/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -1306,10 +1306,31 @@ def generate_devicelevel_state(self, sdfg, state, function_stream, callsite_stre
for c in components:

has_map = any(isinstance(node, dace.nodes.MapEntry) for node in c.nodes())
# If a global is modified, execute once per global state,
# if a shared memory element is modified, execute once per block,
# if a local scalar is modified, execute in every thread.
if not has_map:
callsite_stream.write("if (blockIdx.x == 0 "
"&& threadIdx.x == 0) "
"{ // sub-graph begin", sdfg, state.node_id)
written_nodes = [n for n in c if state.in_degree(n) > 0 and isinstance(n, dace.nodes.AccessNode)]

# The order of the branching below matters - it reduces the scope with every detected write
write_scope = 'thread' # General case acts in every thread
if any(sdfg.arrays[n.data].storage in (dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned)
for n in written_nodes):
write_scope = 'grid'
if any(sdfg.arrays[n.data].storage == dtypes.StorageType.GPU_Shared for n in written_nodes):
write_scope = 'block'
if any(sdfg.arrays[n.data].storage == dtypes.StorageType.Register for n in written_nodes):
write_scope = 'thread'

if write_scope == 'grid':
callsite_stream.write("if (blockIdx.x == 0 "
"&& threadIdx.x == 0) "
"{ // sub-graph begin", sdfg, state.node_id)
elif write_scope == 'block':
callsite_stream.write("if (threadIdx.x == 0) "
"{ // sub-graph begin", sdfg, state.node_id)
else:
callsite_stream.write("{ // subgraph begin", sdfg, state.node_id)
else:
callsite_stream.write("{ // subgraph begin", sdfg, state.node_id)

Expand Down
5 changes: 4 additions & 1 deletion dace/codegen/targets/framecode.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,10 @@ def free_symbols(self, obj: Any):
k = id(obj)
if k in self.fsyms:
return self.fsyms[k]
result = obj.free_symbols
if hasattr(obj, 'used_symbols'):
result = obj.used_symbols(all_symbols=False)
else:
result = obj.free_symbols
self.fsyms[k] = result
return result

Expand Down
48 changes: 33 additions & 15 deletions dace/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,14 +243,26 @@ def as_arg(self, with_types=True, for_call=False, name=None):
"""Returns a string for a C++ function signature (e.g., `int *A`). """
raise NotImplementedError

def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]:
"""
Returns a set of symbols that are used by this data descriptor.
:param all_symbols: Include not-strictly-free symbols that are used by this data descriptor,
e.g., shape and size of a global array.
:return: A set of symbols that are used by this data descriptor. NOTE: The results are symbolic
rather than a set of strings.
"""
result = set()
if self.transient or all_symbols:
for s in self.shape:
if isinstance(s, sp.Basic):
result |= set(s.free_symbols)
return result

@property
def free_symbols(self) -> Set[symbolic.SymbolicType]:
""" Returns a set of undefined symbols in this data descriptor. """
result = set()
for s in self.shape:
if isinstance(s, sp.Basic):
result |= set(s.free_symbols)
return result
return self.used_symbols(all_symbols=True)

def __repr__(self):
return 'Abstract Data Container, DO NOT USE'
Expand Down Expand Up @@ -689,20 +701,23 @@ def as_arg(self, with_types=True, for_call=False, name=None):
def sizes(self):
return [d.name if isinstance(d, symbolic.symbol) else str(d) for d in self.shape]

@property
def free_symbols(self):
result = super().free_symbols
def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]:
result = super().used_symbols(all_symbols)
for s in self.strides:
if isinstance(s, sp.Expr):
result |= set(s.free_symbols)
if isinstance(self.total_size, sp.Expr):
result |= set(self.total_size.free_symbols)
for o in self.offset:
if isinstance(o, sp.Expr):
result |= set(o.free_symbols)

if self.transient or all_symbols:
if isinstance(self.total_size, sp.Expr):
result |= set(self.total_size.free_symbols)
return result

@property
def free_symbols(self):
return self.used_symbols(all_symbols=True)

def _set_shape_dependent_properties(self, shape, strides, total_size, offset):
"""
Used to set properties which depend on the shape of the array
Expand Down Expand Up @@ -890,17 +905,20 @@ def covers_range(self, rng):

return True

@property
def free_symbols(self):
result = super().free_symbols
if isinstance(self.buffer_size, sp.Expr):
def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]:
result = super().used_symbols(all_symbols)
if (self.transient or all_symbols) and isinstance(self.buffer_size, sp.Expr):
result |= set(self.buffer_size.free_symbols)
for o in self.offset:
if isinstance(o, sp.Expr):
result |= set(o.free_symbols)

return result

@property
def free_symbols(self):
return self.used_symbols(all_symbols=True)


@make_properties
class View(Array):
Expand Down
Loading

0 comments on commit edbc3af

Please sign in to comment.