Merge branch 'master' into mpi4py_dev

spcl · Aug 4, 2023 · edbc3af · edbc3af
2 parents d8c00e9 + 7171ecc
commit edbc3af
Show file tree

Hide file tree

Showing 87 changed files with 3,652 additions and 1,173 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -36,5 +36,6 @@ Reid Wahl
 Yihang Luo
 Alexandru Calotoiu
 Phillip Lane
+Samuel Martin
 
 and other contributors listed in https://github.com/spcl/dace/graphs/contributors
diff --git a/dace/cli/daceprof.py b/dace/cli/daceprof.py
@@ -227,7 +227,8 @@ def make_sequential(sdfg: dace.SDFG):
             for n, _ in sdfg.all_nodes_recursive():
                 if isinstance(n, dace.nodes.EntryNode):
                     sched = getattr(n, 'schedule', False)
-                    if sched == dace.ScheduleType.CPU_Multicore or sched == dace.ScheduleType.Default:
+                    if sched in (dace.ScheduleType.CPU_Multicore, dace.ScheduleType.CPU_Persistent,
+                                 dace.ScheduleType.Default):
                         n.schedule = dace.ScheduleType.Sequential
 
         registered.append(dace.hooks.register_sdfg_call_hook(before_hook=make_sequential))

diff --git a/dace/codegen/instrumentation/likwid.py b/dace/codegen/instrumentation/likwid.py
@@ -69,7 +69,7 @@ class LIKWIDInstrumentationCPU(InstrumentationProvider):
         the Likwid tool.
     """
 
-    perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential]
+    perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential]
 
     def __init__(self):
         self._likwid_used = False

diff --git a/dace/codegen/instrumentation/papi.py b/dace/codegen/instrumentation/papi.py
@@ -43,7 +43,7 @@ class PAPIInstrumentation(InstrumentationProvider):
 
     _counters: Optional[Set[str]] = None
 
-    perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential]
+    perf_whitelist_schedules = [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential]
 
     def __init__(self):
         self._papi_used = False
@@ -350,7 +350,7 @@ def on_consume_entry(self, sdfg, state, node, outer_stream, inner_stream):
 
     @staticmethod
     def perf_get_supersection_start_string(node, dfg, unified_id):
-        if node.map.schedule == dtypes.ScheduleType.CPU_Multicore:
+        if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent):
             # Nested SuperSections are not supported. Therefore, we mark the
             # outermost section and disallow internal scopes from creating it.
             if not hasattr(node.map, '_can_be_supersection_start'):
@@ -360,7 +360,7 @@ def perf_get_supersection_start_string(node, dfg, unified_id):
             for x in children:
                 if not hasattr(x.map, '_can_be_supersection_start'):
                     x.map._can_be_supersection_start = True
-                if x.map.schedule == dtypes.ScheduleType.CPU_Multicore:
+                if x.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent):
 
                     x.map._can_be_supersection_start = False
                 elif x.map.schedule == dtypes.ScheduleType.Sequential:

diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py
@@ -1082,11 +1082,20 @@ def _subscript_expr(self, slicenode: ast.AST, target: str) -> symbolic.SymbolicT
                 ]
 
         if isinstance(visited_slice, ast.Tuple):
-            if len(strides) != len(visited_slice.elts):
+            # If slice is multi-dimensional and writes to array with more than 1 elements, then:
+            # - Assume this is indirection (?)
+            # - Soft-squeeze the slice (remove unit-modes) to match the treatment of the strides above.
+            if target not in self.constants:
+                desc = self.sdfg.arrays[dname]
+                if isinstance(desc, data.Array) and data._prod(desc.shape) != 1:
+                    elts = [e for i, e in enumerate(visited_slice.elts) if desc.shape[i] != 1]
+            else:
+                elts = visited_slice.elts
+            if len(strides) != len(elts):
                 raise SyntaxError('Invalid number of dimensions in expression (expected %d, '
-                                  'got %d)' % (len(strides), len(visited_slice.elts)))
+                                  'got %d)' % (len(strides), len(elts)))
 
-            return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(visited_slice.elts, strides))
+            return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(elts, strides))
 
         if len(strides) != 1:
             raise SyntaxError('Missing dimensions in expression (expected %d, got one)' % len(strides))

diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
@@ -18,7 +18,7 @@
 from dace.sdfg import nodes, utils as sdutils
 from dace.sdfg import (ScopeSubgraphView, SDFG, scope_contains_scope, is_array_stream_view, NodeNotExpandedError,
                        dynamic_map_inputs, local_transients)
-from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga
+from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga, is_in_scope
 from typing import Union
 from dace.codegen.targets import fpga
 
@@ -79,7 +79,9 @@ def __init__(self, frame_codegen, sdfg):
 
         # Register dispatchers
         dispatcher.register_node_dispatcher(self)
-        dispatcher.register_map_dispatcher([dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.Sequential], self)
+        dispatcher.register_map_dispatcher(
+            [dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent, dtypes.ScheduleType.Sequential],
+            self)
 
         cpu_storage = [dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.Register]
         dispatcher.register_array_dispatcher(cpu_storage, self)
@@ -222,7 +224,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de
         # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if
         # `nodedesc` is a View and `dfg` is None.
         if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols):
-                raise NotImplementedError("The declare_array method should only be used for variables "
+            raise NotImplementedError("The declare_array method should only be used for variables "
                                       "that must have their declaration and allocation separate.")
 
         name = node.data
@@ -1714,66 +1716,87 @@ def _generate_MapEntry(
 
         # TODO: Refactor to generate_scope_preamble once a general code
         #  generator (that CPU inherits from) is implemented
-        if node.map.schedule == dtypes.ScheduleType.CPU_Multicore:
-            map_header += "#pragma omp parallel for"
-            if node.map.omp_schedule != dtypes.OMPScheduleType.Default:
-                schedule = " schedule("
-                if node.map.omp_schedule == dtypes.OMPScheduleType.Static:
-                    schedule += "static"
-                elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic:
-                    schedule += "dynamic"
-                elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided:
-                    schedule += "guided"
+        if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent):
+            # OpenMP header
+            in_persistent = False
+            if node.map.schedule == dtypes.ScheduleType.CPU_Multicore:
+                in_persistent = is_in_scope(sdfg, state_dfg, node, [dtypes.ScheduleType.CPU_Persistent])
+                if in_persistent:
+                    # If already in a #pragma omp parallel, no need to use it twice
+                    map_header += "#pragma omp for"
+                    # TODO(later): barriers and map_header += " nowait"
                 else:
-                    raise ValueError("Unknown OpenMP schedule type")
-                if node.map.omp_chunk_size > 0:
-                    schedule += f", {node.map.omp_chunk_size}"
-                schedule += ")"
-                map_header += schedule
-            if node.map.omp_num_threads > 0:
-                map_header += f" num_threads({node.map.omp_num_threads})"
-            if node.map.collapse > 1:
+                    map_header += "#pragma omp parallel for"
+
+            elif node.map.schedule == dtypes.ScheduleType.CPU_Persistent:
+                map_header += "#pragma omp parallel"
+
+            # OpenMP schedule properties
+            if not in_persistent:
+                if node.map.omp_schedule != dtypes.OMPScheduleType.Default:
+                    schedule = " schedule("
+                    if node.map.omp_schedule == dtypes.OMPScheduleType.Static:
+                        schedule += "static"
+                    elif node.map.omp_schedule == dtypes.OMPScheduleType.Dynamic:
+                        schedule += "dynamic"
+                    elif node.map.omp_schedule == dtypes.OMPScheduleType.Guided:
+                        schedule += "guided"
+                    else:
+                        raise ValueError("Unknown OpenMP schedule type")
+                    if node.map.omp_chunk_size > 0:
+                        schedule += f", {node.map.omp_chunk_size}"
+                    schedule += ")"
+                    map_header += schedule
+
+                if node.map.omp_num_threads > 0:
+                    map_header += f" num_threads({node.map.omp_num_threads})"
+
+            # OpenMP nested loop properties
+            if node.map.schedule == dtypes.ScheduleType.CPU_Multicore and node.map.collapse > 1:
                 map_header += ' collapse(%d)' % node.map.collapse
-            # Loop over outputs, add OpenMP reduction clauses to detected cases
-            # TODO: set up register outside loop
-            # exit_node = dfg.exit_node(node)
-            reduction_stmts = []
-            # for outedge in dfg.in_edges(exit_node):
-            #    if (isinstance(outedge.src, nodes.CodeNode)
-            #            and outedge.data.wcr is not None):
-            #        redt = operations.detect_reduction_type(outedge.data.wcr)
-            #        if redt != dtypes.ReductionType.Custom:
-            #            reduction_stmts.append('reduction({typ}:{var})'.format(
-            #                typ=_REDUCTION_TYPE_TO_OPENMP[redt],
-            #                var=outedge.src_conn))
-            #            reduced_variables.append(outedge)
-
-            map_header += " %s\n" % ", ".join(reduction_stmts)
-
-        # TODO: Explicit map unroller
-        if node.map.unroll:
-            if node.map.schedule == dtypes.ScheduleType.CPU_Multicore:
-                raise ValueError("A Multicore CPU map cannot be unrolled (" + node.map.label + ")")
 
-        constsize = all([not symbolic.issymbolic(v, sdfg.constants) for r in node.map.range for v in r])
+        if node.map.unroll:
+            if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent):
+                raise ValueError("An OpenMP map cannot be unrolled (" + node.map.label + ")")
 
-        # Nested loops
         result.write(map_header, sdfg, state_id, node)
-        for i, r in enumerate(node.map.range):
-            # var = '__DACEMAP_%s_%d' % (node.map.label, i)
-            var = map_params[i]
-            begin, end, skip = r
 
-            if node.map.unroll:
-                result.write("#pragma unroll", sdfg, state_id, node)
+        if node.map.schedule == dtypes.ScheduleType.CPU_Persistent:
+            result.write('{\n', sdfg, state_id, node)
+
+            # Find if bounds are used within the scope
+            scope = state_dfg.scope_subgraph(node, False, False)
+            fsyms = scope.free_symbols
+            # Include external edges
+            for n in scope.nodes():
+                for e in state_dfg.all_edges(n):
+                    fsyms |= e.data.free_symbols
+            fsyms = set(map(str, fsyms))
+
+            ntid_is_used = '__omp_num_threads' in fsyms
+            tid_is_used = node.map.params[0] in fsyms
+            if tid_is_used or ntid_is_used:
+                function_stream.write('#include <omp.h>', sdfg, state_id, node)
+            if tid_is_used:
+                result.write(f'auto {node.map.params[0]} = omp_get_thread_num();', sdfg, state_id, node)
+            if ntid_is_used:
+                result.write(f'auto __omp_num_threads = omp_get_num_threads();', sdfg, state_id, node)
+        else:
+            # Emit nested loops
+            for i, r in enumerate(node.map.range):
+                var = map_params[i]
+                begin, end, skip = r
 
-            result.write(
-                "for (auto %s = %s; %s < %s; %s += %s) {\n" %
-                (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)),
-                sdfg,
-                state_id,
-                node,
-            )
+                if node.map.unroll:
+                    result.write("#pragma unroll", sdfg, state_id, node)
+
+                result.write(
+                    "for (auto %s = %s; %s < %s; %s += %s) {\n" %
+                    (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)),
+                    sdfg,
+                    state_id,
+                    node,
+                )
 
         callsite_stream.write(inner_stream.getvalue())
 
@@ -1803,8 +1826,11 @@ def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite
 
         self.generate_scope_postamble(sdfg, dfg, state_id, function_stream, outer_stream, callsite_stream)
 
-        for _ in map_node.map.range:
+        if map_node.map.schedule == dtypes.ScheduleType.CPU_Persistent:
             result.write("}", sdfg, state_id, node)
+        else:
+            for _ in map_node.map.range:
+                result.write("}", sdfg, state_id, node)
 
         result.write(outer_stream.getvalue())
 

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
@@ -1306,10 +1306,31 @@ def generate_devicelevel_state(self, sdfg, state, function_stream, callsite_stre
             for c in components:
 
                 has_map = any(isinstance(node, dace.nodes.MapEntry) for node in c.nodes())
+                # If a global is modified, execute once per global state,
+                # if a shared memory element is modified, execute once per block,
+                # if a local scalar is modified, execute in every thread.
                 if not has_map:
-                    callsite_stream.write("if (blockIdx.x == 0 "
-                                          "&& threadIdx.x == 0) "
-                                          "{  // sub-graph begin", sdfg, state.node_id)
+                    written_nodes = [n for n in c if state.in_degree(n) > 0 and isinstance(n, dace.nodes.AccessNode)]
+
+                    # The order of the branching below matters - it reduces the scope with every detected write
+                    write_scope = 'thread'  # General case acts in every thread
+                    if any(sdfg.arrays[n.data].storage in (dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned)
+                           for n in written_nodes):
+                        write_scope = 'grid'
+                    if any(sdfg.arrays[n.data].storage == dtypes.StorageType.GPU_Shared for n in written_nodes):
+                        write_scope = 'block'
+                    if any(sdfg.arrays[n.data].storage == dtypes.StorageType.Register for n in written_nodes):
+                        write_scope = 'thread'
+
+                    if write_scope == 'grid':
+                        callsite_stream.write("if (blockIdx.x == 0 "
+                                            "&& threadIdx.x == 0) "
+                                            "{  // sub-graph begin", sdfg, state.node_id)
+                    elif write_scope == 'block':
+                        callsite_stream.write("if (threadIdx.x == 0) "
+                                            "{  // sub-graph begin", sdfg, state.node_id)
+                    else:
+                        callsite_stream.write("{  // subgraph begin", sdfg, state.node_id)
                 else:
                     callsite_stream.write("{  // subgraph begin", sdfg, state.node_id)
 

diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py
@@ -82,7 +82,10 @@ def free_symbols(self, obj: Any):
         k = id(obj)
         if k in self.fsyms:
             return self.fsyms[k]
-        result = obj.free_symbols
+        if hasattr(obj, 'used_symbols'):
+            result = obj.used_symbols(all_symbols=False)
+        else:
+            result = obj.free_symbols
         self.fsyms[k] = result
         return result
 

diff --git a/dace/data.py b/dace/data.py
@@ -243,14 +243,26 @@ def as_arg(self, with_types=True, for_call=False, name=None):
         """Returns a string for a C++ function signature (e.g., `int *A`). """
         raise NotImplementedError
 
+    def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]:
+        """
+        Returns a set of symbols that are used by this data descriptor.
+
+        :param all_symbols: Include not-strictly-free symbols that are used by this data descriptor,
+                            e.g., shape and size of a global array.
+        :return: A set of symbols that are used by this data descriptor. NOTE: The results are symbolic
+                 rather than a set of strings.
+        """
+        result = set()
+        if self.transient or all_symbols:
+            for s in self.shape:
+                if isinstance(s, sp.Basic):
+                    result |= set(s.free_symbols)
+        return result
+
     @property
     def free_symbols(self) -> Set[symbolic.SymbolicType]:
         """ Returns a set of undefined symbols in this data descriptor. """
-        result = set()
-        for s in self.shape:
-            if isinstance(s, sp.Basic):
-                result |= set(s.free_symbols)
-        return result
+        return self.used_symbols(all_symbols=True)
 
     def __repr__(self):
         return 'Abstract Data Container, DO NOT USE'
@@ -689,20 +701,23 @@ def as_arg(self, with_types=True, for_call=False, name=None):
     def sizes(self):
         return [d.name if isinstance(d, symbolic.symbol) else str(d) for d in self.shape]
 
-    @property
-    def free_symbols(self):
-        result = super().free_symbols
+    def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]:
+        result = super().used_symbols(all_symbols)
         for s in self.strides:
             if isinstance(s, sp.Expr):
                 result |= set(s.free_symbols)
-        if isinstance(self.total_size, sp.Expr):
-            result |= set(self.total_size.free_symbols)
         for o in self.offset:
             if isinstance(o, sp.Expr):
                 result |= set(o.free_symbols)
-
+        if self.transient or all_symbols:
+            if isinstance(self.total_size, sp.Expr):
+                result |= set(self.total_size.free_symbols)
         return result
 
+    @property
+    def free_symbols(self):
+        return self.used_symbols(all_symbols=True)
+
     def _set_shape_dependent_properties(self, shape, strides, total_size, offset):
         """
         Used to set properties which depend on the shape of the array
@@ -890,17 +905,20 @@ def covers_range(self, rng):
 
         return True
 
-    @property
-    def free_symbols(self):
-        result = super().free_symbols
-        if isinstance(self.buffer_size, sp.Expr):
+    def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]:
+        result = super().used_symbols(all_symbols)
+        if (self.transient or all_symbols) and isinstance(self.buffer_size, sp.Expr):
             result |= set(self.buffer_size.free_symbols)
         for o in self.offset:
             if isinstance(o, sp.Expr):
                 result |= set(o.free_symbols)
 
         return result
 
+    @property
+    def free_symbols(self):
+        return self.used_symbols(all_symbols=True)
+
 
 @make_properties
 class View(Array):