Commit

Merge branch 'main' into data_groups
ThrudPrimrose committed Oct 31, 2024
2 parents 941ba14 + 2811e40 commit 64b4fb8
Showing 68 changed files with 2,777 additions and 355 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/fpga-ci.yml
@@ -2,11 +2,14 @@ name: FPGA Tests

on:
push:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]
pull_request:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]
merge_group:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]

env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

jobs:
test-fpga:
12 changes: 8 additions & 4 deletions .github/workflows/general-ci.yml
@@ -2,11 +2,11 @@ name: General Tests

on:
push:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]
pull_request:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]
merge_group:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]

jobs:
test:
@@ -85,4 +85,8 @@ jobs:
./tests/polybench_test.sh
./tests/xform_test.sh
coverage combine .; coverage report; coverage xml
./codecov
- uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
7 changes: 4 additions & 3 deletions .github/workflows/gpu-ci.yml
@@ -2,15 +2,16 @@ name: GPU Tests

on:
push:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]
pull_request:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]
merge_group:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]

env:
CUDACXX: /usr/local/cuda/bin/nvcc
MKLROOT: /opt/intel/oneapi/mkl/latest/
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}


jobs:
7 changes: 4 additions & 3 deletions .github/workflows/heterogeneous-ci.yml
@@ -2,16 +2,17 @@ name: Heterogeneous Tests

on:
push:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]
pull_request:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]
merge_group:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]

env:
CUDA_HOME: /usr/local/cuda
CUDACXX: nvcc
MKLROOT: /opt/intel/oneapi/mkl/latest/
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

jobs:
test-heterogeneous:
6 changes: 3 additions & 3 deletions .github/workflows/pyFV3-ci.yml
@@ -2,11 +2,11 @@ name: NASA/NOAA pyFV3 repository build test

on:
push:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]
pull_request:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]
merge_group:
branches: [ master, ci-fix ]
branches: [ main, ci-fix ]

defaults:
run:
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -47,7 +47,7 @@ For automatic styling, we use the [yapf](https://github.com/google/yapf) file fo
We use [pytest](https://www.pytest.org/) for our testing infrastructure. All tests under the `tests/` folder
(and any subfolders within) are automatically read and run. The files must be under the right subfolder
based on the component being tested (e.g., `tests/sdfg/` for IR-related tests), and must have the right
suffix: either `*_test.py` or `*_cudatest.py`. See [pytest.ini](https://github.com/spcl/dace/blob/master/pytest.ini)
suffix: either `*_test.py` or `*_cudatest.py`. See [pytest.ini](https://github.com/spcl/dace/blob/main/pytest.ini)
for more information, and for the markers we use to specify software/hardware requirements.

The structure of the test file must follow `pytest` standards (i.e., free functions called `test_*`), and
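For reference, a minimal test file following the conventions described above might look like the sketch below (the file and function names are hypothetical, not part of this repository):

import numpy as np
import dace


def test_vector_add():
    @dace.program
    def vadd(a: dace.float64[20], b: dace.float64[20]):
        return a + b

    a = np.random.rand(20)
    b = np.random.rand(20)
    assert np.allclose(vadd(a, b), a + b)


if __name__ == '__main__':
    test_vector_add()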
18 changes: 9 additions & 9 deletions README.md
@@ -3,15 +3,15 @@
[![FPGA Tests](https://github.com/spcl/dace/actions/workflows/fpga-ci.yml/badge.svg)](https://github.com/spcl/dace/actions/workflows/fpga-ci.yml)
[![Documentation Status](https://readthedocs.org/projects/spcldace/badge/?version=latest)](https://spcldace.readthedocs.io/en/latest/?badge=latest)
[![PyPI version](https://badge.fury.io/py/dace.svg)](https://badge.fury.io/py/dace)
[![codecov](https://codecov.io/gh/spcl/dace/branch/master/graph/badge.svg)](https://codecov.io/gh/spcl/dace)
[![codecov](https://codecov.io/gh/spcl/dace/branch/main/graph/badge.svg)](https://codecov.io/gh/spcl/dace)


![D](dace.svg)aCe - Data-Centric Parallel Programming
=====================================================

_Decoupling domain science from performance optimization._

DaCe is a [fast](https://nbviewer.org/github/spcl/dace/blob/master/tutorials/benchmarking.ipynb) parallel programming
DaCe is a [fast](https://nbviewer.org/github/spcl/dace/blob/main/tutorials/benchmarking.ipynb) parallel programming
framework that takes code in Python/NumPy and other programming languages, and maps it to high-performance
**CPU, GPU, and FPGA** programs, which can be optimized to achieve state-of-the-art. Internally, DaCe
uses the Stateful DataFlow multiGraph (SDFG) *data-centric intermediate
@@ -61,13 +61,13 @@ be used in any C ABI compatible language (C/C++, FORTRAN, etc.).

For more information on how to use DaCe, see the [samples](samples) or tutorials below:

* [Getting Started](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/getting_started.ipynb)
* [Benchmarks, Instrumentation, and Performance Comparison with Other Python Compilers](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/benchmarking.ipynb)
* [Explicit Dataflow in Python](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/explicit.ipynb)
* [NumPy API Reference](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/numpy_frontend.ipynb)
* [SDFG API](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/sdfg_api.ipynb)
* [Using and Creating Transformations](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/transformations.ipynb)
* [Extending the Code Generator](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/codegen.ipynb)
* [Getting Started](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/getting_started.ipynb)
* [Benchmarks, Instrumentation, and Performance Comparison with Other Python Compilers](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/benchmarking.ipynb)
* [Explicit Dataflow in Python](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/explicit.ipynb)
* [NumPy API Reference](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/numpy_frontend.ipynb)
* [SDFG API](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/sdfg_api.ipynb)
* [Using and Creating Transformations](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/transformations.ipynb)
* [Extending the Code Generator](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/codegen.ipynb)
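As a minimal sketch of the Python/NumPy front end these tutorials cover (assuming dace and numpy are installed), a DaCe program can be written and run like this:

import numpy as np
import dace

@dace.program
def axpy(a: dace.float64, x: dace.float64[50], y: dace.float64[50]):
    y[:] = a * x + y

x = np.random.rand(50)
y = np.random.rand(50)
axpy(2.0, x, y)  # compiles the program to an optimized binary and runs it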

Publication
-----------
8 changes: 4 additions & 4 deletions dace/codegen/cppunparse.py
@@ -749,6 +749,8 @@ def _Num(self, t):
# For complex values, use ``dtype_to_typeclass``
if isinstance(t_n, complex):
dtype = dtypes.dtype_to_typeclass(complex)
repr_n = f'{dtype}({t_n.real}, {t_n.imag})'


# Handle large integer values
if isinstance(t_n, int):
@@ -765,10 +767,8 @@ def _Num(self, t):
elif bits >= 64:
warnings.warn(f'Value wider than 64 bits encountered in expression ({t_n}), emitting as-is')

if repr_n.endswith("j"):
self.write("%s(0, %s)" % (dtype, repr_n.replace("inf", INFSTR)[:-1]))
else:
self.write(repr_n.replace("inf", INFSTR))
repr_n = repr_n.replace("inf", INFSTR)
self.write(repr_n)

def _List(self, t):
raise NotImplementedError('Invalid C++')
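The net effect of this hunk is that complex constants are now emitted through dtype_to_typeclass(complex) rather than by stripping a trailing "j". A rough sketch of the new formatting path, using a stand-in string for the resolved typeclass:

# 'dace::complex128' is a stand-in; the real code uses dtypes.dtype_to_typeclass(complex)
dtype = 'dace::complex128'
t_n = 3 + 4j
repr_n = f'{dtype}({t_n.real}, {t_n.imag})'
print(repr_n)  # dace::complex128(3.0, 4.0)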
2 changes: 1 addition & 1 deletion dace/codegen/prettycode.py
@@ -14,7 +14,7 @@ class CodeIOStream(StringIO):
nodes. """
def __init__(self, base_indentation=0):
super(CodeIOStream, self).__init__()
self._indent = 0
self._indent = base_indentation
self._spaces = int(Config.get('compiler', 'indentation_spaces'))
self._lineinfo = Config.get_bool('compiler', 'codegen_lineinfo')

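This one-line fix makes the base_indentation constructor argument actually take effect. A hedged usage sketch (assuming the stream indents written lines by the configured number of spaces per level):

from dace.codegen.prettycode import CodeIOStream

stream = CodeIOStream(base_indentation=1)
stream.write('int x = 0;')
print(stream.getvalue())  # now starts one indentation level in, instead of at column 0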
74 changes: 38 additions & 36 deletions dace/codegen/targets/cuda.py
@@ -23,8 +23,8 @@
from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute
from dace.config import Config
from dace.frontend import operations
from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs,
is_array_stream_view, is_devicelevel_gpu, nodes, scope_contains_scope)
from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, is_array_stream_view,
is_devicelevel_gpu, nodes, scope_contains_scope)
from dace.sdfg import utils as sdutil
from dace.sdfg.graph import MultiConnectorEdge
from dace.sdfg.state import ControlFlowRegion, StateSubgraphView
@@ -68,6 +68,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG):
dispatcher = self._dispatcher

self.create_grid_barrier = False
self.dynamic_tbmap_type = None
self.extra_nsdfg_args = []
CUDACodeGen._in_device_code = False
self._cpu_codegen: Optional['CPUCodeGen'] = None
@@ -892,8 +893,8 @@ def increment(streams):

return max_streams, max_events

def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType,
dst_node: nodes.Node, dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType,
def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, dst_node: nodes.Node,
dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType,
edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], sdfg: SDFG, cfg: ControlFlowRegion,
dfg: StateSubgraphView, callsite_stream: CodeIOStream) -> None:
u, uconn, v, vconn, memlet = edge
@@ -1163,11 +1164,8 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
copysize=', '.join(_topy(copy_shape)),
is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false',
accum=accum or '::Copy',
args=', '.join(
[src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction
)
),
cfg, state_id, [src_node, dst_node])
args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) +
custom_reduction)), cfg, state_id, [src_node, dst_node])
else:
callsite_stream.write(
(' {func}<{type}, {bdims}, {copysize}, ' +
@@ -1236,8 +1234,12 @@ def _begin_streams(self, sdfg, state):
result.add(e.dst._cuda_stream)
return result

def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState,
function_stream: CodeIOStream, callsite_stream: CodeIOStream,
def generate_state(self,
sdfg: SDFG,
cfg: ControlFlowRegion,
state: SDFGState,
function_stream: CodeIOStream,
callsite_stream: CodeIOStream,
generate_state_footer: bool = False) -> None:
# Two modes: device-level state and if this state has active streams
if CUDACodeGen._in_device_code:
@@ -1361,8 +1363,7 @@ def generate_devicelevel_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state:
"&& threadIdx.x == 0) "
"{ // sub-graph begin", cfg, state.block_id)
elif write_scope == 'block':
callsite_stream.write("if (threadIdx.x == 0) "
"{ // sub-graph begin", cfg, state.block_id)
callsite_stream.write("if (threadIdx.x == 0) " "{ // sub-graph begin", cfg, state.block_id)
else:
callsite_stream.write("{ // subgraph begin", cfg, state.block_id)
else:
@@ -1985,16 +1986,13 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S

# allocating shared memory for dynamic threadblock maps
if has_dtbmap:
kernel_stream.write(
'__shared__ dace::'
'DynamicMap<{fine_grained}, {block_size}>'
'::shared_type dace_dyn_map_shared;'.format(
fine_grained=('true'
if Config.get_bool('compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'),
block_size=functools.reduce(
(lambda x, y: x * y),
[int(x) for x in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',')])), cfg,
state_id, node)
self.dynamic_tbmap_type = (
f'dace::DynamicMap<{"true" if Config.get_bool("compiler", "cuda", "dynamic_map_fine_grained") else "false"}, '
f'{functools.reduce((lambda x, y: x * y), [int(x) for x in Config.get("compiler", "cuda", "dynamic_map_block_size").split(",")])}>'
'::shared_type')
kernel_stream.write(f'__shared__ {self.dynamic_tbmap_type} dace_dyn_map_shared;', cfg, state_id, node)
else:
self.dynamic_tbmap_type = None

# Add extra opening brace (dynamic map ranges, closed in MapExit
# generator)
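For illustration, with hypothetical configuration values dynamic_map_fine_grained=True and dynamic_map_block_size="128,1,1", the type string stored in self.dynamic_tbmap_type a few lines above would evaluate to:

import functools

fine_grained = True
block_size = functools.reduce(lambda x, y: x * y, [int(x) for x in '128,1,1'.split(',')])
dynamic_tbmap_type = f'dace::DynamicMap<{"true" if fine_grained else "false"}, {block_size}>::shared_type'
print(dynamic_tbmap_type)  # dace::DynamicMap<true, 128>::shared_type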
@@ -2072,8 +2070,8 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S

# Generate conditions for this block's execution using min and max
# element, e.g., skipping out-of-bounds threads in trailing block
# unless thsi is handled by another map down the line
if (not has_tbmap and not has_dtbmap and node.map.schedule != dtypes.ScheduleType.GPU_Persistent):
# unless this is handled by another map down the line
if ((not has_tbmap or has_dtbmap) and node.map.schedule != dtypes.ScheduleType.GPU_Persistent):
dsym_end = [d + bs - 1 for d, bs in zip(dsym, self._block_dims)]
minels = krange.min_element()
maxels = krange.max_element()
@@ -2090,10 +2088,12 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S
condition += '%s < %s' % (v, _topy(maxel + 1))
if len(condition) > 0:
self._kernel_grid_conditions.append(f'if ({condition}) {{')
kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry)
if not has_dtbmap:
kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry)
else:
self._kernel_grid_conditions.append('{')
kernel_stream.write('{', cfg, state_id, scope_entry)
if not has_dtbmap:
kernel_stream.write('{', cfg, state_id, scope_entry)

self._dispatcher.dispatch_subgraph(sdfg,
cfg,
@@ -2112,6 +2112,7 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S
self._kernel_state = None
CUDACodeGen._in_device_code = False
self._grid_dims = None
self.dynamic_tbmap_type = None

def get_next_scope_entries(self, dfg, scope_entry):
parent_scope_entry = dfg.entry_node(scope_entry)
@@ -2179,10 +2180,8 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco
current_sdfg = current_state.parent
if not outer_scope:
raise ValueError(f'Failed to find the outer scope of {scope_entry}')
callsite_stream.write(
'if ({} < {}) {{'.format(outer_scope.map.params[0],
_topy(subsets.Range(outer_scope.map.range[::-1]).max_element()[0] + 1)), cfg,
state_id, scope_entry)
for cond in self._kernel_grid_conditions:
callsite_stream.write(cond, cfg, state_id, scope_entry)

# NOTE: Dynamic map inputs must be defined both outside and inside the dynamic Map schedule.
# They define inside the schedule the bounds of the any nested Maps.
@@ -2205,8 +2204,9 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco
'__dace_dynmap_begin = {begin};\n'
'__dace_dynmap_end = {end};'.format(begin=dynmap_begin, end=dynmap_end), cfg, state_id, scope_entry)

# close if
callsite_stream.write('}', cfg, state_id, scope_entry)
# Close kernel grid conditions
for _ in self._kernel_grid_conditions:
callsite_stream.write('}', cfg, state_id, scope_entry)

callsite_stream.write(
'dace::DynamicMap<{fine_grained}, {bsize}>::'
Expand All @@ -2215,7 +2215,7 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco
'auto {param}) {{'.format(fine_grained=('true' if Config.get_bool(
'compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'),
bsize=total_block_size,
kmapIdx=outer_scope.map.params[0],
kmapIdx=outer_scope.map.params[-1],
param=dynmap_var), cfg, state_id, scope_entry)

for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry):
@@ -2556,8 +2556,8 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco
for cond in self._kernel_grid_conditions:
callsite_stream.write(cond, cfg, state_id, scope_entry)

def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node,
function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
if self.node_dispatch_predicate(sdfg, dfg, node):
# Dynamically obtain node generator according to class name
gen = getattr(self, '_generate_' + type(node).__name__, False)
@@ -2594,6 +2594,8 @@ def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node):
result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node)
if self.create_grid_barrier:
result.append(('cub::GridBarrier&', '__gbar', '__gbar'))
if self.dynamic_tbmap_type:
result.append((f'{self.dynamic_tbmap_type}&', 'dace_dyn_map_shared', 'dace_dyn_map_shared'))

# Add data from nested SDFGs to kernel arguments
result.extend([(atype, aname, aname) for atype, aname, _ in self.extra_nsdfg_args])
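For illustration, if dynamic_tbmap_type held the hypothetical string from the earlier sketch, the argument tuple appended here would be:

('dace::DynamicMap<true, 128>::shared_type&', 'dace_dyn_map_shared', 'dace_dyn_map_shared')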
4 changes: 3 additions & 1 deletion dace/codegen/tools/type_inference.py
@@ -9,7 +9,7 @@

import numpy as np
import ast
from dace import dtypes
from dace import data, dtypes
from dace import symbolic
from dace.codegen import cppunparse
from dace.symbolic import symbol, SymExpr, symstr
@@ -286,6 +286,8 @@ def _Name(t, symbols, inferred_symbols):
inferred_type = dtypes.typeclass(inferred_type.type)
elif isinstance(inferred_type, symbolic.symbol):
inferred_type = inferred_type.dtype
elif isinstance(inferred_type, data.Data):
inferred_type = inferred_type.dtype
elif t_id in inferred_symbols:
inferred_type = inferred_symbols[t_id]
return inferred_type
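The new data.Data branch lets names bound to data descriptors resolve to their element dtype. A hedged sketch of the intended behavior (assuming the module's infer_expr_type helper keeps its current signature):

from dace import data, dtypes
from dace.codegen.tools import type_inference

symbols = {'A': data.Array(dtypes.float32, [10])}
inferred = type_inference.infer_expr_type('A', symbols)
print(inferred)  # expected: float32, taken from the descriptor's dtype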
7 changes: 7 additions & 0 deletions dace/config_schema.yml
@@ -919,6 +919,13 @@ required:
description: >
Check for undefined symbols in memlets during SDFG validation.
check_race_conditions:
type: bool
default: false
title: Check race conditions
description: >
Check for potential race conditions during validation.
#############################################
# Features for unit testing

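A hedged sketch of enabling the new check_race_conditions option during validation (assuming the flag is exposed under the experimental configuration section and that dace.config.set_temporary is available):

import dace
from dace.config import set_temporary

@dace.program
def incr(a: dace.float64[10]):
    a += 1

sdfg = incr.to_sdfg()
with set_temporary('experimental', 'check_race_conditions', value=True):
    sdfg.validate()  # validation now also looks for potential data races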