Skip to content

Commit

Permalink
Default schedule inference uses dynamic thread blocks if they exist
Browse files Browse the repository at this point in the history
  • Loading branch information
tbennun committed Oct 28, 2024
1 parent c7f4694 commit 7d02759
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 9 deletions.
10 changes: 8 additions & 2 deletions dace/sdfg/infer_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,7 @@ def infer_connector_types(sdfg: SDFG):
for e in state.out_edges(node):
cname = e.src_conn
if cname and node.out_connectors[cname] is None:
raise TypeError('Ambiguous or uninferable type in'
' connector "%s" of node "%s"' % (cname, node))
raise TypeError('Ambiguous or uninferable type in' ' connector "%s" of node "%s"' % (cname, node))


#############################################################################
Expand Down Expand Up @@ -301,6 +300,12 @@ def _set_default_schedule_in_scope(state: SDFGState,
else:
child_schedule = _determine_child_schedule(parent_schedules)

# Special case for dynamic thread-block neighboring schedules
if child_schedule == dtypes.ScheduleType.GPU_ThreadBlock:
from dace.transformation.helpers import gpu_map_has_explicit_dyn_threadblocks # Avoid import loops
if gpu_map_has_explicit_dyn_threadblocks(state, parent_node):
child_schedule = dtypes.ScheduleType.GPU_ThreadBlock_Dynamic

# Set child schedule type in scope
for node in child_nodes[parent_node]:
# Set default schedule types
Expand Down Expand Up @@ -393,6 +398,7 @@ def _get_storage_from_parent(data_name: str, sdfg: SDFG) -> dtypes.StorageType:

raise ValueError(f'Could not find data descriptor {data_name} in parent SDFG')


def infer_aliasing(node: nodes.NestedSDFG, sdfg: SDFG, state: SDFGState) -> None:
"""
Infers aliasing information on nested SDFG arrays based on external edges and connectors.
Expand Down
17 changes: 12 additions & 5 deletions dace/transformation/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -934,11 +934,7 @@ def replicate_scope(sdfg: SDFG, state: SDFGState, scope: ScopeSubgraphView) -> S
return ScopeSubgraphView(state, new_nodes, new_entry)


def offset_map(state: SDFGState,
entry: nodes.MapEntry,
dim: int,
offset: symbolic.SymbolicType,
negative: bool = True):
def offset_map(state: SDFGState, entry: nodes.MapEntry, dim: int, offset: symbolic.SymbolicType, negative: bool = True):
"""
Offsets a map parameter and its contents by a value.
Expand Down Expand Up @@ -1270,6 +1266,17 @@ def gpu_map_has_explicit_threadblocks(state: SDFGState, entry: nodes.EntryNode)
return False


def gpu_map_has_explicit_dyn_threadblocks(state: SDFGState, entry: nodes.EntryNode) -> bool:
    """
    Returns True if a GPU_Device map has explicit *dynamic* thread-block
    (``GPU_ThreadBlock_Dynamic``) maps nested within it.

    :param state: The state in which the scope resides.
    :param entry: The scope entry node (e.g., a GPU_Device map entry) to check.
    :return: True if any map nested inside the scope uses the
             ``GPU_ThreadBlock_Dynamic`` schedule, False otherwise.
    """
    # Inspect every map nested within the given scope; a single dynamic
    # thread-block schedule suffices.
    internal_maps = get_internal_scopes(state, entry)
    return any(m.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic for _, m in internal_maps)


def reconnect_edge_through_map(
state: SDFGState, edge: graph.MultiConnectorEdge[Memlet], new_node: Union[nodes.EntryNode, nodes.ExitNode],
keep_src: bool) -> Tuple[graph.MultiConnectorEdge[Memlet], graph.MultiConnectorEdge[Memlet]]:
Expand Down
23 changes: 21 additions & 2 deletions tests/dynamic_tb_map_cudatest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@

@dace.program(dace.uint32[H + 1], dace.uint32[nnz], dace.float32[nnz], dace.float32[W], dace.float32[H])
def spmv(A_row, A_col, A_val, x, b):

@dace.mapscope(_[0:H])
def compute_row(i):

@dace.map(_[A_row[i]:A_row[i + 1]])
def compute(j):
a << A_val[j]
Expand Down Expand Up @@ -292,8 +290,29 @@ def sddvm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i
assert np.allclose(val, ref.data)


@pytest.mark.gpu
def test_dynamic_default_schedule():
    """
    Tests that default schedule inference picks the dynamic thread-block
    schedule when an explicit ``GPU_ThreadBlock_Dynamic`` map is nested
    inside a ``GPU_Device`` map.
    """
    N = dace.symbol('N')

    @dace.program
    def tester(a: dace.float32[N, 10]):
        A = dace.ndarray([N, 10], dtype=dace.float32, storage=dace.StorageType.GPU_Global)
        A[:] = a
        for i in dace.map[0:N] @ dace.ScheduleType.GPU_Device:
            smem = np.empty((10, ), dtype=np.float32) @ dace.StorageType.GPU_Shared
            smem[:] = 1
            for j in dace.map[0:10] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic:
                A[i, j] = i * 10 + smem[j]
        a[:] = A

    a = np.zeros((65, 10), dtype=np.float32)
    tester(a)
    # The program writes a[i, j] = i * 10 + smem[j] with smem filled with 1,
    # so every element of row i equals i * 10 + 1. The previous reference used
    # a 3-argument lambda with a 2-D shape (a TypeError in np.fromfunction)
    # and compared against i * 10 + j, which never matches the computed data.
    expected = np.fromfunction(lambda i, j: i * 10 + 1, (65, 10), dtype=np.float32)
    assert np.allclose(a, expected)


if __name__ == '__main__':
    # Run the dynamic thread-block map tests directly (without pytest).
    # NOTE(review): these tests appear to require a GPU (marked pytest.mark.gpu
    # in this file) — confirm before running standalone.
    test_dynamic_map()
    test_dynamic_maps()
    test_nested_dynamic_map()
    test_dynamic_map_with_step()
    test_dynamic_default_schedule()

0 comments on commit 7d02759

Please sign in to comment.