Skip to content

Commit

Permalink
Default schedule inference uses dynamic thread blocks if they exist
Browse files Browse the repository at this point in the history
  • Loading branch information
tbennun committed Oct 28, 2024
1 parent c7f4694 commit 7d02759
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 9 deletions.
10 changes: 8 additions & 2 deletions dace/sdfg/infer_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,7 @@ def infer_connector_types(sdfg: SDFG):
for e in state.out_edges(node):
cname = e.src_conn
if cname and node.out_connectors[cname] is None:
raise TypeError('Ambiguous or uninferable type in'
' connector "%s" of node "%s"' % (cname, node))
raise TypeError('Ambiguous or uninferable type in' ' connector "%s" of node "%s"' % (cname, node))


#############################################################################
Expand Down Expand Up @@ -301,6 +300,12 @@ def _set_default_schedule_in_scope(state: SDFGState,
else:
child_schedule = _determine_child_schedule(parent_schedules)

# Special case for dynamic thread-block neighboring schedules
if child_schedule == dtypes.ScheduleType.GPU_ThreadBlock:
from dace.transformation.helpers import gpu_map_has_explicit_dyn_threadblocks # Avoid import loops
if gpu_map_has_explicit_dyn_threadblocks(state, parent_node):
child_schedule = dtypes.ScheduleType.GPU_ThreadBlock_Dynamic

# Set child schedule type in scope
for node in child_nodes[parent_node]:
# Set default schedule types
Expand Down Expand Up @@ -393,6 +398,7 @@ def _get_storage_from_parent(data_name: str, sdfg: SDFG) -> dtypes.StorageType:

raise ValueError(f'Could not find data descriptor {data_name} in parent SDFG')


def infer_aliasing(node: nodes.NestedSDFG, sdfg: SDFG, state: SDFGState) -> None:
"""
Infers aliasing information on nested SDFG arrays based on external edges and connectors.
Expand Down
17 changes: 12 additions & 5 deletions dace/transformation/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -934,11 +934,7 @@ def replicate_scope(sdfg: SDFG, state: SDFGState, scope: ScopeSubgraphView) -> S
return ScopeSubgraphView(state, new_nodes, new_entry)


def offset_map(state: SDFGState,
entry: nodes.MapEntry,
dim: int,
offset: symbolic.SymbolicType,
negative: bool = True):
def offset_map(state: SDFGState, entry: nodes.MapEntry, dim: int, offset: symbolic.SymbolicType, negative: bool = True):
"""
Offsets a map parameter and its contents by a value.
Expand Down Expand Up @@ -1270,6 +1266,17 @@ def gpu_map_has_explicit_threadblocks(state: SDFGState, entry: nodes.EntryNode)
return False


def gpu_map_has_explicit_dyn_threadblocks(state: SDFGState, entry: nodes.EntryNode) -> bool:
    """
    Returns True if a GPU_Device map has explicit *dynamic* thread-block
    (``GPU_ThreadBlock_Dynamic``) maps nested within it.

    :param state: The state in which the scope resides.
    :param entry: The scope entry node (e.g., a GPU_Device map entry) to check.
    :return: True if any map nested inside the scope uses the
             ``GPU_ThreadBlock_Dynamic`` schedule, False otherwise.
    """
    # Inspect every map nested within the given scope; a single dynamic
    # thread-block schedule suffices.
    internal_maps = get_internal_scopes(state, entry)
    return any(m.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic for _, m in internal_maps)


def reconnect_edge_through_map(
state: SDFGState, edge: graph.MultiConnectorEdge[Memlet], new_node: Union[nodes.EntryNode, nodes.ExitNode],
keep_src: bool) -> Tuple[graph.MultiConnectorEdge[Memlet], graph.MultiConnectorEdge[Memlet]]:
Expand Down
23 changes: 21 additions & 2 deletions tests/dynamic_tb_map_cudatest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@

@dace.program(dace.uint32[H + 1], dace.uint32[nnz], dace.float32[nnz], dace.float32[W], dace.float32[H])
def spmv(A_row, A_col, A_val, x, b):

@dace.mapscope(_[0:H])
def compute_row(i):

@dace.map(_[A_row[i]:A_row[i + 1]])
def compute(j):
a << A_val[j]
Expand Down Expand Up @@ -292,8 +290,29 @@ def sddvm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i
assert np.allclose(val, ref.data)


@pytest.mark.gpu
def test_dynamic_default_schedule():
    """
    Tests that default schedule inference picks the dynamic thread-block
    schedule when an explicit ``GPU_ThreadBlock_Dynamic`` map is nested
    inside a ``GPU_Device`` map.
    """
    N = dace.symbol('N')

    @dace.program
    def tester(a: dace.float32[N, 10]):
        A = dace.ndarray([N, 10], dtype=dace.float32, storage=dace.StorageType.GPU_Global)
        A[:] = a
        for i in dace.map[0:N] @ dace.ScheduleType.GPU_Device:
            smem = np.empty((10, ), dtype=np.float32) @ dace.StorageType.GPU_Shared
            smem[:] = 1
            for j in dace.map[0:10] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic:
                A[i, j] = i * 10 + smem[j]
        a[:] = A

    a = np.zeros((65, 10), dtype=np.float32)
    tester(a)
    # The program writes a[i, j] = i * 10 + smem[j] with smem filled with 1,
    # so every element of row i equals i * 10 + 1. The previous reference used
    # a 3-argument lambda with a 2-D shape (a TypeError in np.fromfunction)
    # and compared against i * 10 + j, which never matches the computed data.
    expected = np.fromfunction(lambda i, j: i * 10 + 1, (65, 10), dtype=np.float32)
    assert np.allclose(a, expected)


if __name__ == '__main__':
    # Run the dynamic thread-block map tests directly (without pytest).
    # NOTE(review): these tests appear to require a GPU (marked pytest.mark.gpu
    # in this file) — confirm before running standalone.
    test_dynamic_map()
    test_dynamic_maps()
    test_nested_dynamic_map()
    test_dynamic_map_with_step()
    test_dynamic_default_schedule()

0 comments on commit 7d02759

Please sign in to comment.