diff --git a/.github/workflows/fpga-ci.yml b/.github/workflows/fpga-ci.yml index d03d044b30..ef8e5348da 100644 --- a/.github/workflows/fpga-ci.yml +++ b/.github/workflows/fpga-ci.yml @@ -2,11 +2,14 @@ name: FPGA Tests on: push: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] pull_request: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] merge_group: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] + +env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} jobs: test-fpga: diff --git a/.github/workflows/general-ci.yml b/.github/workflows/general-ci.yml index f7b44e6978..faf0a727be 100644 --- a/.github/workflows/general-ci.yml +++ b/.github/workflows/general-ci.yml @@ -2,11 +2,11 @@ name: General Tests on: push: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] pull_request: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] merge_group: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] jobs: test: @@ -85,4 +85,8 @@ jobs: ./tests/polybench_test.sh ./tests/xform_test.sh coverage combine .; coverage report; coverage xml - ./codecov + + - uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index ce7f9b628e..527e004478 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -2,15 +2,16 @@ name: GPU Tests on: push: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] pull_request: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] merge_group: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] env: CUDACXX: /usr/local/cuda/bin/nvcc MKLROOT: /opt/intel/oneapi/mkl/latest/ + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} jobs: diff --git a/.github/workflows/heterogeneous-ci.yml b/.github/workflows/heterogeneous-ci.yml index 7c65e90718..99b566e21f 100644 --- a/.github/workflows/heterogeneous-ci.yml +++ b/.github/workflows/heterogeneous-ci.yml @@ -2,16 +2,17 @@ name: Heterogeneous Tests on: push: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] pull_request: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] merge_group: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] env: CUDA_HOME: /usr/local/cuda CUDACXX: nvcc MKLROOT: /opt/intel/oneapi/mkl/latest/ + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} jobs: test-heterogeneous: diff --git a/.github/workflows/pyFV3-ci.yml b/.github/workflows/pyFV3-ci.yml index 2b98327381..f58fdf85ac 100644 --- a/.github/workflows/pyFV3-ci.yml +++ b/.github/workflows/pyFV3-ci.yml @@ -2,11 +2,11 @@ name: NASA/NOAA pyFV3 repository build test on: push: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] pull_request: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] merge_group: - branches: [ master, ci-fix ] + branches: [ main, ci-fix ] defaults: run: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6bf69495b1..313b3f0f21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -47,7 +47,7 @@ For automatic styling, we use the [yapf](https://github.com/google/yapf) file fo We use [pytest](https://www.pytest.org/) for our testing infrastructure. All tests under the `tests/` folder (and any subfolders within) are automatically read and run. The files must be under the right subfolder based on the component being tested (e.g., `tests/sdfg/` for IR-related tests), and must have the right -suffix: either `*_test.py` or `*_cudatest.py`. 
See [pytest.ini](https://github.com/spcl/dace/blob/master/pytest.ini) +suffix: either `*_test.py` or `*_cudatest.py`. See [pytest.ini](https://github.com/spcl/dace/blob/main/pytest.ini) for more information, and for the markers we use to specify software/hardware requirements. The structure of the test file must follow `pytest` standards (i.e., free functions called `test_*`), and diff --git a/README.md b/README.md index 41b059c953..ef4bdec1db 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![FPGA Tests](https://github.com/spcl/dace/actions/workflows/fpga-ci.yml/badge.svg)](https://github.com/spcl/dace/actions/workflows/fpga-ci.yml) [![Documentation Status](https://readthedocs.org/projects/spcldace/badge/?version=latest)](https://spcldace.readthedocs.io/en/latest/?badge=latest) [![PyPI version](https://badge.fury.io/py/dace.svg)](https://badge.fury.io/py/dace) -[![codecov](https://codecov.io/gh/spcl/dace/branch/master/graph/badge.svg)](https://codecov.io/gh/spcl/dace) +[![codecov](https://codecov.io/gh/spcl/dace/branch/main/graph/badge.svg)](https://codecov.io/gh/spcl/dace) ![D](dace.svg)aCe - Data-Centric Parallel Programming @@ -11,7 +11,7 @@ _Decoupling domain science from performance optimization._ -DaCe is a [fast](https://nbviewer.org/github/spcl/dace/blob/master/tutorials/benchmarking.ipynb) parallel programming +DaCe is a [fast](https://nbviewer.org/github/spcl/dace/blob/main/tutorials/benchmarking.ipynb) parallel programming framework that takes code in Python/NumPy and other programming languages, and maps it to high-performance **CPU, GPU, and FPGA** programs, which can be optimized to achieve state-of-the-art. Internally, DaCe uses the Stateful DataFlow multiGraph (SDFG) *data-centric intermediate @@ -61,13 +61,13 @@ be used in any C ABI compatible language (C/C++, FORTRAN, etc.). 
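To make the README's claim concrete, here is a minimal, hedged sketch of a DaCe program. It assumes only a working `dace` installation; the function name, symbol, and sizes are illustrative and not taken from this repository:

```python
import dace
import numpy as np

N = dace.symbol('N')  # symbolic size, specialized when the program is called

@dace.program
def axpy(a: dace.float64, x: dace.float64[N], y: dace.float64[N]):
    y[:] = a * x + y  # NumPy-style code is parsed into an SDFG

x = np.random.rand(1024)
y = np.random.rand(1024)
axpy(2.0, x, y)  # JIT-compiles to an optimized CPU binary and runs in place
```

The same program can then be retargeted to GPU or FPGA by applying the corresponding transformations on the generated SDFG.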
For more information on how to use DaCe, see the [samples](samples) or tutorials below: -* [Getting Started](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/getting_started.ipynb) -* [Benchmarks, Instrumentation, and Performance Comparison with Other Python Compilers](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/benchmarking.ipynb) -* [Explicit Dataflow in Python](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/explicit.ipynb) -* [NumPy API Reference](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/numpy_frontend.ipynb) -* [SDFG API](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/sdfg_api.ipynb) -* [Using and Creating Transformations](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/transformations.ipynb) -* [Extending the Code Generator](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/codegen.ipynb) +* [Getting Started](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/getting_started.ipynb) +* [Benchmarks, Instrumentation, and Performance Comparison with Other Python Compilers](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/benchmarking.ipynb) +* [Explicit Dataflow in Python](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/explicit.ipynb) +* [NumPy API Reference](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/numpy_frontend.ipynb) +* [SDFG API](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/sdfg_api.ipynb) +* [Using and Creating Transformations](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/transformations.ipynb) +* [Extending the Code Generator](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/codegen.ipynb) Publication ----------- diff --git a/dace/codegen/cppunparse.py b/dace/codegen/cppunparse.py index 18ee00721b..edeb5270ca 100644 --- a/dace/codegen/cppunparse.py +++ b/dace/codegen/cppunparse.py @@ -749,6 +749,8 @@ def _Num(self, t): # For complex values, use ``dtype_to_typeclass`` if isinstance(t_n, complex): dtype = dtypes.dtype_to_typeclass(complex) + repr_n = f'{dtype}({t_n.real}, {t_n.imag})' + # Handle large integer values if isinstance(t_n, int): @@ -765,10 +767,8 @@ def _Num(self, t): elif bits >= 64: warnings.warn(f'Value wider than 64 bits encountered in expression ({t_n}), emitting as-is') - if repr_n.endswith("j"): - self.write("%s(0, %s)" % (dtype, repr_n.replace("inf", INFSTR)[:-1])) - else: - self.write(repr_n.replace("inf", INFSTR)) + repr_n = repr_n.replace("inf", INFSTR) + self.write(repr_n) def _List(self, t): raise NotImplementedError('Invalid C++') diff --git a/dace/codegen/prettycode.py b/dace/codegen/prettycode.py index de143f5e86..0fc4ebe3f1 100644 --- a/dace/codegen/prettycode.py +++ b/dace/codegen/prettycode.py @@ -14,7 +14,7 @@ class CodeIOStream(StringIO): nodes. 
""" def __init__(self, base_indentation=0): super(CodeIOStream, self).__init__() - self._indent = 0 + self._indent = base_indentation self._spaces = int(Config.get('compiler', 'indentation_spaces')) self._lineinfo = Config.get_bool('compiler', 'codegen_lineinfo') diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index f080f2cc62..1cf8919d74 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -23,8 +23,8 @@ from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute from dace.config import Config from dace.frontend import operations -from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, - is_array_stream_view, is_devicelevel_gpu, nodes, scope_contains_scope) +from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, is_array_stream_view, + is_devicelevel_gpu, nodes, scope_contains_scope) from dace.sdfg import utils as sdutil from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import ControlFlowRegion, StateSubgraphView @@ -68,6 +68,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): dispatcher = self._dispatcher self.create_grid_barrier = False + self.dynamic_tbmap_type = None self.extra_nsdfg_args = [] CUDACodeGen._in_device_code = False self._cpu_codegen: Optional['CPUCodeGen'] = None @@ -892,8 +893,8 @@ def increment(streams): return max_streams, max_events - def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, - dst_node: nodes.Node, dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, + def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, dst_node: nodes.Node, + dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, callsite_stream: CodeIOStream) -> None: u, uconn, v, vconn, memlet = edge @@ -1163,11 +1164,8 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St copysize=', '.join(_topy(copy_shape)), is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', accum=accum or '::Copy', - args=', '.join( - [src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction - ) - ), - cfg, state_id, [src_node, dst_node]) + args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + + custom_reduction)), cfg, state_id, [src_node, dst_node]) else: callsite_stream.write( (' {func}<{type}, {bdims}, {copysize}, ' + @@ -1236,8 +1234,12 @@ def _begin_streams(self, sdfg, state): result.add(e.dst._cuda_stream) return result - def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, - function_stream: CodeIOStream, callsite_stream: CodeIOStream, + def generate_state(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, generate_state_footer: bool = False) -> None: # Two modes: device-level state and if this state has active streams if CUDACodeGen._in_device_code: @@ -1361,8 +1363,7 @@ def generate_devicelevel_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: "&& threadIdx.x == 0) " "{ // sub-graph begin", cfg, state.block_id) elif write_scope == 'block': - callsite_stream.write("if (threadIdx.x == 0) " - "{ // sub-graph begin", cfg, state.block_id) + callsite_stream.write("if (threadIdx.x == 0) " "{ // sub-graph begin", cfg, 
state.block_id) else: callsite_stream.write("{ // subgraph begin", cfg, state.block_id) else: @@ -1985,16 +1986,13 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S # allocating shared memory for dynamic threadblock maps if has_dtbmap: - kernel_stream.write( - '__shared__ dace::' - 'DynamicMap<{fine_grained}, {block_size}>' - '::shared_type dace_dyn_map_shared;'.format( - fine_grained=('true' - if Config.get_bool('compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), - block_size=functools.reduce( - (lambda x, y: x * y), - [int(x) for x in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',')])), cfg, - state_id, node) + self.dynamic_tbmap_type = ( + f'dace::DynamicMap<{"true" if Config.get_bool("compiler", "cuda", "dynamic_map_fine_grained") else "false"}, ' + f'{functools.reduce((lambda x, y: x * y), [int(x) for x in Config.get("compiler", "cuda", "dynamic_map_block_size").split(",")])}>' + '::shared_type') + kernel_stream.write(f'__shared__ {self.dynamic_tbmap_type} dace_dyn_map_shared;', cfg, state_id, node) + else: + self.dynamic_tbmap_type = None # Add extra opening brace (dynamic map ranges, closed in MapExit # generator) @@ -2072,8 +2070,8 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S # Generate conditions for this block's execution using min and max # element, e.g., skipping out-of-bounds threads in trailing block - # unless thsi is handled by another map down the line - if (not has_tbmap and not has_dtbmap and node.map.schedule != dtypes.ScheduleType.GPU_Persistent): + # unless this is handled by another map down the line + if ((not has_tbmap or has_dtbmap) and node.map.schedule != dtypes.ScheduleType.GPU_Persistent): dsym_end = [d + bs - 1 for d, bs in zip(dsym, self._block_dims)] minels = krange.min_element() maxels = krange.max_element() @@ -2090,10 +2088,12 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S condition += '%s < %s' % (v, _topy(maxel + 1)) if len(condition) > 0: self._kernel_grid_conditions.append(f'if ({condition}) {{') - kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) + if not has_dtbmap: + kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) else: self._kernel_grid_conditions.append('{') - kernel_stream.write('{', cfg, state_id, scope_entry) + if not has_dtbmap: + kernel_stream.write('{', cfg, state_id, scope_entry) self._dispatcher.dispatch_subgraph(sdfg, cfg, @@ -2112,6 +2112,7 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S self._kernel_state = None CUDACodeGen._in_device_code = False self._grid_dims = None + self.dynamic_tbmap_type = None def get_next_scope_entries(self, dfg, scope_entry): parent_scope_entry = dfg.entry_node(scope_entry) @@ -2179,10 +2180,8 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco current_sdfg = current_state.parent if not outer_scope: raise ValueError(f'Failed to find the outer scope of {scope_entry}') - callsite_stream.write( - 'if ({} < {}) {{'.format(outer_scope.map.params[0], - _topy(subsets.Range(outer_scope.map.range[::-1]).max_element()[0] + 1)), cfg, - state_id, scope_entry) + for cond in self._kernel_grid_conditions: + callsite_stream.write(cond, cfg, state_id, scope_entry) # NOTE: Dynamic map inputs must be defined both outside and inside the dynamic Map schedule. # They define inside the schedule the bounds of the any nested Maps. 
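The cuda.py hunks above rework how dynamic thread-block maps are emitted (the shared `dace_dyn_map_shared` declaration and the reuse of the kernel grid conditions). Below is a hedged sketch of a program that would exercise this code path; the `dace.map[...] @ schedule` syntax and the enum names are assumed from DaCe's Python frontend, the names and sizes are illustrative, and compiling it requires a CUDA-capable setup:

```python
import dace

N = dace.symbol('N')

@dace.program
def dyn_tb(A: dace.float64[N, N], B: dace.float64[N, N]):
    # The outer map becomes the GPU kernel grid; the inner map requests the
    # dynamic thread-block schedule handled by the generator changes above.
    for i in dace.map[0:N] @ dace.ScheduleType.GPU_Device:
        for j in dace.map[0:N] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic:
            B[i, j] = A[i, j] + 1.0

# After lowering for GPU (e.g., moving A and B to GPU_Global storage),
# the kernel generator emits the __shared__ dace_dyn_map_shared buffer
# that the changes above declare and pass on to nested SDFGs.
```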
@@ -2205,8 +2204,9 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco '__dace_dynmap_begin = {begin};\n' '__dace_dynmap_end = {end};'.format(begin=dynmap_begin, end=dynmap_end), cfg, state_id, scope_entry) - # close if - callsite_stream.write('}', cfg, state_id, scope_entry) + # Close kernel grid conditions + for _ in self._kernel_grid_conditions: + callsite_stream.write('}', cfg, state_id, scope_entry) callsite_stream.write( 'dace::DynamicMap<{fine_grained}, {bsize}>::' @@ -2215,7 +2215,7 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco 'auto {param}) {{'.format(fine_grained=('true' if Config.get_bool( 'compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), bsize=total_block_size, - kmapIdx=outer_scope.map.params[0], + kmapIdx=outer_scope.map.params[-1], param=dynmap_var), cfg, state_id, scope_entry) for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): @@ -2556,8 +2556,8 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco for cond in self._kernel_grid_conditions: callsite_stream.write(cond, cfg, state_id, scope_entry) - def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: if self.node_dispatch_predicate(sdfg, dfg, node): # Dynamically obtain node generator according to class name gen = getattr(self, '_generate_' + type(node).__name__, False) @@ -2594,6 +2594,8 @@ def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) if self.create_grid_barrier: result.append(('cub::GridBarrier&', '__gbar', '__gbar')) + if self.dynamic_tbmap_type: + result.append((f'{self.dynamic_tbmap_type}&', 'dace_dyn_map_shared', 'dace_dyn_map_shared')) # Add data from nested SDFGs to kernel arguments result.extend([(atype, aname, aname) for atype, aname, _ in self.extra_nsdfg_args]) diff --git a/dace/codegen/tools/type_inference.py b/dace/codegen/tools/type_inference.py index 893866522f..8f8dd84151 100644 --- a/dace/codegen/tools/type_inference.py +++ b/dace/codegen/tools/type_inference.py @@ -9,7 +9,7 @@ import numpy as np import ast -from dace import dtypes +from dace import data, dtypes from dace import symbolic from dace.codegen import cppunparse from dace.symbolic import symbol, SymExpr, symstr @@ -286,6 +286,8 @@ def _Name(t, symbols, inferred_symbols): inferred_type = dtypes.typeclass(inferred_type.type) elif isinstance(inferred_type, symbolic.symbol): inferred_type = inferred_type.dtype + elif isinstance(inferred_type, data.Data): + inferred_type = inferred_type.dtype elif t_id in inferred_symbols: inferred_type = inferred_symbols[t_id] return inferred_type diff --git a/dace/config_schema.yml b/dace/config_schema.yml index da35e61997..7afb06a50a 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -919,6 +919,13 @@ required: description: > Check for undefined symbols in memlets during SDFG validation. + check_race_conditions: + type: bool + default: false + title: Check race conditions + description: > + Check for potential race conditions during validation. 
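The new `check_race_conditions` entry can be toggled like any other configuration key. A minimal hedged sketch follows; the `('experimental', 'check_race_conditions')` path is assumed from the surrounding schema context, and the program is illustrative only:

```python
import dace

@dace.program
def scale(x: dace.float64[64], y: dace.float64[64]):
    y[:] = 2.0 * x

# Enable the check only for this block; dace.config.set_temporary restores
# the previous value on exit.
with dace.config.set_temporary('experimental', 'check_race_conditions', value=True):
    sdfg = scale.to_sdfg()
    sdfg.validate()  # race-condition checks run as part of SDFG validation
```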
+ ############################################# # Features for unit testing diff --git a/dace/distr_types.py b/dace/distr_types.py index 1b595a1b84..b60eb4925e 100644 --- a/dace/distr_types.py +++ b/dace/distr_types.py @@ -96,6 +96,10 @@ def _validate(self): raise ValueError('Color must have only logical true (1) or false (0) values.') return True + @property + def dtype(self): + return type(self) + def to_json(self): attrs = serialize.all_properties_to_json(self) retdict = {"type": type(self).__name__, "attributes": attrs} diff --git a/dace/dtypes.py b/dace/dtypes.py index c5f9bb4732..a016ac60e2 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1,10 +1,8 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ A module that contains various DaCe type definitions. """ -from __future__ import print_function import ctypes import aenum import inspect -import itertools import numpy import re from collections import OrderedDict diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 88a6b0c54a..c517028d53 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -50,14 +50,14 @@ def _cart_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, dims: Shape @oprepo.replaces_method('Intracomm', 'Create_cart') -def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', dims: ShapeType): +def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, dims: ShapeType): """ Equivalent to `dace.comm.Cart_create(dims). :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. :return: Name of the new process-grid descriptor. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _cart_create(pv, sdfg, state, dims) @@ -186,13 +186,13 @@ def _bcast(pv: ProgramVisitor, def _intracomm_bcast(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, - comm: Tuple[str, 'Comm'], + comm: str, buffer: str, root: Union[str, sp.Expr, Number] = 0): """ Equivalent to `dace.comm.Bcast(buffer, root)`. """ from mpi4py import MPI - comm_name, comm_obj = comm + comm_name, comm_obj = comm, pv.globals[comm] if comm_obj == MPI.COMM_WORLD: return _bcast(pv, sdfg, state, buffer, root) # NOTE: Highly experimental @@ -267,12 +267,12 @@ def _alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, inbuffer: str, @oprepo.replaces_method('Intracomm', 'Alltoall') -def _intracomm_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', inp_buffer: str, +def _intracomm_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, inp_buffer: str, out_buffer: str): """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer)`. 
""" from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _alltoall(pv, sdfg, state, inp_buffer, out_buffer) @@ -303,12 +303,12 @@ def _allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op @oprepo.replaces_method('Intracomm', 'Allreduce') -def _intracomm_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', inp_buffer: 'InPlace', +def _intracomm_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, inp_buffer: 'InPlace', out_buffer: str, op: str): """ Equivalent to `dace.comm.Allreduce(out_buffer, op)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') if inp_buffer != MPI.IN_PLACE: @@ -470,12 +470,12 @@ def _send(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Send') -def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.end(buffer, dst, tag)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _send(pv, sdfg, state, buffer, dst, tag) @@ -592,12 +592,12 @@ def _isend(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Isend') -def _intracomm_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') req, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) @@ -690,12 +690,12 @@ def _recv(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Recv') -def _intracomm_Recv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_Recv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Recv(buffer, src, tagq)`. 
""" from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _recv(pv, sdfg, state, buffer, src, tag) @@ -810,12 +810,12 @@ def _irecv(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Irecv') -def _intracomm_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Irecv(buffer, src, tag, req)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') req, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) diff --git a/dace/frontend/python/README.md b/dace/frontend/python/README.md index bd57e36519..aa176f687c 100644 --- a/dace/frontend/python/README.md +++ b/dace/frontend/python/README.md @@ -4,7 +4,7 @@ The Python-Frontend aims to assist users in creating SDFGs from Python code relatively quickly. You may read a list of supported Python features [here](python_supported_features.md). The frontend supports also operations among DaCe arrays, in a manner similar to NumPy. A short tutorial can be bound -[here](https://nbviewer.jupyter.org/github/spcl/dace/blob/master/tutorials/numpy_frontend.ipynb). +[here](https://nbviewer.jupyter.org/github/spcl/dace/blob/main/tutorials/numpy_frontend.ipynb). Please note that the Python-Frontend is still in an early version. For any issues and feature requests, you can create an issue in the main DaCe project. You can also address any questions you have to alziogas@inf.ethz.ch diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index cacf15d785..b4e83cc1e7 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1342,7 +1342,7 @@ def defined(self): # MPI-related stuff result.update({ - k: self.sdfg.process_grids[v] + v: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids }) try: @@ -1489,19 +1489,19 @@ def _symbols_from_params(self, params: List[Tuple[str, Union[str, dtypes.typecla else: values = str(val).split(':') if len(values) == 1: - result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.globals, **dyn_inputs})) + result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.defined, **dyn_inputs})) elif len(values) == 2: result[name] = symbolic.symbol( name, dtypes.result_type_of(infer_expr_type(values[0], { - **self.globals, + **self.defined, **dyn_inputs }), infer_expr_type(values[1], { - **self.globals, + **self.defined, **dyn_inputs }))) elif len(values) == 3: - result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.globals, **dyn_inputs})) + result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.defined, **dyn_inputs})) else: raise DaceSyntaxError( self, None, "Invalid number of arguments in a range iterator. 
" @@ -3258,18 +3258,23 @@ def visit_AnnAssign(self, node: ast.AnnAssign): dtype = astutils.evalnode(node.annotation, {**self.globals, **self.defined}) if isinstance(dtype, data.Data): simple_type = dtype.dtype + storage = dtype.storage else: simple_type = dtype + storage = dtypes.StorageType.Default if not isinstance(simple_type, dtypes.typeclass): raise TypeError except: dtype = None + storage = dtypes.StorageType.Default type_name = rname(node.annotation) warnings.warn('typeclass {} is not supported'.format(type_name)) if node.value is None and dtype is not None: # Annotating type without assignment self.annotated_types[rname(node.target)] = dtype return - self._visit_assign(node, node.target, None, dtype=dtype) + results = self._visit_assign(node, node.target, None, dtype=dtype) + if storage != dtypes.StorageType.Default: + self.sdfg.arrays[results[0][0]].storage = storage def _visit_assign(self, node, node_target, op, dtype=None, is_return=False): # Get targets (elts) and results @@ -3563,6 +3568,8 @@ def _visit_assign(self, node, node_target, op, dtype=None, is_return=False): self.cfg_target.add_edge(self.last_block, output_indirection, dace.sdfg.InterstateEdge()) self.last_block = output_indirection + return results + def visit_AugAssign(self, node: ast.AugAssign): self._visit_assign(node, node.target, augassign_ops[type(node.op).__name__]) @@ -4454,7 +4461,14 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): func = node.func.value if func is None: - funcname = rname(node) + func_result = self.visit(node.func) + if isinstance(func_result, str): + if isinstance(node.func, ast.Attribute): + funcname = f'{func_result}.{node.func.attr}' + else: + funcname = func_result + else: + funcname = rname(node) # Check if the function exists as an SDFG in a different module modname = until(funcname, '.') if ('.' 
in funcname and len(modname) > 0 and modname in self.globals @@ -4569,7 +4583,7 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): arg = self.scope_vars[modname] else: # Fallback to (name, object) - arg = (modname, self.defined[modname]) + arg = modname args.append(arg) # Otherwise, try to find a default implementation for the SDFG elif not found_ufunc: @@ -4623,10 +4637,16 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): self._add_state('call_%d' % node.lineno) self.last_block.set_default_lineinfo(self.current_lineinfo) - if found_ufunc: - result = func(self, node, self.sdfg, self.last_block, ufunc_name, args, keywords) - else: - result = func(self, self.sdfg, self.last_block, *args, **keywords) + try: + if found_ufunc: + result = func(self, node, self.sdfg, self.last_block, ufunc_name, args, keywords) + else: + result = func(self, self.sdfg, self.last_block, *args, **keywords) + except DaceSyntaxError as ex: + # Attach source information to exception + if ex.node is None: + ex.node = node + raise self.last_block.set_default_lineinfo(None) @@ -4782,12 +4802,18 @@ def _visitname(self, name: str, node: ast.AST): self.sdfg.add_symbol(result.name, result.dtype) return result + if name in self.closure.callbacks: + return name + if name in self.sdfg.arrays: return name if name in self.sdfg.symbols: return name + if name in __builtins__: + return name + if name not in self.scope_vars: raise DaceSyntaxError(self, node, 'Use of undefined variable "%s"' % name) rname = self.scope_vars[name] @@ -4832,33 +4858,43 @@ def visit_NameConstant(self, node: NameConstant): return self.visit_Constant(node) def visit_Attribute(self, node: ast.Attribute): - # If visiting an attribute, return attribute value if it's of an array or global - name = until(astutils.unparse(node), '.') - result = self._visitname(name, node) + result = self.visit(node.value) + if isinstance(result, (tuple, list, dict)): + if len(result) > 1: + raise DaceSyntaxError( + self, node.value, f'{type(result)} object cannot use attributes. 
Try storing the ' + 'object to a different variable first (e.g., ``a = result; a.attribute``') + else: + result = result[0] + tmpname = f"{result}.{astutils.unparse(node.attr)}" if tmpname in self.sdfg.arrays: return tmpname + if isinstance(result, str) and result in self.sdfg.arrays: arr = self.sdfg.arrays[result] elif isinstance(result, str) and result in self.scope_arrays: arr = self.scope_arrays[result] else: - return result + arr = None # Try to find sub-SDFG attribute - func = oprepo.Replacements.get_attribute(type(arr), node.attr) - if func is not None: - # A new state is likely needed here, e.g., for transposition (ndarray.T) - self._add_state('%s_%d' % (type(node).__name__, node.lineno)) - self.last_block.set_default_lineinfo(self.current_lineinfo) - result = func(self, self.sdfg, self.last_block, result) - self.last_block.set_default_lineinfo(None) - return result + if arr is not None: + func = oprepo.Replacements.get_attribute(type(arr), node.attr) + if func is not None: + # A new state is likely needed here, e.g., for transposition (ndarray.T) + self._add_state('%s_%d' % (type(node).__name__, node.lineno)) + self.last_block.set_default_lineinfo(self.current_lineinfo) + result = func(self, self.sdfg, self.last_block, result) + self.last_block.set_default_lineinfo(None) + return result # Otherwise, try to find compile-time attribute (such as shape) try: - return getattr(arr, node.attr) - except KeyError: + if arr is not None: + return getattr(arr, node.attr) + return getattr(result, node.attr) + except (AttributeError, KeyError): return result def visit_List(self, node: ast.List): diff --git a/dace/frontend/python/parser.py b/dace/frontend/python/parser.py index d99be1265d..d03759fa8e 100644 --- a/dace/frontend/python/parser.py +++ b/dace/frontend/python/parser.py @@ -92,14 +92,15 @@ def infer_symbols_from_datadescriptor(sdfg: SDFG, desc = sdfg.arrays[arg_name] if not hasattr(desc, 'shape') or not hasattr(arg_val, 'shape'): continue - symbolic_values = list(desc.shape) + list(getattr(desc, 'strides', [])) + symbolic_values = list(desc.shape) + list(getattr(desc, 'strides', [])) + list(getattr(desc, 'offset', [])) given_values = list(arg_val.shape) given_strides = [] if hasattr(arg_val, 'strides'): # NumPy arrays use bytes in strides factor = getattr(arg_val, 'itemsize', 1) given_strides = [s // factor for s in arg_val.strides] - given_values += given_strides + given_offset = [o for o in arg_val.offset] if hasattr(arg_val, 'offset') else [] + given_values += given_strides + given_offset for sym_dim, real_dim in zip(symbolic_values, given_values): repldict = {} diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index eca07a4930..f51b67ddb2 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -527,6 +527,8 @@ def global_value_to_node(self, elif isinstance(value, symbolic.symbol): # Symbols resolve to the symbol name newnode = ast.Name(id=value.name, ctx=ast.Load()) + elif isinstance(value, sympy.Basic): # Symbolic or constant expression + newnode = ast.parse(symbolic.symstr(value)).body[0].value elif isinstance(value, ast.Name): newnode = ast.Name(id=value.id, ctx=ast.Load()) elif (dtypes.isconstant(value) or isinstance(value, (StringLiteral, SDFG)) or hasattr(value, '__sdfg__')): diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 5e6118a34b..c5b3e3b2a2 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -313,6 
+313,9 @@ def _numpy_full(pv: ProgramVisitor, """ Creates and array of the specified shape and initializes it with the fill value. """ + if isinstance(shape, Number) or symbolic.issymbolic(shape): + shape = [shape] + is_data = False if isinstance(fill_value, (Number, np.bool_)): vtype = dtypes.dtype_to_typeclass(type(fill_value)) @@ -322,24 +325,30 @@ def _numpy_full(pv: ProgramVisitor, is_data = True vtype = sdfg.arrays[fill_value].dtype dtype = dtype or vtype + + # Handle one-dimensional inputs + if isinstance(shape, (Number, str)) or symbolic.issymbolic(shape): + shape = [shape] + + if any(isinstance(s, str) for s in shape): + raise DaceSyntaxError( + pv, None, f'Data-dependent shape {shape} is currently not allowed. Only constants ' + 'and symbolic values can be used.') + name, _ = sdfg.add_temp_transient(shape, dtype) if is_data: state.add_mapped_tasklet( - '_numpy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, dict(__inp=dace.Memlet(data=fill_value, subset='0')), "__out = __inp", dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) else: state.add_mapped_tasklet( - '_numpy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, {}, + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, {}, "__out = {}".format(fill_value), dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) @@ -459,10 +468,8 @@ def _numpy_flip(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, axis inpidx = ','.join([f'__i{i}' for i in range(ndim)]) outidx = ','.join([f'{s} - __i{i} - 1' if a else f'__i{i}' for i, (a, s) in enumerate(zip(axis, desc.shape))]) state.add_mapped_tasklet(name="_numpy_flip_", - map_ranges={ - f'__i{i}': f'0:{s}:1' - for i, s in enumerate(desc.shape) - }, + map_ranges={f'__i{i}': f'0:{s}:1' + for i, s in enumerate(desc.shape)}, inputs={'__inp': Memlet(f'{arr}[{inpidx}]')}, code='__out = __inp', outputs={'__out': Memlet(f'{arr_copy}[{outidx}]')}, @@ -532,10 +539,8 @@ def _numpy_rot90(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, k=1 outidx = ','.join(out_indices) state.add_mapped_tasklet(name="_rot90_", - map_ranges={ - f'__i{i}': f'0:{s}:1' - for i, s in enumerate(desc.shape) - }, + map_ranges={f'__i{i}': f'0:{s}:1' + for i, s in enumerate(desc.shape)}, inputs={'__inp': Memlet(f'{arr}[{inpidx}]')}, code='__out = __inp', outputs={'__out': Memlet(f'{arr_copy}[{outidx}]')}, @@ -546,8 +551,13 @@ def _numpy_rot90(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, k=1 @oprepo.replaces('numpy.arange') @oprepo.replaces('dace.arange') -def _arange(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, *args, **kwargs): - """ Implementes numpy.arange """ +def _arange(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + *args, + dtype: dtypes.typeclass = None, + like: Optional[str] = None): + """ Implements numpy.arange """ start = 0 stop = None @@ -561,35 +571,42 @@ def _arange(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, *args, **kwargs): else: start, stop, step = args + if isinstance(start, str): + raise TypeError(f'Cannot compile numpy.arange with a scalar start value "{start}" (only constants and symbolic ' + 'expressions are supported). 
Please use numpy.linspace instead.') + if isinstance(stop, str): + raise TypeError(f'Cannot compile numpy.arange with a scalar stop value "{stop}" (only constants and symbolic ' + 'expressions are supported). Please use numpy.linspace instead.') + if isinstance(step, str): + raise TypeError(f'Cannot compile numpy.arange with a scalar step value "{step}" (only constants and symbolic ' + 'expressions are supported). Please use numpy.linspace instead.') + actual_step = step if isinstance(start, Number) and isinstance(stop, Number): actual_step = type(start + step)(start + step) - start if any(not isinstance(s, Number) for s in [start, stop, step]): - shape = (symbolic.int_ceil(stop - start, step), ) + if step == 1: # Common case where ceiling is not necessary + shape = (stop - start,) + else: + shape = (symbolic.int_ceil(stop - start, step), ) else: shape = (np.int64(np.ceil((stop - start) / step)), ) - if not isinstance(shape[0], Number) and ('dtype' not in kwargs or kwargs['dtype'] == None): - raise NotImplementedError("The current implementation of numpy.arange requires that the output dtype is given " - "when at least one of (start, stop, step) is symbolic.") + # Infer dtype from input arguments + if dtype is None: + dtype, _ = _result_type(args) + # TODO: Unclear what 'like' does - # if 'like' in kwargs and kwargs['like'] != None: - # outname, outarr = sdfg.add_temp_transient_like(sdfg.arrays[kwargs['like']]) + # if like is not None: + # outname, outarr = sdfg.add_temp_transient_like(sdfg.arrays[like]) # outarr.shape = shape - if 'dtype' in kwargs and kwargs['dtype'] != None: - dtype = kwargs['dtype'] - if not isinstance(dtype, dtypes.typeclass): - dtype = dtypes.dtype_to_typeclass(dtype) - outname, outarr = sdfg.add_temp_transient(shape, dtype) - else: - # infer dtype based on args's dtype - # (since the `dtype` keyword argument isn't given, none of the arguments can be symbolic) - if any(isinstance(arg, (float, np.float32, np.float64)) for arg in args): - dtype = dtypes.float64 - else: - dtype = dtypes.int64 - outname, outarr = sdfg.add_temp_transient(shape, dtype) + if not isinstance(dtype, dtypes.typeclass): + dtype = dtypes.dtype_to_typeclass(dtype) + outname, outarr = sdfg.add_temp_transient(shape, dtype) + + start = f'decltype(__out)({start})' + actual_step = f'decltype(__out)({actual_step})' state.add_mapped_tasklet(name="_numpy_arange_", map_ranges={'__i': f"0:{shape[0]}"}, @@ -601,6 +618,131 @@ def _arange(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, *args, **kwargs): return outname +def _add_axis_to_shape(shape: Sequence[symbolic.SymbolicType], axis: int, + axis_value: Any) -> List[symbolic.SymbolicType]: + if axis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + if axis < 0: + naxis = len(shape) + 1 + axis + if naxis < 0 or naxis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + axis = naxis + + # Make a new shape list with the inserted dimension + new_shape = [None] * (len(shape) + 1) + for i in range(len(shape) + 1): + if i == axis: + new_shape[i] = axis_value + elif i < axis: + new_shape[i] = shape[i] + else: + new_shape[i] = shape[i - 1] + + return new_shape + + +@oprepo.replaces('numpy.linspace') +def _linspace(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + start: Union[Number, symbolic.SymbolicType, str], + stop: Union[Number, symbolic.SymbolicType, str], + num: Union[Integral, symbolic.SymbolicType] = 50, + endpoint: bool = True, + retstep: bool = 
False, + dtype: dtypes.typeclass = None, + axis: int = 0): + """ Implements numpy.linspace """ + # Argument checks + if not isinstance(num, (Integral, sp.Basic)): + raise TypeError('numpy.linspace can only be compiled when the ``num`` argument is symbolic or constant.') + if not isinstance(axis, Integral): + raise TypeError('numpy.linspace can only be compiled when the ``axis`` argument is constant.') + + # Start and stop are broadcast together, then, a new dimension is added to axis (taken from ``ndim + 1``), + # along which the numbers are filled. + start_shape = sdfg.arrays[start].shape if (isinstance(start, str) and start in sdfg.arrays) else [] + stop_shape = sdfg.arrays[stop].shape if (isinstance(stop, str) and stop in sdfg.arrays) else [] + + shape, ranges, outind, ind1, ind2 = _broadcast_together(start_shape, stop_shape) + shape_with_axis = _add_axis_to_shape(shape, axis, num) + ranges_with_axis = _add_axis_to_shape(ranges, axis, ('__sind', f'0:{symbolic.symstr(num)}')) + if outind: + outind_with_axis = _add_axis_to_shape(outind.split(', '), axis, '__sind') + else: + outind_with_axis = ['__sind'] + + if dtype is None: + # Infer output type from start and stop + start_type = sdfg.arrays[start] if (isinstance(start, str) and start in sdfg.arrays) else start + stop_type = sdfg.arrays[stop] if (isinstance(stop, str) and stop in sdfg.arrays) else stop + + dtype, _ = _result_type((start_type, stop_type), 'Add') + + # From the NumPy documentation: The inferred dtype will never be an integer; float is chosen even if the + # arguments would produce an array of integers. + if dtype in (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16, dtypes.uint32, + dtypes.uint64): + dtype = dtypes.dtype_to_typeclass(float) + + outname, _ = sdfg.add_temp_transient(shape_with_axis, dtype) + + if endpoint == True: + num -= 1 + + # Fill in input memlets as necessary + inputs = {} + if isinstance(start, str) and start in sdfg.arrays: + index = f'[{ind1}]' if ind1 else '' + inputs['__start'] = Memlet(f'{start}{index}') + startcode = '__start' + else: + startcode = symbolic.symstr(start) + + if isinstance(stop, str) and stop in sdfg.arrays: + index = f'[{ind2}]' if ind2 else '' + inputs['__stop'] = Memlet(f'{stop}{index}') + stopcode = '__stop' + else: + stopcode = symbolic.symstr(stop) + + # Create tasklet code based on inputs + code = f'__out = {startcode} + __sind * decltype(__out)({stopcode} - {startcode}) / decltype(__out)({symbolic.symstr(num)})' + + state.add_mapped_tasklet(name="linspace", + map_ranges=ranges_with_axis, + inputs=inputs, + code=code, + outputs={'__out': Memlet(f"{outname}[{','.join(outind_with_axis)}]")}, + external_edges=True) + + if retstep == False: + return outname + + # Return step if requested + + # Handle scalar outputs + if not ranges: + ranges = [('__unused', '0:1')] + out_index = f'[{outind}]' + + if len(shape) > 0: + stepname, _ = sdfg.add_temp_transient(shape, dtype) + else: + stepname, _ = sdfg.add_scalar(sdfg.temp_data_name(), dtype, transient=True) + out_index = '[0]' + + state.add_mapped_tasklet( + 'retstep', + ranges, + copy.deepcopy(inputs), + f'__out = decltype(__out)({stopcode} - {startcode}) / decltype(__out)({symbolic.symstr(num)})', + {'__out': Memlet(f"{stepname}{out_index}")}, + external_edges=True) + + return outname, stepname + + @oprepo.replaces('elementwise') @oprepo.replaces('dace.elementwise') def _elementwise(pv: 'ProgramVisitor', @@ -644,7 +786,8 @@ def _elementwise(pv: 'ProgramVisitor', else: state.add_mapped_tasklet( 
name="_elementwise_", - map_ranges={f'__i{dim}': f'0:{N}' for dim, N in enumerate(inparr.shape)}, + map_ranges={f'__i{dim}': f'0:{N}' + for dim, N in enumerate(inparr.shape)}, inputs={'__inp': Memlet.simple(in_array, ','.join([f'__i{dim}' for dim in range(len(inparr.shape))]))}, code=code, outputs={'__out': Memlet.simple(out_array, ','.join([f'__i{dim}' for dim in range(len(inparr.shape))]))}, @@ -694,10 +837,8 @@ def _simple_call(sdfg: SDFG, state: SDFGState, inpname: str, func: str, restype: else: state.add_mapped_tasklet( name=func, - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(inparr.shape) - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(inparr.shape)}, inputs={'__inp': Memlet.simple(inpname, ','.join(['__i%d' % i for i in range(len(inparr.shape))]))}, code='__out = {f}(__inp)'.format(f=func), outputs={'__out': Memlet.simple(outname, ','.join(['__i%d' % i for i in range(len(inparr.shape))]))}, @@ -707,9 +848,9 @@ def _simple_call(sdfg: SDFG, state: SDFGState, inpname: str, func: str, restype: def _complex_to_scalar(complex_type: dace.typeclass): - if complex_type is dace.complex64: + if complex_type == dace.complex64: return dace.float32 - elif complex_type is dace.complex128: + elif complex_type == dace.complex128: return dace.float64 else: return complex_type @@ -813,7 +954,8 @@ def _len_array(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, a: str): return sdfg.arrays[a].shape[0] if a in sdfg.constants_prop: return len(sdfg.constants[a]) - raise TypeError(f'`len` is not supported for input "{a}" (type {type(a)})') + else: + return len(a) @oprepo.replaces('transpose') @@ -1046,27 +1188,22 @@ def _argminmax(pv: ProgramVisitor, code = "__init = _val_and_idx(val={}, idx=-1)".format( dtypes.min_value(a_arr.dtype) if func == 'max' else dtypes.max_value(a_arr.dtype)) - nest.add_state().add_mapped_tasklet(name="_arg{}_convert_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) if i != axis - }, - inputs={}, - code=code, - outputs={ - '__init': - Memlet.simple( - reduced_structs, - ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) - }, - external_edges=True) + nest.add_state().add_mapped_tasklet( + name="_arg{}_convert_".format(func), + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape) if i != axis}, + inputs={}, + code=code, + outputs={ + '__init': Memlet.simple(reduced_structs, + ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) + }, + external_edges=True) nest.add_state().add_mapped_tasklet( name="_arg{}_reduce_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape)}, inputs={'__in': Memlet.simple(a, ','.join('__i%d' % i for i in range(len(a_arr.shape))))}, code="__out = _val_and_idx(idx={}, val=__in)".format("__i%d" % axis), outputs={ @@ -1086,10 +1223,8 @@ def _argminmax(pv: ProgramVisitor, nest.add_state().add_mapped_tasklet( name="_arg{}_extract_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) if i != axis - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape) if i != axis}, inputs={ '__in': Memlet.simple(reduced_structs, ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) @@ -1212,10 +1347,9 @@ def _unop(sdfg: SDFG, state: SDFGState, op1: str, opcode: str, opname: str): opcode = 'not' name, _ = sdfg.add_temp_transient(arr1.shape, restype, 
arr1.storage) - state.add_mapped_tasklet("_%s_" % opname, { - '__i%d' % i: '0:%s' % s - for i, s in enumerate(arr1.shape) - }, {'__in1': Memlet.simple(op1, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, + state.add_mapped_tasklet("_%s_" % opname, {'__i%d' % i: '0:%s' % s + for i, s in enumerate(arr1.shape)}, + {'__in1': Memlet.simple(op1, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, '__out = %s __in1' % opcode, {'__out': Memlet.simple(name, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, external_edges=True) @@ -1639,8 +1773,17 @@ def _result_type(arguments: Sequence[Union[str, Number, symbolic.symbol, sp.Basi else: # Operators with 3 or more arguments result_type = _np_result_type(dtypes_for_result) + coarse_result_type = None + if result_type in complex_types: + coarse_result_type = 3 # complex + elif result_type in float_types: + coarse_result_type = 2 # float + elif result_type in signed_types: + coarse_result_type = 1 # signed integer, bool + else: + coarse_result_type = 0 # unsigned integer for i, t in enumerate(coarse_types): - if t != result_type: + if t != coarse_result_type: casting[i] = _cast_str(result_type) return result_type, casting @@ -2519,6 +2662,13 @@ def _matmult(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, op1: str, op code="__out = log1p(__in1)", reduce=None, initial=np.log1p.identity), + clip=dict(name="_numpy_clip_", + operator=None, + inputs=["__in_a", "__in_amin", "__in_amax"], + outputs=["__out"], + code="__out = min(max(__in_a, __in_amin), __in_amax)", + reduce=None, + initial=np.inf), sqrt=dict(name="_numpy_sqrt_", operator="Sqrt", inputs=["__in1"], @@ -4094,14 +4244,13 @@ def implement_ufunc_outer(visitor: ProgramVisitor, ast_node: ast.Call, sdfg: SDF @oprepo.replaces('numpy.reshape') -def reshape( - pv: ProgramVisitor, - sdfg: SDFG, - state: SDFGState, - arr: str, - newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]], - order: StringLiteral = StringLiteral('C') -) -> str: +def reshape(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + arr: str, + newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]], + order: StringLiteral = StringLiteral('C'), + strides: Optional[Any] = None) -> str: if isinstance(arr, (list, tuple)) and len(arr) == 1: arr = arr[0] desc = sdfg.arrays[arr] @@ -4115,10 +4264,11 @@ def reshape( # New shape and strides as symbolic expressions newshape = [symbolic.pystr_to_symbolic(s) for s in newshape] - if fortran_strides: - strides = [data._prod(newshape[:i]) for i in range(len(newshape))] - else: - strides = [data._prod(newshape[i + 1:]) for i in range(len(newshape))] + if strides is None: + if fortran_strides: + strides = [data._prod(newshape[:i]) for i in range(len(newshape))] + else: + strides = [data._prod(newshape[i + 1:]) for i in range(len(newshape))] newarr, newdesc = sdfg.add_view(arr, newshape, @@ -4316,10 +4466,8 @@ def _ndarray_fill(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, va shape = sdfg.arrays[arr].shape state.add_mapped_tasklet( '_numpy_fill_', - map_ranges={ - f"__i{dim}": f"0:{s}" - for dim, s in enumerate(shape) - }, + map_ranges={f"__i{dim}": f"0:{s}" + for dim, s in enumerate(shape)}, inputs=inputs, code=f"__out = {body}", outputs={'__out': dace.Memlet.simple(arr, ",".join([f"__i{dim}" for dim in range(len(shape))]))}, @@ -4335,9 +4483,13 @@ def _ndarray_reshape( sdfg: SDFG, state: SDFGState, arr: str, - newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, 
symbolic.SymbolicType]]], + *newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]], order: StringLiteral = StringLiteral('C') ) -> str: + if len(newshape) == 0: + raise TypeError('reshape() takes at least 1 argument (0 given)') + if len(newshape) == 1 and isinstance(newshape, (list, tuple)): + newshape = newshape[0] return reshape(pv, sdfg, state, arr, newshape, order) @@ -4544,6 +4696,13 @@ def _ndarray_astype(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, return _datatype_converter(sdfg, state, arr, dtype)[0] +@oprepo.replaces_operator('Array', 'MatMult', otherclass='StorageType') +def _cast_storage(visitor: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, arr: str, stype: dace.StorageType) -> str: + desc = sdfg.arrays[arr] + desc.storage = stype + return arr + + # Replacements that need ufuncs ############################################### # TODO: Fix by separating to different modules and importing @@ -4747,13 +4906,7 @@ def _tensordot(pv: 'ProgramVisitor', @oprepo.replaces("cupy._core.core.ndarray") @oprepo.replaces("cupy.ndarray") -def _define_cupy_local( - pv: "ProgramVisitor", - sdfg: SDFG, - state: SDFGState, - shape: Shape, - dtype: typeclass -): +def _define_cupy_local(pv: "ProgramVisitor", sdfg: SDFG, state: SDFGState, shape: Shape, dtype: typeclass): """Defines a local array in a DaCe program.""" if not isinstance(shape, (list, tuple)): shape = [shape] @@ -4781,10 +4934,8 @@ def _cupy_full(pv: ProgramVisitor, name, _ = sdfg.add_temp_transient(shape, dtype, storage=dtypes.StorageType.GPU_Global) state.add_mapped_tasklet( - '_cupy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, {}, + '_cupy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, {}, "__out = {}".format(fill_value), dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) @@ -4843,3 +4994,407 @@ def _op(visitor: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: StringLite for op, method in _boolop_to_method.items(): _makeboolop(op, method) + + +@oprepo.replaces('numpy.concatenate') +def _concat(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + arrays: Tuple[Any], + axis: Optional[int] = 0, + out: Optional[Any] = None, + *, + dtype=None, + casting: str = 'same_kind'): + if dtype is not None and out is not None: + raise ValueError('Arguments dtype and out cannot be given together') + if casting != 'same_kind': + raise NotImplementedError('The casting argument is currently unsupported') + if not isinstance(arrays, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile concatenation') + if axis is not None and not isinstance(axis, Integral): + raise ValueError('Axis is not a compile-time evaluatable integer, cannot compile concatenation') + if len(arrays) == 1: + return arrays[0] + for i in range(len(arrays)): + if arrays[i] not in sdfg.arrays: + raise TypeError(f'Index {i} is not an array') + if out is not None: + if out not in sdfg.arrays: + raise TypeError('Output is not an array') + dtype = sdfg.arrays[out].dtype + + descs = [sdfg.arrays[arr] for arr in arrays] + shape = list(descs[0].shape) + + if axis is None: # Flatten arrays, then concatenate + arrays = [flat(visitor, sdfg, state, arr) for arr in arrays] + descs = [sdfg.arrays[arr] for arr in arrays] + shape = list(descs[0].shape) + axis = 0 + else: + # Check shapes for validity + first_shape = copy.copy(shape) + first_shape[axis] = 0 + for i, d in 
enumerate(descs[1:]): + other_shape = list(d.shape) + other_shape[axis] = 0 + if other_shape != first_shape: + raise ValueError(f'Array shapes do not match at index {i}') + + shape[axis] = sum(desc.shape[axis] for desc in descs) + if out is None: + if dtype is None: + dtype = descs[0].dtype + name, odesc = sdfg.add_temp_transient(shape, dtype, storage=descs[0].storage, lifetime=descs[0].lifetime) + else: + name = out + odesc = sdfg.arrays[out] + + # Make copies + w = state.add_write(name) + offset = 0 + subset = subsets.Range.from_array(odesc) + for arr, desc in zip(arrays, descs): + r = state.add_read(arr) + subset = copy.deepcopy(subset) + subset[axis] = (offset, offset + desc.shape[axis] - 1, 1) + state.add_edge(r, None, w, None, Memlet(data=name, subset=subset)) + offset += desc.shape[axis] + + return name + + +@oprepo.replaces('numpy.stack') +def _stack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + arrays: Tuple[Any], + axis: int = 0, + out: Any = None, + *, + dtype=None, + casting: str = 'same_kind'): + if dtype is not None and out is not None: + raise ValueError('Arguments dtype and out cannot be given together') + if casting != 'same_kind': + raise NotImplementedError('The casting argument is currently unsupported') + if not isinstance(arrays, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile stack call') + if not isinstance(axis, Integral): + raise ValueError('Axis is not a compile-time evaluatable integer, cannot compile stack call') + + for i in range(len(arrays)): + if arrays[i] not in sdfg.arrays: + raise TypeError(f'Index {i} is not an array') + + descs = [sdfg.arrays[a] for a in arrays] + shape = descs[0].shape + for i, d in enumerate(descs[1:]): + if d.shape != shape: + raise ValueError(f'Array shapes are not equal ({shape} != {d.shape} at index {i})') + + if axis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + if axis < 0: + naxis = len(shape) + 1 + axis + if naxis < 0 or naxis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + axis = naxis + + # Stacking is implemented as a reshape followed by concatenation + reshaped = [] + for arr, desc in zip(arrays, descs): + # Make a reshaped view with the inserted dimension + new_shape = [0] * (len(shape) + 1) + new_strides = [0] * (len(shape) + 1) + for i in range(len(shape) + 1): + if i == axis: + new_shape[i] = 1 + new_strides[i] = desc.strides[i - 1] if i != 0 else desc.strides[i] + elif i < axis: + new_shape[i] = shape[i] + new_strides[i] = desc.strides[i] + else: + new_shape[i] = shape[i - 1] + new_strides[i] = desc.strides[i - 1] + + rname = reshape(visitor, sdfg, state, arr, new_shape, strides=new_strides) + reshaped.append(rname) + + return _concat(visitor, sdfg, state, reshaped, axis, out, dtype=dtype, casting=casting) + + +@oprepo.replaces('numpy.vstack') +@oprepo.replaces('numpy.row_stack') +def _vstack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + tup: Tuple[Any], + *, + dtype=None, + casting: str = 'same_kind'): + if not isinstance(tup, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile stack call') + if tup[0] not in sdfg.arrays: + raise TypeError(f'Index 0 is not an array') + + # In the 1-D case, stacking is performed along the first axis + if len(sdfg.arrays[tup[0]].shape) == 1: + return _stack(visitor, sdfg, state, tup, axis=0, out=None, dtype=dtype, casting=casting) + # Otherwise, concatenation is performed + 
return _concat(visitor, sdfg, state, tup, axis=0, out=None, dtype=dtype, casting=casting) + + +@oprepo.replaces('numpy.hstack') +@oprepo.replaces('numpy.column_stack') +def _hstack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + tup: Tuple[Any], + *, + dtype=None, + casting: str = 'same_kind'): + if not isinstance(tup, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile stack call') + if tup[0] not in sdfg.arrays: + raise TypeError(f'Index 0 is not an array') + + # In the 1-D case, concatenation is performed along the first axis + if len(sdfg.arrays[tup[0]].shape) == 1: + return _concat(visitor, sdfg, state, tup, axis=0, out=None, dtype=dtype, casting=casting) + + return _concat(visitor, sdfg, state, tup, axis=1, out=None, dtype=dtype, casting=casting) + + +@oprepo.replaces('numpy.dstack') +def _dstack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + tup: Tuple[Any], + *, + dtype=None, + casting: str = 'same_kind'): + if not isinstance(tup, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile a stack call') + if tup[0] not in sdfg.arrays: + raise TypeError(f'Index 0 is not an array') + if len(sdfg.arrays[tup[0]].shape) < 3: + raise NotImplementedError('dstack is not implemented for arrays that are smaller than 3D') + + return _concat(visitor, sdfg, state, tup, axis=2, out=None, dtype=dtype, casting=casting) + + +def _split_core(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[int, Sequence[symbolic.SymbolicType], str], axis: int, allow_uneven: bool): + # Argument checks + if not isinstance(ary, str) or ary not in sdfg.arrays: + raise TypeError('Split object must be an array') + if not isinstance(axis, Integral): + raise ValueError('Cannot determine split dimension, axis is not a compile-time evaluatable integer') + + desc = sdfg.arrays[ary] + + # Test validity of axis + orig_axis = axis + if axis < 0: + axis = len(desc.shape) + axis + if axis < 0 or axis >= len(desc.shape): + raise ValueError(f'axis {orig_axis} is out of bounds for array of dimension {len(desc.shape)}') + + # indices_or_sections may only be an integer (not symbolic), list of integers, list of symbols, or an array + if isinstance(indices_or_sections, str): + raise ValueError('Array-indexed split cannot be compiled due to data-dependent sizes. ' + 'Consider using numpy.reshape instead.') + elif isinstance(indices_or_sections, (list, tuple)): + if any(isinstance(i, str) for i in indices_or_sections): + raise ValueError('Array-indexed split cannot be compiled due to data-dependent sizes. ' + 'Use symbolic values as an argument instead.') + # Sequence is given + sections = indices_or_sections + elif isinstance(indices_or_sections, Integral): # Constant integer given + if indices_or_sections <= 0: + raise ValueError('Number of sections must be larger than zero.') + + # If uneven sizes are not allowed and ary shape is numeric, check evenness + if not allow_uneven and not symbolic.issymbolic(desc.shape[axis]): + if desc.shape[axis] % indices_or_sections != 0: + raise ValueError('Array split does not result in an equal division. 
Consider using numpy.array_split ' + 'instead.') + if indices_or_sections > desc.shape[axis]: + raise ValueError('Cannot compile array split as it will result in empty arrays.') + + # Sequence is not given, compute sections + # Mimic behavior of array_split in numpy: Sections are [s+1 x N%s], s, ..., s + size = desc.shape[axis] // indices_or_sections + remainder = desc.shape[axis] % indices_or_sections + sections = [] + offset = 0 + for _ in range(min(remainder, indices_or_sections)): + offset += size + 1 + sections.append(offset) + for _ in range(remainder, indices_or_sections - 1): + offset += size + sections.append(offset) + + elif symbolic.issymbolic(indices_or_sections): + raise ValueError('Symbolic split cannot be compiled due to output tuple size being unknown. ' + 'Consider using numpy.reshape instead.') + else: + raise TypeError(f'Unsupported type {type(indices_or_sections)} for indices_or_sections in numpy.split') + + # Split according to sections + r = state.add_read(ary) + result = [] + offset = 0 + for section in sections: + shape = list(desc.shape) + shape[axis] = section - offset + name, _ = sdfg.add_temp_transient(shape, desc.dtype, storage=desc.storage, lifetime=desc.lifetime) + # Add copy + w = state.add_write(name) + subset = subsets.Range.from_array(desc) + subset[axis] = (offset, offset + shape[axis] - 1, 1) + offset += shape[axis] + state.add_nedge(r, w, Memlet(data=ary, subset=subset)) + result.append(name) + + # Add final section + shape = list(desc.shape) + shape[axis] -= offset + name, _ = sdfg.add_temp_transient(shape, desc.dtype, storage=desc.storage, lifetime=desc.lifetime) + w = state.add_write(name) + subset = subsets.Range.from_array(desc) + subset[axis] = (offset, offset + shape[axis] - 1, 1) + state.add_nedge(r, w, Memlet(data=ary, subset=subset)) + result.append(name) + + # Always return a list of results, even if the size is 1 + return result + + +@oprepo.replaces('numpy.split') +def _split(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str], + axis: int = 0): + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis, allow_uneven=False) + + +@oprepo.replaces('numpy.array_split') +def _array_split(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str], + axis: int = 0): + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis, allow_uneven=True) + + +@oprepo.replaces('numpy.dsplit') +def _dsplit(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str]): + if isinstance(ary, str) and ary in sdfg.arrays: + if len(sdfg.arrays[ary].shape) < 3: + raise ValueError('Array dimensionality must be 3 or above for dsplit') + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=2, allow_uneven=False) + + +@oprepo.replaces('numpy.hsplit') +def _hsplit(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str]): + if isinstance(ary, str) and ary in sdfg.arrays: + # In case of a 1D array, split with axis=0 + if len(sdfg.arrays[ary].shape) <= 1: + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=0, allow_uneven=False) + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=1, 
allow_uneven=False) + + +@oprepo.replaces('numpy.vsplit') +def _vsplit(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str]): + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=0, allow_uneven=False) + + +############################################################################################################ +# Fast Fourier Transform numpy package (numpy.fft) + +def _real_to_complex(real_type: dace.typeclass): + if real_type == dace.float32: + return dace.complex64 + elif real_type == dace.float64: + return dace.complex128 + else: + return real_type + + +def _fft_core(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + a: str, + n: Optional[dace.symbolic.SymbolicType] = None, + axis=-1, + norm: StringLiteral = StringLiteral('backward'), + is_inverse: bool = False): + from dace.libraries.fft.nodes import FFT, IFFT # Avoid import loops + if axis != 0 and axis != -1: + raise NotImplementedError('Only one dimensional arrays are supported at the moment') + if not isinstance(a, str) or a not in sdfg.arrays: + raise ValueError('Input must be a valid array') + + libnode = FFT('fft') if not is_inverse else IFFT('ifft') + + desc = sdfg.arrays[a] + N = desc.shape[axis] + + # If n is not None, either pad input or slice and add a view + if n is not None: + raise NotImplementedError + + # Compute factor + if norm == 'forward': + factor = (1 / N) if not is_inverse else 1 + elif norm == 'backward': + factor = 1 if not is_inverse else (1 / N) + elif norm == 'ortho': + factor = sp.sqrt(1 / N) + else: + raise ValueError('norm argument can only be one of "forward", "backward", or "ortho".') + libnode.factor = factor + + # Compute output type from input type + if is_inverse and desc.dtype not in (dace.complex64, dace.complex128): + raise TypeError(f'Inverse FFT only accepts complex inputs, got {desc.dtype}') + dtype = _real_to_complex(desc.dtype) + + name, odesc = sdfg.add_temp_transient_like(desc, dtype) + r = state.add_read(a) + w = state.add_write(name) + state.add_edge(r, None, libnode, '_inp', Memlet.from_array(a, desc)) + state.add_edge(libnode, '_out', w, None, Memlet.from_array(name, odesc)) + + return name + + +@oprepo.replaces('numpy.fft.fft') +def _fft(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + a: str, + n: Optional[dace.symbolic.SymbolicType] = None, + axis=-1, + norm: StringLiteral = StringLiteral('backward')): + return _fft_core(pv, sdfg, state, a, n, axis, norm, False) + + +@oprepo.replaces('numpy.fft.ifft') +def _ifft(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + a, + n=None, + axis=-1, + norm: StringLiteral = StringLiteral('backward')): + return _fft_core(pv, sdfg, state, a, n, axis, norm, True) diff --git a/dace/libraries/blas/nodes/gemv.py b/dace/libraries/blas/nodes/gemv.py index baf6fb415d..52091c6864 100644 --- a/dace/libraries/blas/nodes/gemv.py +++ b/dace/libraries/blas/nodes/gemv.py @@ -730,6 +730,9 @@ def expansion(node: 'Gemv', state, sdfg, m=None, n=None, **kwargs): dtype_a = outer_array_a.dtype.type dtype = outer_array_x.dtype.base_type veclen = outer_array_x.dtype.veclen + alpha = f'{dtype.ctype}({node.alpha})' + beta = f'{dtype.ctype}({node.beta})' + m = m or node.m n = n or node.n if m is None: @@ -765,8 +768,17 @@ def expansion(node: 'Gemv', state, sdfg, m=None, n=None, **kwargs): func = func.lower() + 'gemv' - code = f"""cblas_{func}({layout}, {trans}, {m}, {n}, {node.alpha}, _A, {lda}, - _x, {strides_x[0]}, {node.beta}, 
_y, {strides_y[0]});""" + code = '' + if dtype in (dace.complex64, dace.complex128): + code = f''' + {dtype.ctype} __alpha = {alpha}; + {dtype.ctype} __beta = {beta}; + ''' + alpha = '&__alpha' + beta = '&__beta' + + code += f"""cblas_{func}({layout}, {trans}, {m}, {n}, {alpha}, _A, {lda}, + _x, {strides_x[0]}, {beta}, _y, {strides_y[0]});""" tasklet = dace.sdfg.nodes.Tasklet(node.name, node.in_connectors, diff --git a/dace/libraries/fft/__init__.py b/dace/libraries/fft/__init__.py new file mode 100644 index 0000000000..71fb014f32 --- /dev/null +++ b/dace/libraries/fft/__init__.py @@ -0,0 +1,6 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +from dace.library import register_library +from .nodes import * +from .environments import * + +register_library(__name__, "fft") diff --git a/dace/libraries/fft/algorithms/__init__.py b/dace/libraries/fft/algorithms/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dace/libraries/fft/algorithms/dft.py b/dace/libraries/fft/algorithms/dft.py new file mode 100644 index 0000000000..340dfed22d --- /dev/null +++ b/dace/libraries/fft/algorithms/dft.py @@ -0,0 +1,45 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +""" +One-dimensional Discrete Fourier Transform (DFT) native implementations. +""" +import dace +import numpy as np +import math + + +# Native, naive version of the Discrete Fourier Transform +@dace.program +def dft(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + i = np.arange(N) + e = np.exp(-2j * np.pi * i * i[:, None] / N) + _out[:] = factor * (e @ _inp.astype(dace.complex128)) + + +@dace.program +def idft(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + i = np.arange(N) + e = np.exp(2j * np.pi * i * i[:, None] / N) + _out[:] = factor * (e @ _inp.astype(dace.complex128)) + + +# Single-map version of DFT, useful for integrating small Fourier transforms into other operations +@dace.program +def dft_explicit(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + _out[:] = 0 + for i, n in dace.map[0:N, 0:N]: + with dace.tasklet: + inp << _inp[n] + exponent = 2 * math.pi * i * n / N + b = decltype(b)(math.cos(exponent), -math.sin(exponent)) * inp * factor + b >> _out(1, lambda a, b: a + b)[i] + + +@dace.program +def idft_explicit(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + _out[:] = 0 + for i, n in dace.map[0:N, 0:N]: + with dace.tasklet: + inp << _inp[n] + exponent = 2 * math.pi * i * n / N + b = decltype(b)(math.cos(exponent), math.sin(exponent)) * inp * factor + b >> _out(1, lambda a, b: a + b)[i] diff --git a/dace/libraries/fft/environments/__init__.py b/dace/libraries/fft/environments/__init__.py new file mode 100644 index 0000000000..0900214e68 --- /dev/null +++ b/dace/libraries/fft/environments/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +from .cufft import * diff --git a/dace/libraries/fft/environments/cufft.py b/dace/libraries/fft/environments/cufft.py new file mode 100644 index 0000000000..dd243d376a --- /dev/null +++ b/dace/libraries/fft/environments/cufft.py @@ -0,0 +1,21 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. 
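For reference, the naive DFT used by the pure expansion is the matrix `exp(-2*pi*i*k*n/N)` applied to the input, and the `norm` handling in `_fft_core` follows NumPy's conventions: 'backward' leaves the forward transform unscaled, 'forward' applies 1/N, and 'ortho' applies 1/sqrt(N). A NumPy-only sanity check (illustrative, not part of the patch):

```python
# NumPy-only check of the DFT-matrix formulation and the normalization factors.
import numpy as np

def dft_matrix(N):
    i = np.arange(N)
    return np.exp(-2j * np.pi * np.outer(i, i) / N)

x = np.random.rand(16).astype(np.complex128)
N = x.shape[0]
X = dft_matrix(N) @ x

assert np.allclose(X, np.fft.fft(x))                             # 'backward': unscaled
assert np.allclose(X / N, np.fft.fft(x, norm='forward'))         # 'forward': 1/N
assert np.allclose(X / np.sqrt(N), np.fft.fft(x, norm='ortho'))  # 'ortho': 1/sqrt(N)
```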
+import dace.library + + +@dace.library.environment +class cuFFT: + + cmake_minimum_version = None + cmake_packages = ["CUDA"] + cmake_variables = {} + cmake_includes = [] + cmake_libraries = ["cufft"] + cmake_compile_flags = [] + cmake_link_flags = [] + cmake_files = [] + + headers = {'frame': ["cufft.h", "cufftXt.h"], 'cuda': ["cufft.h", "cufftXt.h"]} + state_fields = [] + init_code = "" + finalize_code = "" + dependencies = [] diff --git a/dace/libraries/fft/nodes/__init__.py b/dace/libraries/fft/nodes/__init__.py new file mode 100644 index 0000000000..dd8f132aa4 --- /dev/null +++ b/dace/libraries/fft/nodes/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +from .fft import FFT, IFFT diff --git a/dace/libraries/fft/nodes/fft.py b/dace/libraries/fft/nodes/fft.py new file mode 100644 index 0000000000..bc85f8785b --- /dev/null +++ b/dace/libraries/fft/nodes/fft.py @@ -0,0 +1,204 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +""" +Implements Forward and Inverse Fast Fourier Transform (FFT) library nodes +""" +import warnings + +from dace import data, dtypes, SDFG, SDFGState, symbolic, library, nodes, properties +from dace import transformation as xf +from dace.libraries.fft import environments as env + + +# Define the library nodes +@library.node +class FFT(nodes.LibraryNode): + implementations = {} + default_implementation = 'pure' + + factor = properties.SymbolicProperty(desc='Coefficient to multiply outputs. Used for normalization', default=1.0) + + def __init__(self, name, *args, schedule=None, **kwargs): + super().__init__(name, *args, schedule=schedule, inputs={'_inp'}, outputs={'_out'}, **kwargs) + + +@library.node +class IFFT(nodes.LibraryNode): + implementations = {} + default_implementation = 'pure' + + factor = properties.SymbolicProperty(desc='Coefficient to multiply outputs. 
Used for normalization', default=1.0) + + def __init__(self, name, *args, schedule=None, **kwargs): + super().__init__(name, *args, schedule=schedule, inputs={'_inp'}, outputs={'_out'}, **kwargs) + + +################################################################################################## +# Native SDFG expansions +################################################################################################## + + +@library.register_expansion(FFT, 'pure') +class DFTExpansion(xf.ExpandTransformation): + environments = [] + + @staticmethod + def expansion(node: FFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + from dace.libraries.fft.algorithms import dft # Lazy import functions + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if len(indesc.shape) != 1: + raise NotImplementedError('Native SDFG expansion for FFT does not yet support N-dimensional inputs') + + warnings.warn('Performance Warning: No assumptions on FFT input size, falling back to DFT') + return dft.dft_explicit.to_sdfg(indesc, outdesc, N=indesc.shape[0], factor=node.factor) + + +@library.register_expansion(IFFT, 'pure') +class IDFTExpansion(xf.ExpandTransformation): + environments = [] + + @staticmethod + def expansion(node: IFFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + from dace.libraries.fft.algorithms import dft # Lazy import functions + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if len(indesc.shape) != 1: + raise NotImplementedError('Native SDFG expansion for IFFT does not yet support N-dimensional inputs') + + warnings.warn('Performance Warning: No assumptions on IFFT input size, falling back to DFT') + return dft.idft_explicit.to_sdfg(indesc, outdesc, N=indesc.shape[0], factor=node.factor) + + +################################################################################################## +# cuFFT expansions +################################################################################################## + + +@library.register_expansion(FFT, 'cuFFT') +class cuFFTFFTExpansion(xf.ExpandTransformation): + environments = [env.cuFFT] + plan_uid = 0 + + @staticmethod + def expansion(node: FFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if str(node.factor) != '1': + raise NotImplementedError('Multiplicative post-FFT factors are not yet implemented') + return _generate_cufft_code(indesc, outdesc, parent_sdfg, False) + + +@library.register_expansion(IFFT, 'cuFFT') +class cuFFTIFFTExpansion(xf.ExpandTransformation): + environments = [env.cuFFT] + plan_uid = 0 + + @staticmethod + def expansion(node: IFFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if str(node.factor) != '1': + raise NotImplementedError('Multiplicative post-FFT factors are not yet implemented') + return _generate_cufft_code(indesc, outdesc, parent_sdfg, True) + + +def _generate_cufft_code(indesc: data.Data, outdesc: data.Data, sdfg: SDFG, is_inverse: bool): + from dace.codegen.targets import cpp # Avoid import loops + if len(indesc.shape) not in (1, 2, 3): + raise ValueError('cuFFT only supports 1/2/3-dimensional FFT') + if indesc.storage != 
dtypes.StorageType.GPU_Global: + raise ValueError('cuFFT implementation requires input array to be on GPU') + if outdesc.storage != dtypes.StorageType.GPU_Global: + raise ValueError('cuFFT implementation requires output array to be on GPU') + + cufft_type = _types_to_cufft(indesc.dtype, outdesc.dtype) + init_code = '' + exit_code = '' + callsite_code = '' + + # Make a unique name for this plan + if not is_inverse: + plan_name = f'fwdplan{cuFFTFFTExpansion.plan_uid}' + cuFFTFFTExpansion.plan_uid += 1 + direction = 'CUFFT_FORWARD' + tasklet_prefix = '' + else: + plan_name = f'invplan{cuFFTIFFTExpansion.plan_uid}' + cuFFTIFFTExpansion.plan_uid += 1 + direction = 'CUFFT_INVERSE' + tasklet_prefix = 'i' + + fields = [ + f'cufftHandle {plan_name};', + ] + plan_name = f'__state->{plan_name}' + + init_code += f''' + cufftCreate(&{plan_name}); + ''' + exit_code += f''' + cufftDestroy({plan_name}); + ''' + + cdims = ', '.join([cpp.sym2cpp(s) for s in indesc.shape]) + make_plan = f''' + {{ + size_t __work_size = 0; + cufftMakePlan{len(indesc.shape)}d({plan_name}, {cdims}, {cufft_type}, /*batch=*/1, &__work_size); + }} + ''' + + # Make plan in init if not symbolic or not data-dependent, otherwise make at callsite. + symbols_that_change = set(s for ise in sdfg.edges() for s in ise.data.assignments.keys()) + symbols_that_change &= set(map(str, sdfg.symbols.keys())) + + def _fsyms(x): + if symbolic.issymbolic(x): + return set(map(str, x.free_symbols)) + return set() + + if symbols_that_change and any(_fsyms(s) & symbols_that_change for s in indesc.shape): + callsite_code += make_plan + else: + init_code += make_plan + + # Execute plan + callsite_code += f''' + cufftSetStream({plan_name}, __dace_current_stream); + cufftXtExec({plan_name}, _inp, _out, {direction}); + ''' + + return nodes.Tasklet(f'cufft_{tasklet_prefix}fft', {'_inp'}, {'_out'}, + callsite_code, + language=dtypes.Language.CPP, + state_fields=fields, + code_init=init_code, + code_exit=exit_code) + + +################################################################################################## +# Helper functions +################################################################################################## + + +def _get_input_and_output(state: SDFGState, node: nodes.LibraryNode): + """ + Helper function that returns the input and output arrays of the library node + """ + in_edge = next(e for e in state.in_edges(node) if e.dst_conn) + out_edge = next(e for e in state.out_edges(node) if e.src_conn) + return in_edge.data.data, out_edge.data.data + + +def _types_to_cufft(indtype: dtypes.typeclass, outdtype: dtypes.typeclass): + typedict = { + dtypes.float32: 'R', + dtypes.float64: 'D', + dtypes.complex64: 'C', + dtypes.complex128: 'Z', + } + return f'CUFFT_{typedict[indtype]}2{typedict[outdtype]}' diff --git a/dace/libraries/standard/nodes/transpose.py b/dace/libraries/standard/nodes/transpose.py index 58c6cfc33e..e2795ef951 100644 --- a/dace/libraries/standard/nodes/transpose.py +++ b/dace/libraries/standard/nodes/transpose.py @@ -100,6 +100,12 @@ class ExpandTransposeMKL(ExpandTransformation): @staticmethod def expansion(node, state, sdfg): node.validate(sdfg, state) + + # Fall back to native implementation if input and output types are not the same + if (sdfg.arrays[list(state.in_edges_by_connector(node, '_inp'))[0].data.data].dtype != sdfg.arrays[list( + state.out_edges_by_connector(node, '_out'))[0].data.data].dtype): + return ExpandTransposePure.make_sdfg(node, state, sdfg) + dtype = node.dtype if dtype == dace.float32: func = 
"somatcopy" @@ -141,22 +147,30 @@ class ExpandTransposeOpenBLAS(ExpandTransformation): @staticmethod def expansion(node, state, sdfg): node.validate(sdfg, state) + + # Fall back to native implementation if input and output types are not the same + if (sdfg.arrays[list(state.in_edges_by_connector(node, '_inp'))[0].data.data].dtype != sdfg.arrays[list( + state.out_edges_by_connector(node, '_out'))[0].data.data].dtype): + return ExpandTransposePure.make_sdfg(node, state, sdfg) + dtype = node.dtype cast = "" if dtype == dace.float32: func = "somatcopy" alpha = "1.0f" + cast = '' elif dtype == dace.float64: func = "domatcopy" alpha = "1.0" + cast = '' elif dtype == dace.complex64: func = "comatcopy" - cast = "(float*)" - alpha = f"{cast}dace::blas::BlasConstants::Get().Complex64Pone()" + alpha = "dace::blas::BlasConstants::Get().Complex64Pone()" + cast = '(float*)' elif dtype == dace.complex128: func = "zomatcopy" - cast = "(double*)" - alpha = f"{cast}dace::blas::BlasConstants::Get().Complex128Pone()" + alpha = "dace::blas::BlasConstants::Get().Complex128Pone()" + cast = '(double*)' else: raise ValueError("Unsupported type for OpenBLAS omatcopy extension: " + str(dtype)) # TODO: Add stride support @@ -164,8 +178,8 @@ def expansion(node, state, sdfg): # Adaptations for BLAS API order = 'CblasRowMajor' trans = 'CblasTrans' - code = ("cblas_{f}({o}, {t}, {m}, {n}, {a}, {c}_inp, " - "{n}, {c}_out, {m});").format(f=func, o=order, t=trans, m=m, n=n, a=alpha, c=cast) + code = ("cblas_{f}({o}, {t}, {m}, {n}, {cast}{a}, {cast}_inp, " + "{n}, {cast}_out, {m});").format(f=func, o=order, t=trans, m=m, n=n, a=alpha, cast=cast) tasklet = dace.sdfg.nodes.Tasklet(node.name, node.in_connectors, node.out_connectors, @@ -184,6 +198,11 @@ def expansion(node, state, sdfg, **kwargs): node.validate(sdfg, state) dtype = node.dtype + # Fall back to native implementation if input and output types are not the same + if (sdfg.arrays[list(state.in_edges_by_connector(node, '_inp'))[0].data.data].dtype != sdfg.arrays[list( + state.out_edges_by_connector(node, '_out'))[0].data.data].dtype): + return ExpandTransposePure.make_sdfg(node, state, sdfg) + try: func, cdtype, factort = blas_helpers.cublas_type_metadata(dtype) except TypeError as ex: diff --git a/dace/memlet.py b/dace/memlet.py index 1e39b4179d..f78da3a6b7 100644 --- a/dace/memlet.py +++ b/dace/memlet.py @@ -230,7 +230,7 @@ def is_empty(self) -> bool: primarily used for connecting nodes to scopes without transferring data to them. 
""" - return (self.data is None and self.src_subset is None and self.dst_subset is None) + return (self.data is None and self.subset is None and self.other_subset is None) @property def num_accesses(self): @@ -561,20 +561,18 @@ def used_symbols(self, all_symbols: bool, edge=None) -> Set[str]: view_edge = True if not view_edge: - if self.src_subset: - result |= self.src_subset.free_symbols - - if self.dst_subset: - result |= self.dst_subset.free_symbols + if self.subset: + result |= self.subset.free_symbols + if self.other_subset: + result |= self.other_subset.free_symbols else: # View edges do not require the end of the range nor strides - if self.src_subset: - for rb, _, _ in self.src_subset.ndrange(): + if self.subset: + for rb, _, _ in self.subset.ndrange(): if symbolic.issymbolic(rb): result |= set(map(str, rb.free_symbols)) - - if self.dst_subset: - for rb, _, _ in self.dst_subset.ndrange(): + if self.other_subset: + for rb, _, _ in self.other_subset.ndrange(): if symbolic.issymbolic(rb): result |= set(map(str, rb.free_symbols)) diff --git a/dace/sdfg/infer_types.py b/dace/sdfg/infer_types.py index cf58cf76cc..97010e95a7 100644 --- a/dace/sdfg/infer_types.py +++ b/dace/sdfg/infer_types.py @@ -116,8 +116,7 @@ def infer_connector_types(sdfg: SDFG): for e in state.out_edges(node): cname = e.src_conn if cname and node.out_connectors[cname] is None: - raise TypeError('Ambiguous or uninferable type in' - ' connector "%s" of node "%s"' % (cname, node)) + raise TypeError('Ambiguous or uninferable type in' ' connector "%s" of node "%s"' % (cname, node)) ############################################################################# @@ -301,6 +300,12 @@ def _set_default_schedule_in_scope(state: SDFGState, else: child_schedule = _determine_child_schedule(parent_schedules) + # Special case for dynamic thread-block neighboring schedules + if child_schedule == dtypes.ScheduleType.GPU_ThreadBlock: + from dace.transformation.helpers import gpu_map_has_explicit_dyn_threadblocks # Avoid import loops + if gpu_map_has_explicit_dyn_threadblocks(state, parent_node): + child_schedule = dtypes.ScheduleType.GPU_ThreadBlock_Dynamic + # Set child schedule type in scope for node in child_nodes[parent_node]: # Set default schedule types @@ -393,6 +398,7 @@ def _get_storage_from_parent(data_name: str, sdfg: SDFG) -> dtypes.StorageType: raise ValueError(f'Could not find data descriptor {data_name} in parent SDFG') + def infer_aliasing(node: nodes.NestedSDFG, sdfg: SDFG, state: SDFGState) -> None: """ Infers aliasing information on nested SDFG arrays based on external edges and connectors. diff --git a/dace/sdfg/propagation.py b/dace/sdfg/propagation.py index f62bb6eb58..a24db0c72b 100644 --- a/dace/sdfg/propagation.py +++ b/dace/sdfg/propagation.py @@ -1430,10 +1430,15 @@ def propagate_subset(memlets: List[Memlet], tmp_subset = None subset = None - if use_dst and md.dst_subset is not None: - subset = md.dst_subset - elif not use_dst and md.src_subset is not None: - subset = md.src_subset + src, dst = md.subset, md.other_subset + if md._is_data_src is not None: + # Ideally, this should always be the case. In practice, it is not always so. So, if the memlet is uninitialized + # for some reason, we just explicitly fallback to `subset` and `other_subset` to retain the prior behaviour. 
+ src, dst = md.src_subset, md.dst_subset + if use_dst and dst is not None: + subset = dst + elif not use_dst and src is not None: + subset = src else: subset = md.subset diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 6b001f2547..e0c727bda5 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -771,13 +771,13 @@ def add_symbol(self, name, stype, find_new_name: bool = False): if name in self.symbols: raise FileExistsError(f'Symbol "{name}" already exists in SDFG') if name in self.arrays: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a data descriptor.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a data descriptor.') if name in self._subarrays: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a subarray.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a subarray.') if name in self._rdistrarrays: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a RedistrArray.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a RedistrArray.') if name in self._pgrids: - raise FileExistsError(f'Can not create symbol "{name}", the name is used by a ProcessGrid.') + raise FileExistsError(f'Cannot create symbol "{name}", the name is used by a ProcessGrid.') if not isinstance(stype, dtypes.typeclass): stype = dtypes.dtype_to_typeclass(stype) self.symbols[name] = stype diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 09e7607d65..b982dfd718 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -849,6 +849,8 @@ def unordered_arglist(self, for node in self.nodes(): if isinstance(node, nd.AccessNode): descs[node.data] = node.desc(sdfg) + # NOTE: In case of multiple nodes of the same data this will + # override previously found nodes. descs_with_nodes[node.data] = node if isinstance(node.desc(sdfg), dt.Scalar): scalars_with_nodes.add(node.data) @@ -865,19 +867,57 @@ def unordered_arglist(self, else: data_args[node.data] = desc - # Add data arguments from memlets, if do not appear in any of the nodes - # (i.e., originate externally) + # Add data arguments from memlets, if do not appear in any of the nodes (i.e., originate externally) + # TODO: Investigate is scanning the adjacent edges of the input and output connectors is better. for edge in self.edges(): - if edge.data.data is not None and edge.data.data not in descs: - desc = sdfg.arrays[edge.data.data] - if isinstance(desc, dt.Scalar): - # Ignore code->code edges. - if (isinstance(edge.src, nd.CodeNode) and isinstance(edge.dst, nd.CodeNode)): - continue + if edge.data.is_empty(): + continue + + elif edge.data.data not in descs: + # The edge reads data from the outside, and the Memlet is directly indicating what is read. + if (isinstance(edge.src, nd.CodeNode) and isinstance(edge.dst, nd.CodeNode)): + continue # Ignore code->code edges. + additional_descs = {edge.data.data: sdfg.arrays[edge.data.data]} + + elif isinstance(edge.dst, (nd.AccessNode, nd.CodeNode)) and isinstance(edge.src, nd.EntryNode): + # Special case from the above; An AccessNode reads data from the Outside, but + # the Memlet references the data on the inside. Thus we have to follow the data + # to where it originates from. + # NOTE: We have to use a memlet path, because we have to go "against the flow" + # Furthermore, in a valid SDFG the data will only come from one source anyway. 
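The arglist change above handles edges whose memlet names the data on the inside of the scope rather than the outer array; in that case the code now walks the memlet path to find the true source. The two annotation styles, using the constructor arguments that also appear later in `tests/codegen/argumet_signature_test.py`, look like this (standalone sketch):

```python
# Two ways to annotate a MapEntry -> AccessNode edge for a transient used
# inside the map (sketch; constructor arguments as in the new test).
import dace

# Memlet names the inner transient; the outer array is only reachable via the
# memlet path, which unordered_arglist() now follows.
inner_style = dace.Memlet(data='tmp_in', subset='0', other_subset='__i1')

# Memlet names the outer array directly; this case was already handled before.
outer_style = dace.Memlet(data='A', subset='__i1', other_subset='0')
```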
+ top_source_edge = self.graph.memlet_path(edge)[0] + if not isinstance(top_source_edge.src, nd.AccessNode): + continue + additional_descs = ( + {top_source_edge.src.data: top_source_edge.src.desc(sdfg)} + if top_source_edge.src.data not in descs + else {} + ) + + elif isinstance(edge.dst, nd.ExitNode) and isinstance(edge.src, (nd.AccessNode, nd.CodeNode)): + # Same case as above, but for outgoing Memlets. + # NOTE: We have to use a memlet tree here, because the data could potentially + # go to multiple sources. We have to do it this way, because if we would call + # `memlet_tree()` here, then we would just get the edge back. + additional_descs = {} + connector_to_look = "OUT_" + edge.dst_conn[3:] + for oedge in self.graph.out_edges_by_connector(edge.dst, connector_to_look): + if ( + (not oedge.data.is_empty()) and (oedge.data.data not in descs) + and (oedge.data.data not in additional_descs) + ): + additional_descs[oedge.data.data] = sdfg.arrays[oedge.data.data] + + else: + # Case is ignored. + continue - scalar_args[edge.data.data] = desc + # Now processing the list of newly found data. + for aname, additional_desc in additional_descs.items(): + if isinstance(additional_desc, dt.Scalar): + scalar_args[aname] = additional_desc else: - data_args[edge.data.data] = desc + data_args[aname] = additional_desc # Loop over locally-used data descriptors for name, desc in descs.items(): diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index e75099276f..2df9e17445 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -1,17 +1,22 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. """ Exception classes and methods for validation of SDFGs. 
""" + import copy -from dace.dtypes import DebugInfo import os -from typing import TYPE_CHECKING, Dict, List, Set import warnings +from collections import defaultdict +from typing import TYPE_CHECKING, Dict, List, Set + +import networkx as nx + from dace import dtypes, subsets, symbolic +from dace.dtypes import DebugInfo if TYPE_CHECKING: import dace + from dace.memlet import Memlet from dace.sdfg import SDFG from dace.sdfg import graph as gr - from dace.memlet import Memlet from dace.sdfg.state import ControlFlowRegion ########################################### @@ -34,8 +39,8 @@ def validate_control_flow_region(sdfg: 'SDFG', symbols: dict, references: Set[int] = None, **context: bool): - from dace.sdfg.state import SDFGState, ControlFlowRegion, ConditionalBlock from dace.sdfg.scope import is_in_scope + from dace.sdfg.state import ConditionalBlock, ControlFlowRegion, SDFGState if len(region.source_nodes()) > 1 and region.start_block is None: raise InvalidSDFGError("Starting block undefined", sdfg, None) @@ -200,7 +205,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context # Avoid import loop from dace import data as dt from dace.codegen.targets import fpga - from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga + from dace.sdfg.scope import is_devicelevel_fpga, is_devicelevel_gpu references = references or set() @@ -383,14 +388,14 @@ def validate_state(state: 'dace.sdfg.SDFGState', from dace.sdfg import SDFG from dace.sdfg import nodes as nd from dace.sdfg import utils as sdutil - from dace.sdfg.scope import scope_contains_scope, is_devicelevel_gpu, is_devicelevel_fpga + from dace.sdfg.scope import (is_devicelevel_fpga, is_devicelevel_gpu, + scope_contains_scope) sdfg = sdfg or state.parent state_id = state_id if state_id is not None else state.parent_graph.node_id(state) symbols = symbols or {} initialized_transients = (initialized_transients if initialized_transients is not None else {'__pystate'}) references = references or set() - scope = state.scope_dict() # Obtain whether we are already in an accelerator context if not hasattr(context, 'in_gpu'): @@ -420,6 +425,8 @@ def validate_state(state: 'dace.sdfg.SDFGState', if state.has_cycles(): raise InvalidSDFGError('State should be acyclic but contains cycles', sdfg, state_id) + scope = state.scope_dict() + for nid, node in enumerate(state.nodes()): # Reference check if id(node) in references: @@ -839,6 +846,37 @@ def validate_state(state: 'dace.sdfg.SDFGState', continue raise error + if Config.get_bool('experimental.check_race_conditions'): + node_labels = [] + write_accesses = defaultdict(list) + read_accesses = defaultdict(list) + for node in state.data_nodes(): + node_labels.append(node.label) + write_accesses[node.label].extend( + [{'subset': e.data.dst_subset, 'node': node, 'wcr': e.data.wcr} for e in state.in_edges(node)]) + read_accesses[node.label].extend( + [{'subset': e.data.src_subset, 'node': node} for e in state.out_edges(node)]) + + for node_label in node_labels: + writes = write_accesses[node_label] + reads = read_accesses[node_label] + # Check write-write data races. 
+ for i in range(len(writes)): + for j in range(i+1, len(writes)): + same_or_unreachable_nodes = (writes[i]['node'] == writes[j]['node'] or + not nx.has_path(state.nx, writes[i]['node'], writes[j]['node'])) + no_wcr = writes[i]['wcr'] is None and writes[j]['wcr'] is None + if same_or_unreachable_nodes and no_wcr: + subsets_intersect = subsets.intersects(writes[i]['subset'], writes[j]['subset']) + if subsets_intersect: + warnings.warn(f'Memlet range overlap while writing to "{node}" in state "{state.label}"') + # Check read-write data races. + for write in writes: + for read in reads: + if (not nx.has_path(state.nx, read['node'], write['node']) and + subsets.intersects(write['subset'], read['subset'])): + warnings.warn(f'Memlet range overlap while writing to "{node}" in state "{state.label}"') + ######################################## diff --git a/dace/transformation/dataflow/copy_to_map.py b/dace/transformation/dataflow/copy_to_map.py index 5b4260ad55..9c4dbce627 100644 --- a/dace/transformation/dataflow/copy_to_map.py +++ b/dace/transformation/dataflow/copy_to_map.py @@ -1,12 +1,13 @@ # Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. -from dace import dtypes, symbolic, data, subsets, Memlet +from dace import dtypes, symbolic, data, subsets, Memlet, properties from dace.sdfg.scope import is_devicelevel_gpu from dace.transformation import transformation as xf from dace.sdfg import SDFGState, SDFG, nodes, utils as sdutil from typing import Tuple +import itertools - +@properties.make_properties class CopyToMap(xf.SingleStateTransformation): """ Converts an access node -> access node copy into a map. Useful for generating manual code and @@ -14,6 +15,10 @@ class CopyToMap(xf.SingleStateTransformation): """ a = xf.PatternNode(nodes.AccessNode) b = xf.PatternNode(nodes.AccessNode) + ignore_strides = properties.Property( + default=False, + desc='Ignore the stride of the data container; Defaults to `False`.', + ) @classmethod def expressions(cls): @@ -31,7 +36,10 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi if isinstance(self.b.desc(sdfg), data.View): if sdutil.get_view_node(graph, self.b) == self.a: return False - if self.a.desc(sdfg).strides == self.b.desc(sdfg).strides: + if (not self.ignore_strides) and self.a.desc(sdfg).strides == self.b.desc(sdfg).strides: + return False + # Ensures that the edge goes from `a` -> `b`. 
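The new `experimental.check_race_conditions` pass warns when two writes to the same container are unordered in the state's dataflow (same access node, or no path between the nodes), neither uses write-conflict resolution, and their subsets intersect; reads are checked against writes in the same way. The core of the overlap test, sketched on plain 1-D integer ranges (illustrative, not the actual `subsets.intersects` implementation):

```python
# Illustrative 1-D version of the overlap test behind the race-condition warning.
def ranges_intersect(a, b):
    # a, b are inclusive (start, end) index ranges
    return a[0] <= b[1] and b[0] <= a[1]

write_a = (0, 9)    # e.g. A[0:10] written by one branch
write_b = (5, 14)   # e.g. A[5:15] written by an unordered branch, no WCR
assert ranges_intersect(write_a, write_b)   # overlapping -> warning is emitted
```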
+ if not any(edge.dst is self.b for edge in graph.out_edges(self.a)): return False return True @@ -62,31 +70,69 @@ def delinearize_linearize(self, desc: data.Array, copy_shape: Tuple[symbolic.Sym return subsets.Range([(ind, ind, 1) for ind in cur_index]) def apply(self, state: SDFGState, sdfg: SDFG): - adesc = self.a.desc(sdfg) - bdesc = self.b.desc(sdfg) - edge = state.edges_between(self.a, self.b)[0] + avnode = self.a + av = avnode.data + adesc = avnode.desc(sdfg) + bvnode = self.b + bv = bvnode.data + bdesc = bvnode.desc(sdfg) + + edge = state.edges_between(avnode, bvnode)[0] + src_subset = edge.data.get_src_subset(edge, state) + if src_subset is None: + src_subset = subsets.Range.from_array(adesc) + src_subset_size = src_subset.size() + red_src_subset_size = tuple(s for s in src_subset_size if s != 1) + + dst_subset = edge.data.get_dst_subset(edge, state) + if dst_subset is None: + dst_subset = subsets.Range.from_array(bdesc) + dst_subset_size = dst_subset.size() + red_dst_subset_size = tuple(s for s in dst_subset_size if s != 1) if len(adesc.shape) >= len(bdesc.shape): - copy_shape = edge.data.get_src_subset(edge, state).size() + copy_shape = src_subset_size copy_a = True else: - copy_shape = edge.data.get_dst_subset(edge, state).size() + copy_shape = dst_subset_size copy_a = False - maprange = {f'__i{i}': (0, s - 1, 1) for i, s in enumerate(copy_shape)} - - av = self.a.data - bv = self.b.data - avnode = self.a - bvnode = self.b - - # Linearize and delinearize to get index expression for other side - if copy_a: - a_index = [symbolic.pystr_to_symbolic(f'__i{i}') for i in range(len(copy_shape))] - b_index = self.delinearize_linearize(bdesc, copy_shape, edge.data.get_dst_subset(edge, state)) + if tuple(src_subset_size) == tuple(dst_subset_size): + # The two subsets have exactly the same shape, so we can just copying with an offset. + # We use another index variables for the tests only. + maprange = {f'__j{i}': (0, s - 1, 1) for i, s in enumerate(copy_shape)} + a_index = [symbolic.pystr_to_symbolic(f'__j{i} + ({src_subset[i][0]})') for i in range(len(copy_shape))] + b_index = [symbolic.pystr_to_symbolic(f'__j{i} + ({dst_subset[i][0]})') for i in range(len(copy_shape))] + elif red_src_subset_size == red_dst_subset_size and (len(red_dst_subset_size) > 0): + # If we remove all size 1 dimensions that the two subsets have the same size. + # This is essentially the memlet `a[0:10, 2, 0:10] -> 0:10, 10:20` + # We use another index variable only for the tests but we would have to + # recreate the index anyways. + maprange = {f'__j{i}': (0, s - 1, 1) for i, s in enumerate(red_src_subset_size)} + cnt = itertools.count(0) + a_index = [ + symbolic.pystr_to_symbolic(f'{src_subset[i][0]}') + if s == 1 + else symbolic.pystr_to_symbolic(f'__j{next(cnt)} + ({src_subset[i][0]})') + for i, s in enumerate(src_subset_size) + ] + cnt = itertools.count(0) + b_index = [ + symbolic.pystr_to_symbolic(f'{dst_subset[i][0]}') + if s == 1 + else symbolic.pystr_to_symbolic(f'__j{next(cnt)} + ({dst_subset[i][0]})') + for i, s in enumerate(dst_subset_size) + ] else: - a_index = self.delinearize_linearize(adesc, copy_shape, edge.data.get_src_subset(edge, state)) - b_index = [symbolic.pystr_to_symbolic(f'__i{i}') for i in range(len(copy_shape))] + # We have to delinearize and linearize + # We use another index variable for the tests. 
+ maprange = {f'__i{i}': (0, s - 1, 1) for i, s in enumerate(copy_shape)} + if copy_a: + a_index = [symbolic.pystr_to_symbolic(f'__i{i}') for i in range(len(copy_shape))] + b_index = self.delinearize_linearize(bdesc, copy_shape, edge.data.get_dst_subset(edge, state)) + else: + a_index = self.delinearize_linearize(adesc, copy_shape, edge.data.get_src_subset(edge, state)) + b_index = [symbolic.pystr_to_symbolic(f'__i{i}') for i in range(len(copy_shape))] a_subset = subsets.Range([(ind, ind, 1) for ind in a_index]) b_subset = subsets.Range([(ind, ind, 1) for ind in b_index]) @@ -101,7 +147,7 @@ def apply(self, state: SDFGState, sdfg: SDFG): schedule = dtypes.ScheduleType.GPU_Device # Add copy map - t, _, _ = state.add_mapped_tasklet('copy', + t, _, _ = state.add_mapped_tasklet(f'copy_{av}_{bv}', maprange, dict(__inp=Memlet(data=av, subset=a_subset)), '__out = __inp', diff --git a/dace/transformation/dataflow/warp_tiling.py b/dace/transformation/dataflow/warp_tiling.py index 362b51d9ac..f9091950e3 100644 --- a/dace/transformation/dataflow/warp_tiling.py +++ b/dace/transformation/dataflow/warp_tiling.py @@ -55,6 +55,10 @@ def apply(self, graph: SDFGState, sdfg: SDFG) -> nodes.MapEntry: # Stride and offset all internal maps maps_to_stride = xfh.get_internal_scopes(graph, new_me, immediate=True) for nstate, nmap in maps_to_stride: + # Skip sequential maps + if nmap.schedule == dtypes.ScheduleType.Sequential: + continue + nsdfg = nstate.parent nsdfg_node = nsdfg.parent_nsdfg_node diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index 6ca4602079..b7bf49e62b 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -934,11 +934,7 @@ def replicate_scope(sdfg: SDFG, state: SDFGState, scope: ScopeSubgraphView) -> S return ScopeSubgraphView(state, new_nodes, new_entry) -def offset_map(state: SDFGState, - entry: nodes.MapEntry, - dim: int, - offset: symbolic.SymbolicType, - negative: bool = True): +def offset_map(state: SDFGState, entry: nodes.MapEntry, dim: int, offset: symbolic.SymbolicType, negative: bool = True): """ Offsets a map parameter and its contents by a value. @@ -1270,6 +1266,17 @@ def gpu_map_has_explicit_threadblocks(state: SDFGState, entry: nodes.EntryNode) return False +def gpu_map_has_explicit_dyn_threadblocks(state: SDFGState, entry: nodes.EntryNode) -> bool: + """ + Returns True if GPU_Device map has explicit thread-block maps nested within. + """ + internal_maps = get_internal_scopes(state, entry) + if any(m.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic for _, m in internal_maps): + return True + + return False + + def reconnect_edge_through_map( state: SDFGState, edge: graph.MultiConnectorEdge[Memlet], new_node: Union[nodes.EntryNode, nodes.ExitNode], keep_src: bool) -> Tuple[graph.MultiConnectorEdge[Memlet], graph.MultiConnectorEdge[Memlet]]: diff --git a/doc/codegen/codegen.rst b/doc/codegen/codegen.rst index a000022ee6..f3058c1440 100644 --- a/doc/codegen/codegen.rst +++ b/doc/codegen/codegen.rst @@ -32,8 +32,8 @@ There are many features that are enabled by generating code from SDFGs: .. note:: - You can also extend the code generator with new backends externally, see the `Customizing Code Generation tutorial `_ - and the `Tensor Core sample `_ for more information. + You can also extend the code generator with new backends externally, see the `Customizing Code Generation tutorial `_ + and the `Tensor Core sample `_ for more information. 
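For the `CopyToMap` changes: when the source and destination subsets have identical shapes, the transformation now emits one map index per dimension and simply offsets each side by the start of its own subset, instead of delinearizing and linearizing. In plain Python/NumPy terms the generated copy is roughly the following (illustrative shapes and offsets):

```python
# Plain-Python sketch of the same-shape fast path: memlet a[0:10, 2:7] -> b[0:10, 10:15].
import numpy as np

a = np.arange(100, dtype=float).reshape(10, 10)
b = np.zeros((10, 30))
for j0 in range(10):
    for j1 in range(5):
        # one map index per dimension, each side offset by its subset start
        b[j0 + 0, j1 + 10] = a[j0 + 0, j1 + 2]
assert np.array_equal(b[0:10, 10:15], a[0:10, 2:7])
```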
After the code is generated, ``compiler.py`` will invoke CMake on the build folder (e.g., ``.dacecache//build``) @@ -145,7 +145,7 @@ necessary headers. The runtime is used for: match Python interfaces. This is especially useful to generate matching code when calling functions such as ``range`` inside Tasklets. -The folder also contains other files and helper functions, refer to its contents `on GitHub `_ +The folder also contains other files and helper functions, refer to its contents `on GitHub `_ for more information. diff --git a/doc/extensions/extensions.rst b/doc/extensions/extensions.rst index 4644bef109..3f73a924bc 100644 --- a/doc/extensions/extensions.rst +++ b/doc/extensions/extensions.rst @@ -17,10 +17,10 @@ The three key mechanisms of extensibility are class inheritance, :ref:`replaceme For more examples of how to extend DaCe, see the following resources: - * Library nodes: `Einsum specialization library node `_ - * Transformations: `Using and Creating Transformations `_ - * Code generators: `Extending the Code Generator `_ - * Frontend extensions (enumerations and replacements): `Tensor Core code sample `_ + * Library nodes: `Einsum specialization library node `_ + * Transformations: `Using and Creating Transformations `_ + * Code generators: `Extending the Code Generator `_ + * Frontend extensions (enumerations and replacements): `Tensor Core code sample `_ .. .. toctree .. :maxdepth: 1 diff --git a/doc/frontend/daceprograms.rst b/doc/frontend/daceprograms.rst index c21ac34722..4229fe422d 100644 --- a/doc/frontend/daceprograms.rst +++ b/doc/frontend/daceprograms.rst @@ -9,7 +9,7 @@ This includes standard Python code (loops, functions, context managers, etc.), b and (most) functions. .. note:: - For more examples, see the `Getting Started `_ + For more examples, see the `Getting Started `_ Jupyter Notebook tutorial. Usage @@ -349,7 +349,7 @@ Explicit Dataflow Mode The DaCe Python frontend allows users to write SDFG tasklets and memlets directly in Python code. -For more example uses, see the `Explicit Dataflow `_ +For more example uses, see the `Explicit Dataflow `_ tutorial. Memlets diff --git a/doc/ide/cli.rst b/doc/ide/cli.rst index d73d32fdfc..1f63397841 100644 --- a/doc/ide/cli.rst +++ b/doc/ide/cli.rst @@ -123,4 +123,4 @@ nothing is given, the tool will time the entire execution of each program using +---------------------------+--------------+-----------------------------------------------------------+ For a more detailed guide on how to profile SDFGs and work with the resulting data, see :ref:`profiling` and -`this tutorial `_. +`this tutorial `_. diff --git a/doc/optimization/gpu.rst b/doc/optimization/gpu.rst index a08877de3b..f94d377b51 100644 --- a/doc/optimization/gpu.rst +++ b/doc/optimization/gpu.rst @@ -170,7 +170,7 @@ Optimizing GPU SDFGs When optimizing GPU SDFGs, there are a few things to keep in mind. Below is a non-exhaustive list of common GPU optimization practices and how DaCe helps achieve them. To see some of these optimizations in action, check out the ``optimize_for_gpu`` -function in the `Matrix Multiplication optimization example `_. +function in the `Matrix Multiplication optimization example `_. * **Minimize host<->GPU transfers**: It is important to keep as much data as possible on the GPU across the application. This is especially true for data that is accessed frequently, such as data that is used in a loop. @@ -234,7 +234,7 @@ function in the `Matrix Multiplication optimization example `_ + in your code. 
See the `Tensor Core code sample `_ to see how to make use of such units. * **Advanced GPU Map schedules**: DaCe provides two additional built-in map schedules: :class:`~dace.dtypes.ScheduleType.GPU_ThreadBlock_Dynamic` diff --git a/doc/optimization/optimization.rst b/doc/optimization/optimization.rst index f1eb84005b..592ab5e6fc 100644 --- a/doc/optimization/optimization.rst +++ b/doc/optimization/optimization.rst @@ -36,9 +36,9 @@ tunes the data layout of arrays. The following resources are available to help you optimize your SDFG: - * Using transformations: `Using and Creating Transformations `_ - * Creating optimized schedules that can match optimized libraries: `Matrix multiplication CPU and GPU optimization example `_ - * Auto-tuning and instrumentation: `Tuning data layouts sample `_ + * Using transformations: `Using and Creating Transformations `_ + * Creating optimized schedules that can match optimized libraries: `Matrix multiplication CPU and GPU optimization example `_ + * Auto-tuning and instrumentation: `Tuning data layouts sample `_ The following subsections provide more information on the different types of optimization methods: diff --git a/doc/optimization/profiling.rst b/doc/optimization/profiling.rst index 497dc81ae8..617b3a9cb9 100644 --- a/doc/optimization/profiling.rst +++ b/doc/optimization/profiling.rst @@ -5,7 +5,7 @@ Profiling and Instrumentation .. note:: - For more information and examples, see the `Benchmarking and Instrumentation `_ tutorial. + For more information and examples, see the `Benchmarking and Instrumentation `_ tutorial. Simple profiling ---------------- @@ -120,7 +120,7 @@ There are more instrumentation types available, such as fine-grained GPU kernel Instrumentation can also collect performance counters on CPUs and GPUs using `LIKWID `_. The :class:`~dace.dtypes.InstrumentationType.LIKWID_Counters` instrumentation type can be configured to collect a wide variety of performance counters on CPUs and GPUs. An example use can be found in the -`LIKWID instrumentation code sample `_. +`LIKWID instrumentation code sample `_. Instrumentation file format diff --git a/doc/optimization/vscode.rst b/doc/optimization/vscode.rst index 1b72effbcc..07f7797b4e 100644 --- a/doc/optimization/vscode.rst +++ b/doc/optimization/vscode.rst @@ -145,5 +145,5 @@ transformations |add-xform-by-folder-btn|. The latter recursively traverses the for any Python source code files and attempts to load each one as a transformation. For more information on how to use and author data-centric transformations, -see :ref:`transforming` and the `Using and Creating Transformations `_ +see :ref:`transforming` and the `Using and Creating Transformations `_ tutorial. diff --git a/doc/sdfg/ir.rst b/doc/sdfg/ir.rst index 61dc8d4858..1a7a8368cb 100644 --- a/doc/sdfg/ir.rst +++ b/doc/sdfg/ir.rst @@ -627,7 +627,7 @@ override default implementations for a library node type, or for an entire libra Internally, an expansion is a subclass of :class:`~dace.transformation.transformation.ExpandTransformation`. It is responsible for creating a new SDFG that implements the library node, and for connecting the inputs and outputs of the library node to the new SDFG. An example of such an expansion is Einstein summation specialization -(`see full file `_): +(`see full file `_): .. 
code-block:: python diff --git a/doc/sdfg/transformations.rst b/doc/sdfg/transformations.rst index 0a9791ca66..470d413271 100644 --- a/doc/sdfg/transformations.rst +++ b/doc/sdfg/transformations.rst @@ -23,7 +23,7 @@ All transformations extend the :class:`~dace.transformation.transformation.Trans Transformations can have properties and those can be used when applying them: for example, tile sizes in :class:`~dace.transformation.dataflow.tiling.MapTiling`. -For more information on how to use and author data-centric transformations, see the `Using and Creating Transformations `_ +For more information on how to use and author data-centric transformations, see the `Using and Creating Transformations `_ tutorial. diff --git a/doc/setup/integration.rst b/doc/setup/integration.rst index 3e1fc5fa70..78607feda9 100644 --- a/doc/setup/integration.rst +++ b/doc/setup/integration.rst @@ -79,7 +79,7 @@ you to call the SDFG's entry point function, perform basic type checking, and ar Python callback to function pointer, etc.). Since the compiled SDFG is a low-level interface, it is much faster to call than the Python interface. -`We show this behavior in the Benchmarking tutorial `_. +`We show this behavior in the Benchmarking tutorial `_. However, it requires caution as opposed to calling the ``@dace.program`` or the ``SDFG`` object because: * Each array return value is represented internally as a single array (not reallocated every call) and will be diff --git a/doc/setup/quickstart.rst b/doc/setup/quickstart.rst index 4a54de720c..70f24cbfb1 100644 --- a/doc/setup/quickstart.rst +++ b/doc/setup/quickstart.rst @@ -36,5 +36,5 @@ From here on out, you can optimize (:ref:`interactively `, :ref:`program your code. -For more examples of how to use DaCe, see the `samples `_ and -`tutorials `_ folders on GitHub. +For more examples of how to use DaCe, see the `samples `_ and +`tutorials `_ folders on GitHub. diff --git a/requirements.txt b/requirements.txt index 7332dc0419..3cc37cc468 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,6 @@ ply==3.11 PyYAML==6.0.1 six==1.16.0 sympy==1.9 -urllib3==2.0.7 +urllib3==2.2.2 websockets==11.0.3 zipp==3.15.0 diff --git a/tests/codegen/argumet_signature_test.py b/tests/codegen/argumet_signature_test.py new file mode 100644 index 0000000000..376724439f --- /dev/null +++ b/tests/codegen/argumet_signature_test.py @@ -0,0 +1,197 @@ +import dace +import copy + +def test_argument_signature_test(): + """Tests if the argument signature is computed correctly. + + The test is focused on if data dependencies are picked up if they are only + referenced indirectly. This effect is only directly visible for GPU. + The test also runs on GPU, but will only compile for GPU. + """ + + def make_sdfg() -> dace.SDFG: + sdfg = dace.SDFG("Repr") + state = sdfg.add_state(is_start_block=True) + N = dace.symbol(sdfg.add_symbol("N", dace.int32)) + for name in "BC": + sdfg.add_array( + name=name, + dtype=dace.float64, + shape=(N, N), + strides=(N, 1), + transient=False, + ) + + # `A` uses a stride that is not used by any of the other arrays. + # However, the stride is used if we want to index array `A`. + second_stride_A = dace.symbol(sdfg.add_symbol("second_stride_A", dace.int32)) + sdfg.add_array( + name="A", + dtype=dace.float64, + shape=(N,), + strides=(second_stride_A,), + transient=False, + + ) + + # Also array `D` uses a stride that is not used by any other array. 
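The test being added here checks that symbols used only in a descriptor's strides (such as `second_stride_A`) still end up in the argument list even when no memlet inside the map names the array directly. A minimal sketch of why such a symbol is attached to the data descriptor in the first place (assumes the usual `add_array`/`free_symbols` behavior):

```python
# Minimal sketch (assumed API behavior): a stride symbol is part of the
# descriptor, so it is a free symbol of 'A' even if no memlet mentions it.
import dace

N = dace.symbol('N')
second_stride_A = dace.symbol('second_stride_A')

sdfg = dace.SDFG('stride_args')
sdfg.add_array('A', shape=(N,), dtype=dace.float64, strides=(second_stride_A,))
print(sdfg.arrays['A'].free_symbols)   # expected: {N, second_stride_A}
```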
+ second_stride_D = dace.symbol(sdfg.add_symbol("second_stride_D", dace.int32)) + sdfg.add_array( + name="D", + dtype=dace.float64, + shape=(N, N), + strides=(second_stride_D, 1), + transient=False, + + ) + + # Simplest way to generate a mapped Tasklet, we will later modify it. + state.add_mapped_tasklet( + "computation", + map_ranges={"__i0": "0:N", "__i1": "0:N"}, + inputs={ + "__in0": dace.Memlet("A[__i1]"), + "__in1": dace.Memlet("B[__i0, __i1]"), + }, + code="__out = __in0 + __in1", + outputs={"__out": dace.Memlet("C[__i0, __i1]")}, + external_edges=True, + ) + + # Instead of going from the MapEntry to the Tasklet we will go through + # an temporary AccessNode that is only used inside the map scope. + # Thus there is no direct reference to `A` inside the map scope, that would + # need `second_stride_A`. + sdfg.add_scalar("tmp_in", transient=True, dtype=dace.float64) + tmp_in = state.add_access("tmp_in") + for e in state.edges(): + if e.dst_conn == "__in0": + iedge = e + break + state.add_edge( + iedge.src, + iedge.src_conn, + tmp_in, + None, + # The important thing is that the Memlet, that connects the MapEntry with the + # AccessNode, does not refers to the memory outside (its source) but to the transient + # inside (its destination) + dace.Memlet(data="tmp_in", subset="0", other_subset="__i1"), # This does not work! + #dace.Memlet(data="A", subset="__i1", other_subset="0"), # This would work! + ) + state.add_edge( + tmp_in, + None, + iedge.dst, + iedge.dst_conn, + dace.Memlet(f"{tmp_in.data}[0]"), + ) + state.remove_edge(iedge) + + # Here we are doing something similar as for `A`, but this time for the output. + # The output of the Tasklet is stored inside a temporary scalar. + # From that scalar we then go to `C`, here the Memlet on the inside is still + # referring to `C`, thus it is referenced directly. + # We also add a second output that goes to `D` , but the inner Memlet does + # not refer to `D` but to the temporary. Thus there is no direct mention of + # `D` inside the map scope. + sdfg.add_scalar("tmp_out", transient=True, dtype=dace.float64) + tmp_out = state.add_access("tmp_out") + for e in state.edges(): + if e.src_conn == "__out": + oedge = e + assert oedge.data.data == "C" + break + + state.add_edge( + oedge.src, + oedge.src_conn, + tmp_out, + None, + dace.Memlet(data="tmp_out", subset="0"), + ) + state.add_edge( + tmp_out, + None, + oedge.dst, + oedge.dst_conn, + dace.Memlet(data="C", subset="__i0, __i1"), + ) + + # Now we create a new output that uses `tmp_out` but goes into `D`. + # The memlet on the inside will not use `D` but `tmp_out`. 
+ state.add_edge( + tmp_out, + None, + oedge.dst, + "IN_D", + dace.Memlet(data=tmp_out.data, subset="0", other_subset="__i1, __i0"), + ) + state.add_edge( + oedge.dst, + "OUT_D", + state.add_access("D"), + None, + dace.Memlet(data="D", subset="__i0, __i1", other_subset="0"), + ) + oedge.dst.add_in_connector("IN_D", force=True) + oedge.dst.add_out_connector("OUT_D", force=True) + state.remove_edge(oedge) + + # Without this the test does not work properly + # It is related to [Issue#1703](https://github.com/spcl/dace/issues/1703) + sdfg.validate() + for edge in state.edges(): + edge.data.try_initialize(edge=edge, sdfg=sdfg, state=state) + + for array in sdfg.arrays.values(): + if isinstance(array, dace.data.Array): + array.storage = dace.StorageType.GPU_Global + else: + array.storage = dace.StorageType.Register + sdfg.apply_gpu_transformations(simplify=False) + sdfg.validate() + + return sdfg + + # Build the SDFG + sdfg = make_sdfg() + + map_entry = None + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, dace.nodes.MapEntry): + map_entry = node + break + if map_entry is not None: + break + + # Now get the argument list of the map. + res_arglist = { k:v for k, v in state.scope_subgraph(map_entry).arglist().items()} + + ref_arglist = { + 'A': dace.data.Array, + 'B': dace.data.Array, + 'C': dace.data.Array, + 'D': dace.data.Array, + 'N': dace.data.Scalar, + 'second_stride_A': dace.data.Scalar, + 'second_stride_D': dace.data.Scalar, + } + + assert len(ref_arglist) == len(res_arglist), f"Expected {len(ref_arglist)} but got {len(res_arglist)}" + for aname in ref_arglist.keys(): + atype_ref = ref_arglist[aname] + atype_res = res_arglist[aname] + assert isinstance(atype_res, atype_ref), f"Expected '{aname}' to have type {atype_ref}, but it had {type(atype_res)}." + + # If we have cupy we will also compile it. 
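+ # Compiling also exercises the code generator: the generated GPU code has to + # account for `second_stride_A` and `second_stride_D` even though `A` and `D` + # are only referenced indirectly inside the map scope.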
+ try: + import cupy as cp + except ImportError: + return + + csdfg = sdfg.compile() + +if __name__ == "__main__": + test_argument_signature_test() diff --git a/tests/dynamic_tb_map_cudatest.py b/tests/dynamic_tb_map_cudatest.py index b24e5f2ea6..edc1eac9f2 100644 --- a/tests/dynamic_tb_map_cudatest.py +++ b/tests/dynamic_tb_map_cudatest.py @@ -12,10 +12,8 @@ @dace.program(dace.uint32[H + 1], dace.uint32[nnz], dace.float32[nnz], dace.float32[W], dace.float32[H]) def spmv(A_row, A_col, A_val, x, b): - @dace.mapscope(_[0:H]) def compute_row(i): - @dace.map(_[A_row[i]:A_row[i + 1]]) def compute(j): a << A_val[j] @@ -292,8 +290,76 @@ def sddvm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i assert np.allclose(val, ref.data) +@pytest.mark.gpu +def test_dynamic_multidim_map(): + @dace.program + def tester(a: dace.float32[H, W, nnz]): + A = dace.ndarray([H, W, nnz], dtype=dace.float32, storage=dace.StorageType.GPU_Global) + A[:] = a + for i, j in dace.map[0:H, 0:W] @ dace.ScheduleType.GPU_Device: + for k in dace.map[0:nnz] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic: + A[i, j, k] = i * 110 + j * 11 + k + a[:] = A + + a = np.zeros((10, 11, 65), dtype=np.float32) + tester(a) + assert np.allclose(a, np.fromfunction(lambda i, j, k: i * 110 + j * 11 + k, (10, 11, 65), dtype=np.float32)) + + +@pytest.mark.skip('Nested maps with work-stealing thread-block schedule are currently unsupported') +def test_dynamic_nested_map(): + @dace.program + def nested2(A: dace.float32[W], i: dace.int32, j: dace.int32): + A[j] = i * 10 + j + + @dace.program + def nested1(A: dace.float32[W], i: dace.int32): + for j in dace.map[0:W] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic: + nested2(A, i, j) + + @dace.program + def dynamic_nested_map(a: dace.float32[H, W]): + A = dace.ndarray([H, W], dtype=dace.float32, storage=dace.StorageType.GPU_Global) + A[:] = a + for i in dace.map[0:H] @ dace.ScheduleType.GPU_Device: + nested1(A[i], i) + + a[:] = A + + a = np.zeros((10, 11), dtype=np.float32) + sdfg = dynamic_nested_map.to_sdfg(simplify=False) + for _, _, arr in sdfg.arrays_recursive(): + if arr.storage in (dace.StorageType.GPU_Shared, dace.StorageType.Default): + arr.storage = dace.StorageType.Register + sdfg(a, H=10, W=11) + assert np.allclose(a, np.fromfunction(lambda i, j: i * 10 + j, (10, 11), dtype=np.float32)) + + +@pytest.mark.gpu +def test_dynamic_default_schedule(): + N = dace.symbol('N') + + @dace.program + def tester(a: dace.float32[N, 10]): + A = dace.ndarray([N, 10], dtype=dace.float32, storage=dace.StorageType.GPU_Global) + A[:] = a + for i in dace.map[0:N] @ dace.ScheduleType.GPU_Device: + smem = np.empty((10, ), dtype=np.float32) @ dace.StorageType.GPU_Shared + smem[:] = 1 + for j in dace.map[0:10] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic: + A[i, j] = i * 65 + smem[j] + a[:] = A + + a = np.zeros((65, 10), dtype=np.float32) + tester(a) + assert np.allclose(a, np.fromfunction(lambda i, j: i * 65 + 1, (65, 10), dtype=np.float32)) + + if __name__ == '__main__': test_dynamic_map() test_dynamic_maps() test_nested_dynamic_map() test_dynamic_map_with_step() + test_dynamic_multidim_map() + # test_dynamic_nested_map() + test_dynamic_default_schedule() diff --git a/tests/library/fft_test.py b/tests/library/fft_test.py new file mode 100644 index 0000000000..440d0a46cf --- /dev/null +++ b/tests/library/fft_test.py @@ -0,0 +1,101 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. 
+import pytest +import numpy as np + +import dace + + +@pytest.mark.parametrize('symbolic', (False, True)) +def test_fft(symbolic): + if symbolic: + N = dace.symbol('N') + else: + N = 21 + + @dace.program + def tester(x: dace.complex128[N]): + return np.fft.fft(x) + + a = np.random.rand(21) + 1j * np.random.rand(21) + b = tester(a) + assert np.allclose(b, np.fft.fft(a)) + + +def test_fft_r2c(): + """ + Tests implicit conversion to complex types + """ + + @dace.program + def tester(x: dace.float32[20]): + return np.fft.fft(x) + + a = np.random.rand(20).astype(np.float32) + b = tester(a) + assert b.dtype == np.complex64 + assert np.allclose(b, np.fft.fft(a)) + + +@pytest.mark.parametrize('norm', ('backward', 'forward', 'ortho')) +def test_ifft(norm): + + @dace.program + def tester(x: dace.complex128[21]): + return np.fft.ifft(x, norm=norm) + + a = np.random.rand(21) + 1j * np.random.rand(21) + b = tester(a) + assert np.allclose(b, np.fft.ifft(a, norm=norm)) + + +@pytest.mark.gpu +def test_cufft(): + import dace.libraries.fft as fftlib + + @dace.program + def tester(x: dace.complex128[210]): + return np.fft.fft(x) + + sdfg = tester.to_sdfg() + sdfg.apply_gpu_transformations() + fftlib.FFT.default_implementation = 'cuFFT' + sdfg.expand_library_nodes() + fftlib.FFT.default_implementation = 'pure' + + a = np.random.rand(210) + 1j * np.random.rand(210) + b = sdfg(a) + assert np.allclose(b, np.fft.fft(a)) + + +@pytest.mark.gpu +def test_cufft_twoplans(): + import dace.libraries.fft as fftlib + + @dace.program + def tester(x: dace.complex128[210], y: dace.complex64[19]): + return np.fft.fft(x), np.fft.ifft(y, norm='forward') + + sdfg = tester.to_sdfg() + sdfg.apply_gpu_transformations() + fftlib.FFT.default_implementation = 'cuFFT' + fftlib.IFFT.default_implementation = 'cuFFT' + sdfg.expand_library_nodes() + fftlib.FFT.default_implementation = 'pure' + fftlib.IFFT.default_implementation = 'pure' + + a = np.random.rand(210) + 1j * np.random.rand(210) + b = (np.random.rand(19) + 1j * np.random.rand(19)).astype(np.complex64) + c, d = sdfg(a, b) + assert np.allclose(c, np.fft.fft(a)) + assert np.allclose(d, np.fft.ifft(b, norm='forward')) + + +if __name__ == '__main__': + test_fft(False) + test_fft(True) + test_fft_r2c() + test_ifft('backward') + test_ifft('forward') + test_ifft('ortho') + test_cufft() + test_cufft_twoplans() diff --git a/tests/numpy/array_creation_test.py b/tests/numpy/array_creation_test.py index 85908c7a1f..a1f6d0329f 100644 --- a/tests/numpy/array_creation_test.py +++ b/tests/numpy/array_creation_test.py @@ -1,7 +1,9 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
import dace +from dace.frontend.python.common import DaceSyntaxError import numpy as np from common import compare_numpy_output +import pytest # M = dace.symbol('M') # N = dace.symbol('N') @@ -150,11 +152,47 @@ def test_arange_6(): return np.arange(2.5, 10, 3) +@compare_numpy_output() +def test_linspace_1(): + return np.linspace(2.5, 10, num=3) + + +@compare_numpy_output() +def test_linspace_2(): + space, step = np.linspace(2.5, 10, num=3, retstep=True) + return space, step + + +@compare_numpy_output() +def test_linspace_3(): + a = np.array([1, 2, 3]) + return np.linspace(a, 5, num=10) + + +@compare_numpy_output() +def test_linspace_4(): + a = np.array([[1, 2, 3], [4, 5, 6]]) + space, step = np.linspace(a, 10, endpoint=False, retstep=True) + return space, step + + +@compare_numpy_output() +def test_linspace_5(): + a = np.array([[1, 2, 3], [4, 5, 6]]) + b = np.array([[5], [10]]) + return np.linspace(a, b, endpoint=False, axis=1) + + +@compare_numpy_output() +def test_linspace_6(): + return np.linspace(-5, 5.5, dtype=np.float32) + + @dace.program def program_strides_0(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(2, 1)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -168,7 +206,7 @@ def test_strides_0(): def program_strides_1(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(4, 2)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -182,7 +220,7 @@ def test_strides_1(): def program_strides_2(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(1, 2)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -196,7 +234,7 @@ def test_strides_2(): def program_strides_3(): A = dace.ndarray((2, 2), dtype=dace.int32, strides=(2, 4)) for i, j in dace.map[0:2, 0:2]: - A[i, j] = i * 2 + j + A[i, j] = i * 2 + j return A @@ -206,6 +244,42 @@ def test_strides_3(): assert np.allclose(A, [[0, 1], [2, 3]]) +def test_zeros_symbolic_size_scalar(): + K = dace.symbol('K') + + @dace.program + def zeros_symbolic_size(): + return np.zeros((K), dtype=np.uint32) + + out = zeros_symbolic_size(K=10) + assert (list(out.shape) == [10]) + assert (out.dtype == np.uint32) + + +def test_ones_scalar_size_scalar(): + + @dace.program + def ones_scalar_size(k: dace.int32): + a = np.ones(k, dtype=np.uint32) + return np.sum(a) + + with pytest.raises(DaceSyntaxError): + out = ones_scalar_size(20) + assert out == 20 + + +def test_ones_scalar_size(): + + @dace.program + def ones_scalar_size(k: dace.int32): + a = np.ones((k, k), dtype=np.uint32) + return np.sum(a) + + with pytest.raises(DaceSyntaxError): + out = ones_scalar_size(20) + assert out == 20 * 20 + + if __name__ == "__main__": test_empty() test_empty_like1() @@ -229,7 +303,16 @@ def test_strides_3(): test_arange_4() test_arange_5() test_arange_6() + test_linspace_1() + test_linspace_2() + test_linspace_3() + test_linspace_4() + test_linspace_5() + test_linspace_6() test_strides_0() test_strides_1() test_strides_2() test_strides_3() + test_zeros_symbolic_size_scalar() + test_ones_scalar_size_scalar() + test_ones_scalar_size() diff --git a/tests/numpy/attention_simple_test.py b/tests/numpy/attention_simple_test.py index 49558a154b..2ce0205e3f 100644 --- a/tests/numpy/attention_simple_test.py +++ b/tests/numpy/attention_simple_test.py @@ -11,7 +11,7 @@ def dace_softmax(X_in: dace.float32[N], X_out: dace.float32[N]): tmp_max = dace.reduce(lambda a, b: max(a, b), X_in) - X_out[:] = exp(X_in - tmp_max) + X_out[:] = np.exp(X_in - tmp_max) tmp_sum = 
dace.reduce(lambda a, b: a + b, X_out, identity=0) X_out[:] /= tmp_sum diff --git a/tests/numpy/attribute_test.py b/tests/numpy/attribute_test.py index 2181883015..e011eafc89 100644 --- a/tests/numpy/attribute_test.py +++ b/tests/numpy/attribute_test.py @@ -54,7 +54,50 @@ def fn(a: dace.float64[N, F_in], b: dace.float64[N, heads, F_out], c: dace.float assert np.allclose(c, c_expected) +def test_nested_attribute(): + + @dace.program + def tester(a: dace.complex128[20, 10]): + return a.T.real + + r = np.random.rand(20, 10) + im = np.random.rand(20, 10) + a = r + 1j * im + res = tester(a) + assert np.allclose(res, r.T) + + +def test_attribute_of_expr(): + """ + Regression reported in Issue #1295. + """ + + @dace.program + def tester(a: dace.float64[20, 20], b: dace.float64[20, 20], c: dace.float64[20, 20]): + c[:, :] = (a @ b).T + + a = np.random.rand(20, 20) + b = np.random.rand(20, 20) + c = np.random.rand(20, 20) + ref = (a @ b).T + tester(a, b, c) + assert np.allclose(c, ref) + + +def test_attribute_function(): + + @dace.program + def tester(): + return np.arange(10).reshape(10, 1) + + a = tester() + assert np.allclose(a, np.arange(10).reshape(10, 1)) + + if __name__ == '__main__': test_attribute_in_ranged_loop() test_attribute_in_ranged_loop_symbolic() test_attribute_new_state() + test_nested_attribute() + test_attribute_of_expr() + test_attribute_function() diff --git a/tests/numpy/concat_test.py b/tests/numpy/concat_test.py new file mode 100644 index 0000000000..614258e34f --- /dev/null +++ b/tests/numpy/concat_test.py @@ -0,0 +1,133 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import numpy as np +from common import compare_numpy_output +import pytest + +M = 10 +N = 20 +K = 30 + + +@compare_numpy_output() +def test_concatenate(): + a = np.zeros([N, N], dtype=np.float32) + b = np.ones([N, 1], dtype=np.float32) + return np.concatenate((a, b), axis=-1) + + +@compare_numpy_output() +def test_concatenate_four(): + a = np.zeros([N, N], dtype=np.float32) + b = np.ones([N, 1], dtype=np.float32) + c = np.full([N, M], 2.0, dtype=np.float32) + return np.concatenate((a, b, c, a), axis=-1) + + +@compare_numpy_output() +def test_concatenate_out(): + a = np.zeros([N, N], dtype=np.float32) + b = np.ones([M, N], dtype=np.float32) + c = np.full([N + M, N], -1, dtype=np.float32) + np.concatenate([a, b], out=c) + return c + 1 + + +def test_concatenate_symbolic(): + n = dace.symbol('n') + m = dace.symbol('m') + k = dace.symbol('k') + + @dace.program + def tester(a: dace.float64[k, m], b: dace.float64[k, n]): + return np.concatenate((a, b), axis=1) + + aa = np.random.rand(10, 4) + bb = np.random.rand(10, 5) + cc = tester(aa, bb) + assert tuple(cc.shape) == (10, 9) + assert np.allclose(np.concatenate((aa, bb), axis=1), cc) + + +def test_concatenate_fail(): + with pytest.raises(ValueError): + + @dace.program + def tester(a: dace.float64[K, M], b: dace.float64[N, K]): + return np.concatenate((a, b), axis=1) + + aa = np.random.rand(K, M) + bb = np.random.rand(N, K) + tester(aa, bb) + + +@compare_numpy_output() +def test_concatenate_flatten(): + a = np.zeros([1, 2, 3], dtype=np.float32) + b = np.ones([4, 5, 6], dtype=np.float32) + return np.concatenate([a, b], axis=None) + + +@compare_numpy_output() +def test_stack(): + a = np.zeros([N, M, K], dtype=np.float32) + b = np.ones([N, M, K], dtype=np.float32) + return np.stack((a, b), axis=-1) + + +@compare_numpy_output() +def test_vstack(): + a = np.zeros([N, M], dtype=np.float32) + b = np.ones([N, M], 
dtype=np.float32) + return np.vstack((a, b)) + + +@compare_numpy_output() +def test_vstack_1d(): + a = np.zeros([N], dtype=np.float32) + b = np.ones([N], dtype=np.float32) + return np.vstack((a, b)) + + +@compare_numpy_output() +def test_hstack(): + a = np.zeros([N, M], dtype=np.float32) + b = np.ones([N, M], dtype=np.float32) + return np.hstack((a, b)) + + +@compare_numpy_output() +def test_hstack_1d(): + a = np.zeros([N], dtype=np.float32) + b = np.ones([N], dtype=np.float32) + return np.hstack((a, b)) + + +@compare_numpy_output() +def test_dstack(): + a = np.zeros([N, M, K], dtype=np.float32) + b = np.ones([N, M, K], dtype=np.float32) + return np.dstack((a, b)) + + +@compare_numpy_output() +def test_dstack_4d(): + a = np.zeros([N, M, K, K], dtype=np.float32) + b = np.ones([N, M, K, K], dtype=np.float32) + return np.dstack((a, b)) + + +if __name__ == "__main__": + test_concatenate() + test_concatenate_four() + test_concatenate_out() + test_concatenate_symbolic() + test_concatenate_fail() + test_concatenate_flatten() + test_stack() + test_vstack() + test_vstack_1d() + test_hstack() + test_hstack_1d() + test_dstack() + test_dstack_4d() diff --git a/tests/numpy/map_syntax_test.py b/tests/numpy/map_syntax_test.py index fe7af1d644..27a0cfe018 100644 --- a/tests/numpy/map_syntax_test.py +++ b/tests/numpy/map_syntax_test.py @@ -1,6 +1,7 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import numpy as np import dace +import pytest M, N, K = (dace.symbol(name) for name in ['M', 'N', 'K']) @@ -35,6 +36,57 @@ def test_map_python(): assert np.allclose(A[:, 1:], B[:, 1:]) +@pytest.mark.skip('Fails due to bug in Python frontend') +def test_nested_map_with_indirection(): + N = dace.symbol('N') + + @dace.program + def indirect_to_indirect(arr1: dace.float64[N], ind: dace.int32[10], arr2: dace.float64[N]): + for i in dace.map[0:9]: + begin, end, stride = ind[i], ind[i + 1], 1 + for _ in dace.map[0:1]: + for j in dace.map[begin:end:stride]: + arr2[j] = arr1[j] + i + + a = np.random.rand(50) + b = np.zeros(50) + ind = np.array([0, 5, 10, 15, 20, 25, 30, 35, 40, 45], dtype=np.int32) + sdfg = indirect_to_indirect.to_sdfg(simplify=False) + sdfg(a, ind, b) + + ref = np.zeros(50) + for i in range(9): + begin, end = ind[i], ind[i + 1] + ref[begin:end] = a[begin:end] + i + + assert np.allclose(b, ref) + + +@pytest.mark.skip('Fails due to bug in Python frontend') +def test_dynamic_map_range_scalar(): + """ + From issue #650. 
+ """ + + @dace.program + def test(A: dace.float64[20], B: dace.float64[20]): + N = dace.define_local_scalar(dace.int32) + N = 5 + for i in dace.map[0:N]: + for j in dace.map[0:N]: + with dace.tasklet: + a << A[i] + b >> B[j] + b = a + 1 + + A = np.random.rand(20) + B = np.zeros(20) + test(A, B) + assert np.allclose(B[:5], A[:5] + 1) + + if __name__ == '__main__': test_copy3d() test_map_python() + # test_nested_map_with_indirection() + # test_dynamic_map_range_scalar() diff --git a/tests/numpy/nested_call_subarray_test.py b/tests/numpy/nested_call_subarray_test.py index 6a92b004fa..7501652328 100644 --- a/tests/numpy/nested_call_subarray_test.py +++ b/tests/numpy/nested_call_subarray_test.py @@ -8,7 +8,7 @@ @dace.program def dace_softmax_ncs(X_in: dace.float32[N], X_out: dace.float32[N]): tmp_max = dace.reduce(lambda a, b: a + b, X_in, identity=0) - X_out[:] = exp(X_in - tmp_max) + X_out[:] = np.exp(X_in - tmp_max) tmp_sum = dace.reduce(lambda a, b: max(a, b), X_in) X_out[:] /= tmp_sum @@ -22,7 +22,7 @@ def test_ncs_local_program(): @dace.program def dace_softmax_localprog(X_in: dace.float32[N], X_out: dace.float32[N]): tmp_max = dace.reduce(lambda a, b: a + b, X_in, identity=0) - X_out[:] = exp(X_in - tmp_max) + X_out[:] = np.exp(X_in - tmp_max) tmp_sum = dace.reduce(lambda a, b: max(a, b), X_in) X_out[:] /= tmp_sum diff --git a/tests/numpy/split_test.py b/tests/numpy/split_test.py new file mode 100644 index 0000000000..e4088754e8 --- /dev/null +++ b/tests/numpy/split_test.py @@ -0,0 +1,142 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +""" +Tests variants of the numpy split array manipulation. +""" +import dace +import numpy as np +from common import compare_numpy_output +import pytest + +M = 9 +N = 20 +K = 30 + + +@compare_numpy_output() +def test_split(): + arr = np.arange(M) + a, b, c = np.split(arr, 3) + return a + b + c + + +def test_uneven_split_fail(): + with pytest.raises(ValueError): + + @dace.program + def tester(): + arr = np.arange(N) + a, b, c = np.split(arr, 3) + return a + b + c + + tester() + + +def test_symbolic_split_fail(): + with pytest.raises(ValueError): + n = dace.symbol('n') + + @dace.program + def tester(): + arr = np.arange(N) + a, b, c = np.split(arr, n) + return a + b + c + + tester() + + +def test_array_split_fail(): + with pytest.raises(ValueError): + + @dace.program + def tester(): + arr = np.arange(N) + split = np.arange(N) + a, b, c = np.split(arr, split) + return a + b + c + + tester() + + +@compare_numpy_output() +def test_array_split(): + arr = np.arange(N) + a, b, c = np.array_split(arr, 3) + return a, b, c + + +@compare_numpy_output() +def test_array_split_multidim(): + arr = np.ones((N, N)) + a, b, c = np.array_split(arr, 3, axis=1) + return a, b, c + + +@compare_numpy_output() +def test_split_sequence(): + arr = np.arange(N) + a, b = np.split(arr, [3]) + return a, b + + +@compare_numpy_output() +def test_split_sequence_2(): + arr = np.arange(M) + a, b, c = np.split(arr, [3, 6]) + return a + b + c + + +def test_split_sequence_symbolic(): + n = dace.symbol('n') + + @dace.program + def tester(arr: dace.float64[3 * n]): + a, b, c = np.split(arr, [n, n + 2]) + return a, b, c + + nval = K // 3 + a = np.random.rand(K) + ra, rb, rc = tester(a) + assert ra.shape[0] == nval + assert rb.shape[0] == 2 + assert rc.shape[0] == K - nval - 2 + ref = np.split(a, [nval, nval + 2]) + assert len(ref) == 3 + assert np.allclose(ra, ref[0]) + assert np.allclose(rb, ref[1]) + assert np.allclose(rc, ref[2]) + + +@compare_numpy_output() 
+def test_vsplit(): + arr = np.ones((N, M)) + a, b = np.vsplit(arr, 2) + return a, b + + +@compare_numpy_output() +def test_hsplit(): + arr = np.ones((M, N)) + a, b = np.hsplit(arr, 2) + return a, b + + +@compare_numpy_output() +def test_dsplit_4d(): + arr = np.ones([N, M, K, K], dtype=np.float32) + a, b, c = np.dsplit(arr, 3) + return a, b, c + + +if __name__ == "__main__": + test_split() + test_uneven_split_fail() + test_symbolic_split_fail() + test_array_split_fail() + test_array_split() + test_array_split_multidim() + test_split_sequence() + test_split_sequence_2() + test_split_sequence_symbolic() + test_vsplit() + test_hsplit() + test_dsplit_4d() diff --git a/tests/numpy/ufunc_test.py b/tests/numpy/ufunc_test.py index 06bd4c3189..b769ab1082 100644 --- a/tests/numpy/ufunc_test.py +++ b/tests/numpy/ufunc_test.py @@ -1304,6 +1304,11 @@ def test_ufunc_trunc_u(A: dace.uint32[10]): return np.trunc(A) +@compare_numpy_output() +def test_ufunc_clip(A: dace.float32[10]): + return np.clip(A, 0.2, 0.5) + + if __name__ == "__main__": test_ufunc_add_ff() test_ufunc_subtract_ff() @@ -1542,3 +1547,4 @@ def test_ufunc_trunc_u(A: dace.uint32[10]): test_ufunc_trunc_c() test_ufunc_trunc_f() test_ufunc_trunc_u() + test_ufunc_clip() diff --git a/tests/python_frontend/device_annotations_test.py b/tests/python_frontend/device_annotations_test.py index 65c8501b23..d6b512f00b 100644 --- a/tests/python_frontend/device_annotations_test.py +++ b/tests/python_frontend/device_annotations_test.py @@ -1,16 +1,19 @@ -# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. import dace import pytest +import numpy as np from dace.dtypes import StorageType, DeviceType, ScheduleType from dace import dtypes -cupy = pytest.importorskip("cupy") +try: + import cupy +except (ImportError, ModuleNotFoundError): + cupy = None @pytest.mark.gpu def test_storage(): - @dace.program def add(X: dace.float32[32, 32] @ StorageType.GPU_Global): return X + 1 @@ -46,7 +49,6 @@ def add2(X: dace.float32[32, 32] @ StorageType.GPU_Global): @pytest.mark.gpu def test_pythonmode(): - def runs_on_gpu(a: dace.float64[20] @ StorageType.GPU_Global, b: dace.float64[20] @ StorageType.GPU_Global): # This map will become a GPU kernel for i in dace.map[0:20] @ ScheduleType.GPU_Device: @@ -58,7 +60,40 @@ def runs_on_gpu(a: dace.float64[20] @ StorageType.GPU_Global, b: dace.float64[20 assert cupy.allclose(gpu_b, gpu_a + 1) +def test_inline_storage_hint(): + N = dace.symbol('N') + + @dace.program + def tester(): + b = np.ones(N, dtype=np.float32) @ dace.StorageType.CPU_ThreadLocal + return b + 1 + + sdfg = tester.to_sdfg(simplify=False) + assert sdfg.arrays['b'].storage == StorageType.CPU_ThreadLocal + + b = tester(N=10) + assert np.allclose(b, 2) + + +def test_annotated_storage_hint(): + N = dace.symbol('N') + + @dace.program + def tester(): + b: dace.float32[N] @ dace.StorageType.CPU_ThreadLocal = np.ones(N, dtype=np.float32) + return b + 1 + + sdfg = tester.to_sdfg(simplify=False) + assert sdfg.arrays['b'].storage == StorageType.CPU_ThreadLocal + + b = tester(N=10) + assert np.allclose(b, 2) + + if __name__ == "__main__": - test_storage() - test_schedule() - test_pythonmode() + if cupy is not None: + test_storage() + test_schedule() + test_pythonmode() + test_inline_storage_hint() + test_annotated_storage_hint() diff --git a/tests/sdfg/cycles_test.py b/tests/sdfg/cycles_test.py index 480392ab2d..b01aec55fd 100644 --- a/tests/sdfg/cycles_test.py +++ 
b/tests/sdfg/cycles_test.py @@ -2,7 +2,7 @@ import pytest import dace - +from dace.sdfg.validation import InvalidSDFGError def test_cycles(): with pytest.raises(ValueError, match="Found cycles.*"): @@ -29,6 +29,23 @@ def test_cycles_memlet_path(): sdfg.validate() +def test_cycles_1562(): + """ + Test for issue #1562. + """ + with pytest.raises(InvalidSDFGError, match="cycles"): + sdfg = dace.SDFG("foo") + state = sdfg.add_state() + mentry_2, mexit_2 = state.add_map("map_2", dict(i="0:9")) + mentry_6, mexit_6 = state.add_map("map_6", dict(i="0:9")) + mentry_8, mexit_8 = state.add_map("map_8", dict(i="0:9")) + state.add_edge(mentry_8, "OUT_0", mentry_6, "IN_0", dace.Memlet(data="bla", subset='0:9')) + state.add_edge(mentry_6, "OUT_0", mentry_2, "IN_0", dace.Memlet(data="bla", subset='0:9')) + state.add_edge(mentry_2, "OUT_0", mentry_6, "IN_0", dace.Memlet(data="bla", subset='0:9')) + sdfg.validate() + + if __name__ == '__main__': test_cycles() test_cycles_memlet_path() + test_cycles_1562() diff --git a/tests/sdfg/warn_on_potential_data_race_test.py b/tests/sdfg/warn_on_potential_data_race_test.py new file mode 100644 index 0000000000..8f17409a2f --- /dev/null +++ b/tests/sdfg/warn_on_potential_data_race_test.py @@ -0,0 +1,316 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. + +import warnings +import dace +import pytest + +def test_memlet_range_not_overlap_ranges(): + sdfg = dace.SDFG('memlet_range_not_overlap_ranges') + state = sdfg.add_state() + N = dace.symbol("N", dtype=dace.int32) + sdfg.add_array("A", (N//2,), dace.int32) + A = state.add_access("A") + sdfg.add_array("B", (N,), dace.int32) + B = state.add_access("B") + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N//2"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k+N//2")}, + map_ranges={"k": "0:N//2"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with dace.config.set_temporary("experimental.check_race_conditions", value=True): + sdfg.validate() + + +def test_memlet_range_write_write_overlap_ranges(): + sdfg = dace.SDFG('memlet_range_overlap_ranges') + state = sdfg.add_state() + N = dace.symbol("N", dtype=dace.int32) + sdfg.add_array("A", (N,), dace.int32) + A = state.add_access("A") + sdfg.add_array("B", (N,), dace.int32) + B = state.add_access("B") + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with pytest.warns(UserWarning): + with dace.config.set_temporary("experimental.check_race_conditions", value=True): + sdfg.validate() + +def test_memlet_range_write_read_overlap_ranges(): + sdfg = dace.SDFG('memlet_range_write_read_overlap_ranges') + 
state = sdfg.add_state() + N = dace.symbol("N", dtype=dace.int32) + sdfg.add_array("A", (N,), dace.int32) + A_read = state.add_read("A") + A_write = state.add_write("A") + sdfg.add_array("B", (N,), dace.int32) + B = state.add_access("B") + sdfg.add_array("C", (N,), dace.int32) + C = state.add_access("C") + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A_read}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="a = c - 20", + inputs={"c": dace.Memlet(data="C", subset="k")}, + outputs={"a": dace.Memlet(data="A", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"C": C}, + output_nodes={"A": A_write} + ) + + with pytest.warns(UserWarning): + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_memlet_overlap_ranges_two_access_nodes(): + sdfg = dace.SDFG('memlet_range_write_read_overlap_ranges') + state = sdfg.add_state() + N = dace.symbol("N", dtype=dace.int32) + sdfg.add_array("A", (N,), dace.int32) + A1 = state.add_access("A") + A2 = state.add_access("A") + sdfg.add_array("B", (N,), dace.int32) + B1 = state.add_access("B") + B2 = state.add_access("B") + + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A1}, + output_nodes={"B": B1} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A2}, + output_nodes={"B": B2} + ) + + with pytest.warns(UserWarning): + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_memlet_overlap_symbolic_ranges(): + sdfg = dace.SDFG('memlet_overlap_symbolic_ranges') + state = sdfg.add_state() + N = dace.symbol("N", dtype=dace.int32) + sdfg.add_array("A", (2*N,), dace.int32) + A = state.add_access("A") + sdfg.add_array("B", (2*N,), dace.int32) + B = state.add_access("B") + + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:N"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "0:2*N"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with pytest.warns(UserWarning): + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_constant_memlet_overlap(): + sdfg = dace.SDFG('constant_memlet_overlap') + state = sdfg.add_state() + sdfg.add_array("A", (12,), dace.int32) + A = state.add_access("A") + sdfg.add_array("B", (12,), dace.int32) + B = state.add_access("B") + + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": 
"3:10"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "6:12"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with pytest.warns(UserWarning): + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_constant_memlet_almost_overlap(): + sdfg = dace.SDFG('constant_memlet_almost_overlap') + state = sdfg.add_state() + sdfg.add_array("A", (20,), dace.int32) + A = state.add_access("A") + sdfg.add_array("B", (20,), dace.int32) + B = state.add_access("B") + + state.add_mapped_tasklet( + name="first_tasklet", + code="b = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "3:10"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + state.add_mapped_tasklet( + name="second_tasklet", + code="b = a - 20", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="k")}, + map_ranges={"k": "10:20"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_elementwise_map(): + sdfg = dace.SDFG('elementwise_map') + state = sdfg.add_state() + sdfg.add_array("A", (20,), dace.int32) + A_read = state.add_read("A") + A_write = state.add_write("A") + + state.add_mapped_tasklet( + name="first_tasklet", + code="aa = a + 10", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"aa": dace.Memlet(data="A", subset="k")}, + map_ranges={"k": "0:20"}, + external_edges=True, + input_nodes={"A": A_read}, + output_nodes={"A": A_write} + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + +def test_memlet_overlap_with_wcr(): + sdfg = dace.SDFG('memlet_overlap_with_wcr') + state = sdfg.add_state() + sdfg.add_array("A", (20,), dace.int32) + sdfg.add_array("B", (1,), dace.int32) + A = state.add_read("A") + B = state.add_write("B") + + state.add_mapped_tasklet( + name="first_reduction", + code="b = a", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="0", wcr="lambda old, new: old + new")}, + map_ranges={"k": "0:20"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + state.add_mapped_tasklet( + name="second_reduction", + code="b = a", + inputs={"a": dace.Memlet(data="A", subset="k")}, + outputs={"b": dace.Memlet(data="B", subset="0", wcr="lambda old, new: old + new")}, + map_ranges={"k": "0:20"}, + external_edges=True, + input_nodes={"A": A}, + output_nodes={"B": B} + ) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with dace.config.set_temporary('experimental', 'check_race_conditions', value=True): + sdfg.validate() + + +if __name__ == '__main__': + test_memlet_range_not_overlap_ranges() + test_memlet_range_write_write_overlap_ranges() + test_memlet_range_write_read_overlap_ranges() + test_memlet_overlap_ranges_two_access_nodes() + test_memlet_overlap_symbolic_ranges() + test_constant_memlet_overlap() + 
test_constant_memlet_almost_overlap() + test_elementwise_map() + test_memlet_overlap_with_wcr() diff --git a/tests/transformations/copy_to_map_test.py b/tests/transformations/copy_to_map_test.py index 2b237d84d5..a0931fa1b8 100644 --- a/tests/transformations/copy_to_map_test.py +++ b/tests/transformations/copy_to_map_test.py @@ -4,6 +4,8 @@ import copy import pytest import numpy as np +import re +from typing import Tuple, Optional def _copy_to_map(storage: dace.StorageType): @@ -102,9 +104,165 @@ def test_preprocess(): assert np.allclose(out, inp) +def _perform_non_lin_delin_test( + sdfg: dace.SDFG, +) -> bool: + """Performs a test for the special case of CopyToMap that bypasses linearizing and delinearizing. + """ + assert sdfg.number_of_nodes() == 1 + state: dace.SDFGState = sdfg.states()[0] + assert state.number_of_nodes() == 2 + assert state.number_of_edges() == 1 + assert all(isinstance(node, dace.nodes.AccessNode) for node in state.nodes()) + sdfg.validate() + + a = np.random.rand(*sdfg.arrays["a"].shape) + b_unopt = np.random.rand(*sdfg.arrays["b"].shape) + b_opt = b_unopt.copy() + sdfg(a=a, b=b_unopt) + + nb_runs = sdfg.apply_transformations_repeated(CopyToMap, validate=True, options={"ignore_strides": True}) + assert nb_runs == 1, f"Expected 1 application, but {nb_runs} were performed." + + # Now look for the tasklet and check whether the memlets follow the expected + # simple pattern. + tasklet: dace.nodes.Tasklet = next(iter([node for node in state.nodes() if isinstance(node, dace.nodes.Tasklet)])) + pattern: re.Pattern = re.compile(r"(__j[0-9])|(__j[0-9]+\s*\+\s*[0-9]+)|([0-9]+)") + + assert state.in_degree(tasklet) == 1 + assert state.out_degree(tasklet) == 1 + in_edge = next(iter(state.in_edges(tasklet))) + out_edge = next(iter(state.out_edges(tasklet))) + + assert all(pattern.fullmatch(str(idxs[0]).strip()) for idxs in in_edge.data.src_subset), f"IN: {in_edge.data.src_subset}" + assert all(pattern.fullmatch(str(idxs[0]).strip()) for idxs in out_edge.data.dst_subset), f"OUT: {out_edge.data.dst_subset}" + + # Now call it again after the optimization.
+ sdfg(a=a, b=b_opt) + assert np.allclose(b_unopt, b_opt) + + return True + +def _make_non_lin_delin_sdfg( + shape_a: Tuple[int, ...], + shape_b: Optional[Tuple[int, ...]] = None +) -> Tuple[dace.SDFG, dace.SDFGState, dace.nodes.AccessNode, dace.nodes.AccessNode]: + + if shape_b is None: + shape_b = shape_a + + sdfg = dace.SDFG("bypass1") + state = sdfg.add_state(is_start_block=True) + + ac = [] + for name, shape in [('a', shape_a), ('b', shape_b)]: + sdfg.add_array( + name=name, + shape=shape, + dtype=dace.float64, + transient=False, + ) + ac.append(state.add_access(name)) + + return sdfg, state, ac[0], ac[1] + + +def test_non_lin_delin_1(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((10, 10)) + state.add_nedge( + a, + b, + dace.Memlet("a[0:10, 0:10] -> [0:10, 0:10]"), + ) + _perform_non_lin_delin_test(sdfg) + +def test_non_lin_delin_2(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((10, 10), (100, 100)) + state.add_nedge( + a, + b, + dace.Memlet("a[0:10, 0:10] -> [50:60, 40:50]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_3(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((100, 100), (100, 100)) + state.add_nedge( + a, + b, + dace.Memlet("a[1:11, 20:30] -> [50:60, 40:50]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_4(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((100, 4, 100), (100, 100)) + state.add_nedge( + a, + b, + dace.Memlet("a[1:11, 2, 20:30] -> [50:60, 40:50]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_5(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((100, 4, 100), (100, 10, 100)) + state.add_nedge( + a, + b, + dace.Memlet("a[1:11, 2, 20:30] -> [50:60, 4, 40:50]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_6(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((100, 100), (100, 10, 100)) + state.add_nedge( + a, + b, + dace.Memlet("a[1:11, 20:30] -> [50:60, 4, 40:50]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_7(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((10, 10), (20, 20)) + state.add_nedge( + a, + b, + dace.Memlet("b[5:15, 6:16]"), + ) + _perform_non_lin_delin_test(sdfg) + + +def test_non_lin_delin_8(): + sdfg, state, a, b = _make_non_lin_delin_sdfg((20, 20), (10, 10)) + state.add_nedge( + a, + b, + dace.Memlet("a[5:15, 6:16]"), + ) + _perform_non_lin_delin_test(sdfg) + + if __name__ == '__main__': + test_non_lin_delin_1() + test_non_lin_delin_2() + test_non_lin_delin_3() + test_non_lin_delin_4() + test_non_lin_delin_5() + test_non_lin_delin_6() + test_non_lin_delin_7() + test_non_lin_delin_8() + test_copy_to_map() - test_copy_to_map_gpu() test_flatten_to_map() - test_flatten_to_map_gpu() - test_preprocess() + try: + import cupy + test_copy_to_map_gpu() + test_flatten_to_map_gpu() + test_preprocess() + except ModuleNotFoundError as E: + if "'cupy'" not in str(E): + raise diff --git a/tutorials/benchmarking.ipynb b/tutorials/benchmarking.ipynb index f2330957a3..59302e8090 100644 --- a/tutorials/benchmarking.ipynb +++ b/tutorials/benchmarking.ipynb @@ -1260,7 +1260,7 @@ "source": [ "### Instrumentation API\n", "\n", - "The Instrumentation API allows more fine-grained control over measuring program metrics. It creates a JSON report in `.dacecache//perf`, which can be obtained with the API or viewed with any Chrome Tracing capable viewer. More usage information and how to use the API to tune programs can be found in the [program tuning sample](https://github.com/spcl/dace/blob/master/samples/optimization/tuning.py)." 
+ "The Instrumentation API allows more fine-grained control over measuring program metrics. It creates a JSON report in `.dacecache//perf`, which can be obtained with the API or viewed with any Chrome Tracing capable viewer. More usage information and how to use the API to tune programs can be found in the [program tuning sample](https://github.com/spcl/dace/blob/main/samples/optimization/tuning.py)." ] }, { diff --git a/tutorials/codegen.ipynb b/tutorials/codegen.ipynb index a6effd7996..2c79f1a2e0 100644 --- a/tutorials/codegen.ipynb +++ b/tutorials/codegen.ipynb @@ -480,48 +480,50 @@ " self.frame = frame_codegen\n", " # Can be used to dispatch other code generators for allocation/nodes\n", " self.dispatcher = frame_codegen.dispatcher\n", - " \n", + "\n", " ################################################################\n", - " # Register handlers/hooks through dispatcher: Can be used for \n", + " # Register handlers/hooks through dispatcher: Can be used for\n", " # nodes, memory copy/allocation, scopes, states, and more.\n", - " \n", + "\n", " # In this case, register scopes\n", " self.dispatcher.register_map_dispatcher(dace.ScheduleType.LoopyLoop, self)\n", - " \n", + "\n", " # You can similarly use register_{array,copy,node,state}_dispatcher\n", - " \n", - " # A scope dispatcher will trigger a method called generate_scope whenever \n", + "\n", + " # A scope dispatcher will trigger a method called generate_scope whenever\n", " # an SDFG has a scope with that schedule\n", - " def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView,\n", - " state_id: int, function_stream: CodeIOStream,\n", - " callsite_stream: CodeIOStream):\n", + " def generate_scope(self, sdfg: dace.SDFG, cfg: dace.ControlFlowRegion,\n", + " scope: ScopeSubgraphView, state_id: int,\n", + " function_stream: CodeIOStream, callsite_stream: CodeIOStream):\n", " # The parameters here are:\n", " # sdfg: The SDFG we are currently generating.\n", + " # cfg: The current control flow graph (CFG) we are currently generating. For example, + " it can be the SDFG or a loop region. " # scope: The subgraph of the state containing only the scope (map contents)\n", " # we want to generate the code for.\n", - " # state_id: The state in the SDFG the subgraph is taken from (i.e., \n", + " # state_id: The state in the SDFG the subgraph is taken from (i.e.,\n", " # `sdfg.node(state_id)` is the same as `scope.graph`)\n", " # function_stream: A cursor to the global code (which can be used to define\n", " # functions, hence the name).\n", " # callsite_stream: A cursor to the current location in the code, most of\n", " # the code is generated here.\n", - " \n", + "\n", " # We can get the map entry node from the scope graph\n", " entry_node = scope.source_nodes()[0]\n", - " \n", + "\n", " # First, generate an opening brace (for instrumentation and dynamic map ranges)\n", " callsite_stream.write('{', sdfg, state_id, entry_node)\n", - " \n", + "\n", " ################################################################\n", - " # Generate specific code: We will generate a reversed loop with a \n", + " # Generate specific code: We will generate a reversed loop with a\n", " # comment for each dimension of the map. 
For the sake of simplicity,\n", " # dynamic map ranges are not supported.\n", - " \n", + "\n", " for param, rng in zip(entry_node.map.params, entry_node.map.range):\n", " # We use the sym2cpp function from the cpp support functions\n", " # to convert symbolic expressions to proper C++\n", " begin, end, stride = (sym2cpp(r) for r in rng)\n", - " \n", + "\n", " # Every write is optionally (but recommended to be) tagged with\n", " # 1-3 extra arguments, serving as line information to match\n", " # SDFG, state, and graph nodes/edges to written code.\n", @@ -529,17 +531,17 @@ " for (auto {param} = {end}; {param} >= {begin}; {param} -= {stride}) {{''',\n", " sdfg, state_id, entry_node\n", " )\n", - " \n", + "\n", " # NOTE: CodeIOStream will automatically take care of indentation for us.\n", - " \n", - " \n", + "\n", + "\n", " # Now that the loops have been defined, use the dispatcher to invoke any\n", " # code generator (including this one) that is registered to deal with\n", " # the internal nodes in the subgraph. We skip the MapEntry node.\n", - " self.dispatcher.dispatch_subgraph(sdfg, scope, state_id,\n", + " self.dispatcher.dispatch_subgraph(sdfg, cfg, scope, state_id,\n", " function_stream, callsite_stream,\n", " skip_entry_node=True)\n", - " \n", + "\n", " # NOTE: Since skip_exit_node above is set to False, closing braces will\n", " # be automatically generated" ]
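The codegen tutorial change above boils down to one signature update: `generate_scope` now receives the active control flow region as an additional `cfg` argument (between `sdfg` and `scope`), and `dispatch_subgraph` must forward it. The following is a minimal sketch, not part of the patch: the class name and the `frame_codegen` constructor parameter are illustrative placeholders, schedule registration is omitted, and the signatures and annotations simply mirror the notebook cell.

import dace


class LoopyLoopCodeGen:
    """Illustrative stand-in for the tutorial's code generator (registration omitted)."""

    def __init__(self, frame_codegen, sdfg: dace.SDFG):
        # As in the tutorial: keep the dispatcher to delegate generation of inner nodes.
        self.dispatcher = frame_codegen.dispatcher

    def generate_scope(self, sdfg: dace.SDFG, cfg: dace.ControlFlowRegion, scope, state_id: int,
                       function_stream, callsite_stream):
        # `cfg` is the new argument: the control flow region (the SDFG itself or,
        # for example, a loop region) that contains the scope being generated.
        entry_node = scope.source_nodes()[0]
        callsite_stream.write('{', sdfg, state_id, entry_node)
        # ... emit the reversed loop headers exactly as in the tutorial ...
        # The dispatcher call now also forwards `cfg`:
        self.dispatcher.dispatch_subgraph(sdfg, cfg, scope, state_id, function_stream, callsite_stream,
                                          skip_entry_node=True)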