diff --git a/src/frameworks/cuda/helper.rs b/src/frameworks/cuda/helper.rs index ac44ea5..4e088a5 100644 --- a/src/frameworks/cuda/helper.rs +++ b/src/frameworks/cuda/helper.rs @@ -1,34 +1,56 @@ +// Those macros should be removed when read()/read_only()/write() are refactored +// to return typed memory. For now they remove a lot of visual clutter and +// lessen probability of stupid mistakes. +macro_rules! read { + ($x:ident, $slf:ident) => ( + try!($x.read($slf.device())).as_cuda() + .expect("Broken invariant: not a CUDA memory") + ) +} + +macro_rules! read_write { + ($x:ident, $slf:ident) => ( + try!($x.read_write($slf.device())).as_cuda() + .expect("Broken invariant: not a CUDA memory") + ) +} + +macro_rules! write_only { + ($x:ident, $slf:ident) => ( + try!($x.write_only($slf.device())).as_cuda() + .expect("Broken invariant: not a CUDA memory") + ) +} + +// trans! cannot be inlined into the macros above, because `$mem` would become +// an intermediate variable and `*mut $t` would outlive it. +macro_rules! trans { + ($mem:ident, $t:ident) => ( + unsafe { ::std::mem::transmute::<u64, *mut $t>(*$mem.id_c()) } + ) +} + +macro_rules! exec { + ($name:ident, $f:expr) => ({ + let res = $f; + res.map_err(|_| PluginError::Operation( + stringify!(Unable to execute operation $name)).into()) + }) +} + + #[macro_export] macro_rules! iblas_asum_for_cuda { ($t:ident) => ( - fn asum(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } - self.asum_plain(x, result) - } - - fn asum_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - let x_get = try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let r_get = try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))); - let x_mem = try!(x_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let r_mem = try!(r_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `result`."))); - unsafe { - let res = CONTEXT.asum(::std::mem::transmute::<u64, *mut $t>(*x_mem.id_c()), - ::std::mem::transmute::<u64, *mut $t>(*r_mem.id_c()), - x.desc().size() as i32, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation asum."))) - } - } + fn asum(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + let n = x.desc().size() as i32; + let x_mem = read!(x, self); + let r_mem = write_only!(result, self); + exec!(asum, CONTEXT.asum( + trans!(x_mem, $t), + trans!(r_mem, $t), + n, None)) } ); } @@ -36,42 +58,18 @@ macro_rules! iblas_asum_for_cuda { #[macro_export] macro_rules!
iblas_axpy_for_cuda { ($t:ident) => ( - fn axpy(&self, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - self.axpy_plain(a, x, y) - } - - fn axpy_plain(&self, - a: &::collenchyma::tensor::SharedTensor<$t>, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn axpy(&self, a: &SharedTensor<$t>, x: &SharedTensor<$t>, + y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let a_get = try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))); - let x_get = try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let y_get = try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))); - let a_mem = try!(a_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `a`."))); - let x_mem = try!(x_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let y_mem = try!(y_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `y`."))); - unsafe { - let res = CONTEXT.axpy(::std::mem::transmute::(*a_mem.id_c()), - ::std::mem::transmute::(*x_mem.id_c()), - ::std::mem::transmute::(*y_mem.id_c()), - n, - None, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation axpy."))) - } - } + let a_mem = read!(a, self); + let x_mem = read!(x, self); + let y_mem = read_write!(y, self); + exec!(axpy, CONTEXT.axpy( + trans!(a_mem, $t), + trans!(x_mem, $t), + trans!(y_mem, $t), + n, None, None)) } ); } @@ -79,36 +77,15 @@ macro_rules! iblas_axpy_for_cuda { #[macro_export] macro_rules! 
iblas_copy_for_cuda { ($t:ident) => ( - fn copy(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - self.copy_plain(x, y) - } - - fn copy_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn copy(&self, x: &SharedTensor<$t>, y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let x_get = try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let y_get = try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))); - let x_mem = try!(x_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let y_mem = try!(y_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `y`."))); - unsafe { - let res = CONTEXT.copy(::std::mem::transmute::(*x_mem.id_c()), - ::std::mem::transmute::(*y_mem.id_c()), - n, - None, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation copy."))) - } - } + let x_mem = read!(x, self); + let y_mem = write_only!(y, self); + exec!(copy, CONTEXT.copy( + trans!(x_mem, $t), + trans!(y_mem, $t), + n, None, None)) } ); } @@ -116,42 +93,18 @@ macro_rules! iblas_copy_for_cuda { #[macro_export] macro_rules! iblas_dot_for_cuda { ($t:ident) => ( - fn dot(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } - self.dot_plain(x, y, result) - } - - fn dot_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn dot(&self, x: &SharedTensor<$t>, y: &SharedTensor<$t>, + result: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let x_get = try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let y_get = try!(y.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))); - let r_get = try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))); - let x_mem = try!(x_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let y_mem = try!(y_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `y`."))); - let r_mem = try!(r_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `result`."))); - unsafe { - let res = CONTEXT.dot( 
::std::mem::transmute::(*x_mem.id_c()), - ::std::mem::transmute::(*y_mem.id_c()), - ::std::mem::transmute::(*r_mem.id_c()), - n, - None, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation dot."))) - } - } + let x_mem = read!(x, self); + let y_mem = read!(y, self); + let r_mem = write_only!(result, self); + exec!(dot, CONTEXT.dot( + trans!(x_mem, $t), + trans!(y_mem, $t), + trans!(r_mem, $t), + n, None, None)) } ); } @@ -159,35 +112,15 @@ macro_rules! iblas_dot_for_cuda { #[macro_export] macro_rules! iblas_nrm2_for_cuda { ($t:ident) => ( - fn nrm2(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } - self.nrm2_plain(x, result) - } - - fn nrm2_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn nrm2(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let x_get = try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let r_get = try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))); - let x_mem = try!(x_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let r_mem = try!(r_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `result`."))); - unsafe { - let res = CONTEXT.nrm2(::std::mem::transmute::(*x_mem.id_c()), - ::std::mem::transmute::(*r_mem.id_c()), - n, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation nrm2."))) - } - } + let x_mem = read!(x, self); + let r_mem = write_only!(result, self); + exec!(nrm2, CONTEXT.nrm2( + trans!(x_mem, $t), + trans!(r_mem, $t), + n, None)) } ); } @@ -195,35 +128,15 @@ macro_rules! iblas_nrm2_for_cuda { #[macro_export] macro_rules! 
iblas_scal_for_cuda { ($t:ident) => ( - fn scal(&self, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - self.scal_plain(a, x) - } - - fn scal_plain(&self, - a: &::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn scal(&self, a: &SharedTensor<$t>, x: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let a_get = try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))); - let x_get = try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let a_mem = try!(a_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `a`."))); - let x_mem = try!(x_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - unsafe { - let res = CONTEXT.scal(::std::mem::transmute::(*a_mem.id_c()), - ::std::mem::transmute::(*x_mem.id_c()), - n, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation scal."))) - } - } + let a_mem = read!(a, self); + let x_mem = read_write!(x, self); + exec!(scal, CONTEXT.scal( + trans!(a_mem, $t), + trans!(x_mem, $t), + n, None)) } ); } @@ -231,36 +144,15 @@ macro_rules! iblas_scal_for_cuda { #[macro_export] macro_rules! iblas_swap_for_cuda { ($t:ident) => ( - fn swap(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - self.swap_plain(x, y) - } - - fn swap_plain(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn swap(&self, x: &mut SharedTensor<$t>, y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let x_get = try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let y_get = try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))); - let x_mem = try!(x_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let y_mem = try!(y_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `y`."))); - unsafe { - let res = CONTEXT.swap(::std::mem::transmute::(*x_mem.id_c()), - ::std::mem::transmute::(*y_mem.id_c()), - n, - None, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation swap."))) - } - } + let x_mem = read_write!(x, self); + let y_mem = read_write!(y, self); + exec!(swap, CONTEXT.swap( + trans!(x_mem, $t), + trans!(y_mem, $t), + n, None, None)) } ); } @@ -269,76 +161,49 @@ macro_rules! iblas_swap_for_cuda { macro_rules! 
iblas_gemm_for_cuda { ($t:ident) => ( fn gemm(&self, - alpha: &mut ::collenchyma::tensor::SharedTensor<$t>, - at: ::transpose::Transpose, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - bt: ::transpose::Transpose, - b: &mut ::collenchyma::tensor::SharedTensor<$t>, - beta: &mut ::collenchyma::tensor::SharedTensor<$t>, - c: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match alpha.add_device(self.device()) { _ => try!(alpha.sync(self.device())) } - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match beta.add_device(self.device()) { _ => try!(beta.sync(self.device())) } - match b.add_device(self.device()) { _ => try!(b.sync(self.device())) } - match c.add_device(self.device()) { _ => try!(c.sync(self.device())) } - self.gemm_plain(alpha, at, a, bt, b, beta, c) - } - - fn gemm_plain(&self, - alpha: &::collenchyma::tensor::SharedTensor<$t>, - at: ::transpose::Transpose, - a: &::collenchyma::tensor::SharedTensor<$t>, - bt: ::transpose::Transpose, - b: &::collenchyma::tensor::SharedTensor<$t>, - beta: &::collenchyma::tensor::SharedTensor<$t>, - c: &mut ::collenchyma::tensor::SharedTensor<$t> + alpha: &SharedTensor<$t>, + at: Transpose, + a: &SharedTensor<$t>, + bt: Transpose, + b: &SharedTensor<$t>, + beta: &SharedTensor<$t>, + c: &mut SharedTensor<$t> ) -> Result<(), ::collenchyma::error::Error> { let c_desc = c.desc().clone(); - let alpha_get = try!(alpha.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `alpha`"))); - let alpha_mem = try!(alpha_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `alpha`."))); - let a_get = try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))); - let a_mem = try!(a_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `a`."))); - let b_get = try!(b.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `b`"))); - let b_mem = try!(b_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `b`."))); - let beta_get = try!(beta.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `beta`"))); - let beta_mem = try!(beta_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `beta`."))); - let c_get = try!(c.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `c`"))); - let c_mem = try!(c_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `c`."))); - unsafe { - let a_0 = a.desc()[0] as i32; - let a_1 = a.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32; - let b_0 = b.desc()[0] as i32; - let b_1 = b.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32; - let c_1 = c_desc.iter().skip(1).fold(1, |prod, i| prod * i) as i32; - let n = match bt { - ::transpose::Transpose::NoTrans => b_1, - _ => b_0 - }; - let (m, k) = match at { - ::transpose::Transpose::NoTrans => (a_0, a_1), - _ => (a_1, a_0) - }; - let lda = a_1; - let ldb = b_1; - let ldc = c_1; - let res = CONTEXT.gemm(::cublas::api::Operation::from(bt), - ::cublas::api::Operation::from(at), - n, m, k, - ::std::mem::transmute::(*alpha_mem.id_c()), - ::std::mem::transmute::(*b_mem.id_c()), // matrix a and b are switched to make it work with row-major 
memory layout. - ldb, - ::std::mem::transmute::(*a_mem.id_c()), - lda, - ::std::mem::transmute::(*beta_mem.id_c()), - ::std::mem::transmute::(*c_mem.id_c()), - ldc); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation gemm."))) - } - } + let alpha_mem = read!(alpha, self); + let beta_mem = read!(beta, self); + let a_mem = read!(a, self); + let b_mem = read!(b, self); + let c_mem = write_only!(c, self); + + let a_0 = a.desc()[0] as i32; + let a_1 = a.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32; + let b_0 = b.desc()[0] as i32; + let b_1 = b.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32; + let c_1 = c_desc.iter().skip(1).fold(1, |prod, i| prod * i) as i32; + let n = match bt { + Transpose::NoTrans => b_1, + _ => b_0 + }; + let (m, k) = match at { + Transpose::NoTrans => (a_0, a_1), + _ => (a_1, a_0) + }; + let lda = a_1; + let ldb = b_1; + let ldc = c_1; + exec!(gemm, CONTEXT.gemm( + ::cublas::api::Operation::from(bt), + ::cublas::api::Operation::from(at), + n, m, k, + trans!(alpha_mem, $t), + trans!(b_mem, $t), // matrix a and b are switched to make it work with row-major memory layout. + ldb, + trans!(a_mem, $t), + lda, + trans!(beta_mem, $t), + trans!(c_mem, $t), + ldc)) } ); } diff --git a/src/frameworks/cuda/mod.rs b/src/frameworks/cuda/mod.rs index 32e810d..ba11068 100644 --- a/src/frameworks/cuda/mod.rs +++ b/src/frameworks/cuda/mod.rs @@ -1,11 +1,12 @@ //! Provides BLAS for a CUDA backend. #![allow(missing_docs)] -use ::plugin::*; use collenchyma::backend::Backend; -use collenchyma::tensor::ITensorDesc; +use collenchyma::tensor::{SharedTensor, ITensorDesc}; use collenchyma::plugin::Error as PluginError; use collenchyma::frameworks::cuda::Cuda; use cublas; +use ::plugin::*; +use ::transpose::Transpose; #[macro_use] pub mod helper; diff --git a/src/frameworks/native.rs b/src/frameworks/native.rs index 604f9bb..d803205 100644 --- a/src/frameworks/native.rs +++ b/src/frameworks/native.rs @@ -1,176 +1,191 @@ //! Provides BLAS for a Native backend. -use ::operation::*; use ::plugin::*; use ::transpose::*; use collenchyma::backend::Backend; -use collenchyma::memory::MemoryType; use collenchyma::frameworks::native::Native; -use collenchyma::plugin::Error; +use collenchyma::tensor::{SharedTensor, ITensorDesc}; use rblas::math::mat::Mat; use rblas::matrix::Matrix; use rblas; -macro_rules! impl_asum_for { - ($t:ident, $b:ty) => ( - impl IOperationAsum<$t> for $b { - fn compute(&self, x: &MemoryType, result: &mut MemoryType) -> Result<(), Error> { - let x_slice = try!(x.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_slice::<$t>(); - let mut r_slice = try!(result.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `result`."))).as_mut_slice::<$t>(); - r_slice[0] = rblas::Asum::asum(x_slice); - Ok(()) - } +macro_rules! read { + ($x:ident, $t:ident, $slf:ident) => ( + try!($x.read($slf.device())).as_native() + .expect("Broken invariant: not a CUDA memory") + .as_slice::<$t>(); + ) +} + +macro_rules! read_write { + ($x:ident, $t: ident, $slf:ident) => ( + try!($x.read_write($slf.device())).as_mut_native() + .expect("Broken invariant: not a CUDA memory") + .as_mut_slice::<$t>(); + ) +} + +macro_rules! 
write_only { + ($x:ident, $t: ident, $slf:ident) => ( + try!($x.write_only($slf.device())).as_mut_native() + .expect("Broken invariant: not a CUDA memory") + .as_mut_slice::<$t>(); + ) +} + + +macro_rules! iblas_asum_for_native { + ($t:ident) => ( + fn asum(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + let r_slice = write_only!(result, $t, self); + r_slice[0] = rblas::Asum::asum(read!(x, $t, self)); + Ok(()) } ); } -macro_rules! impl_axpy_for { - ($t:ident, $b:ty) => ( - impl IOperationAxpy<$t> for $b { - fn compute(&self, a: &MemoryType, x: &MemoryType, y: &mut MemoryType) -> Result<(), Error> { - let a_slice = try!(a.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `a`."))).as_slice::<$t>(); - let x_slice = try!(x.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_slice::<$t>(); - let y_slice = try!(y.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `y`."))).as_mut_slice::<$t>(); - rblas::Axpy::axpy(&a_slice[0], x_slice, y_slice); - Ok(()) - } +macro_rules! iblas_axpy_for_native { + ($t:ident) => ( + fn axpy(&self, a: &SharedTensor<$t>, x: &SharedTensor<$t>, + y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + rblas::Axpy::axpy( + &read!(a, $t, self)[0], + read!(x, $t, self), + read_write!(y, $t, self)); + Ok(()) } ); } -macro_rules! impl_copy_for { - ($t:ident, $b:ty) => ( - impl IOperationCopy<$t> for $b { - fn compute(&self, x: &MemoryType, y: &mut MemoryType) -> Result<(), Error> { - let x_slice = try!(x.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_slice::<$t>(); - let y_slice = try!(y.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `y`."))).as_mut_slice::<$t>(); - rblas::Copy::copy(x_slice, y_slice); - Ok(()) - } +macro_rules! iblas_copy_for_native { + ($t:ident) => ( + fn copy(&self, x: &SharedTensor<$t>, y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + rblas::Copy::copy( + read!(x, $t, self), + write_only!(y, $t, self)); + Ok(()) } ); } -macro_rules! impl_dot_for { - ($t:ident, $b:ty) => ( - impl IOperationDot<$t> for $b { - fn compute(&self, x: &MemoryType, y: &MemoryType, result: &mut MemoryType) -> Result<(), Error> { - let x_slice = try!(x.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_slice::<$t>(); - let y_slice = try!(y.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `y`."))).as_slice::<$t>(); - let mut r_slice = try!(result.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `result`."))).as_mut_slice::<$t>(); - r_slice[0] = rblas::Dot::dot(x_slice, y_slice); - Ok(()) - } +macro_rules! iblas_dot_for_native { + ($t:ident) => ( + fn dot(&self, x: &SharedTensor<$t>, y: &SharedTensor<$t>, + result: &mut SharedTensor<$t> + ) -> Result<(), ::collenchyma::error::Error> { + let r_slice = write_only!(result, $t, self); + r_slice[0] = rblas::Dot::dot(read!(x, $t, self), read!(y, $t, self)); + Ok(()) } ); } -macro_rules! 
impl_nrm2_for { - ($t:ident, $b:ty) => ( - impl IOperationNrm2<$t> for $b { - fn compute(&self, x: &MemoryType, result: &mut MemoryType) -> Result<(), Error> { - let x_slice = try!(x.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_slice::<$t>(); - let mut r_slice = try!(result.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `result`."))).as_mut_slice::<$t>(); - r_slice[0] = rblas::Nrm2::nrm2(x_slice); - Ok(()) - } +macro_rules! iblas_nrm2_for_native { + ($t:ident) => ( + fn nrm2(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + let r_slice = write_only!(result, $t, self); + r_slice[0] = rblas::Nrm2::nrm2(read!(x, $t, self)); + Ok(()) } ); } -macro_rules! impl_scale_for { - ($t:ident, $b:ty) => ( - impl IOperationScale<$t> for $b { - fn compute(&self, a: &MemoryType, x: &mut MemoryType) -> Result<(), Error> { - let a_slice = try!(a.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `a`."))).as_slice::<$t>(); - let mut x_slice = try!(x.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_mut_slice::<$t>(); - rblas::Scal::scal(&a_slice[0], x_slice); - Ok(()) - } +macro_rules! iblas_scal_for_native { + ($t:ident) => ( + fn scal(&self, a: &SharedTensor<$t>, x: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + rblas::Scal::scal( + &read!(a, $t, self)[0], + read_write!(x, $t, self)); + Ok(()) } ); } -macro_rules! impl_swap_for { - ($t:ident, $b:ty) => ( - impl IOperationSwap<$t> for $b { - fn compute(&self, x: &mut MemoryType, y: &mut MemoryType) -> Result<(), Error> { - let mut x_slice = try!(x.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_mut_slice::<$t>(); - let mut y_slice = try!(y.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `y`."))).as_mut_slice::<$t>(); - rblas::Swap::swap(x_slice, y_slice); - Ok(()) - } +macro_rules! iblas_swap_for_native { + ($t:ident) => ( + fn swap(&self, x: &mut SharedTensor<$t>, y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + rblas::Swap::swap(read_write!(x, $t, self), read_write!(y, $t, self)); + Ok(()) } ); } -macro_rules! 
impl_gemm_for { - ($t:ident, $b:ty) => ( - impl IOperationGemm<$t> for $b { - fn compute(&self, alpha: &MemoryType, at: Transpose, a_dims: &[usize], a: &MemoryType, bt: Transpose, b_dims: &[usize], b: &MemoryType, beta: &MemoryType, c_dims: &[usize], c: &mut MemoryType) -> Result<(), ::collenchyma::error::Error> { - let alpha_slice = try!(alpha.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `alpha`."))).as_slice::<$t>(); - let a_slice = try!(a.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `a`."))).as_slice::<$t>(); - let beta_slice = try!(beta.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `beta`."))).as_slice::<$t>(); - let b_slice = try!(b.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `b`."))).as_slice::<$t>(); - let mut c_slice = try!(c.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `c`."))).as_mut_slice::<$t>(); - - let a_matrix = as_matrix(a_slice, a_dims); - let b_matrix = as_matrix(b_slice, b_dims); - let mut c_matrix = as_matrix(c_slice, c_dims); - rblas::Gemm::gemm(&alpha_slice[0], at.to_rblas(), &a_matrix, bt.to_rblas(), &b_matrix, &beta_slice[0], &mut c_matrix); - read_from_matrix(&c_matrix, c_slice); - Ok(()) - } +macro_rules! iblas_gemm_for_native { + ($t:ident) => ( + fn gemm(&self, + alpha: &SharedTensor<$t>, + at: Transpose, + a: &SharedTensor<$t>, + bt: Transpose, + b: &SharedTensor<$t>, + beta: &SharedTensor<$t>, + c: &mut SharedTensor<$t> + ) -> Result<(), ::collenchyma::error::Error> { + let c_dims = c.desc().clone(); // FIXME: clone() can be removed + + let a_slice = read!(a, $t, self); + let b_slice = read!(b, $t, self); + let c_slice = write_only!(c, $t, self); + + let a_matrix = as_matrix(a_slice, a.desc().dims()); + let b_matrix = as_matrix(b_slice, b.desc().dims()); + let mut c_matrix = as_matrix(c_slice, &c_dims); + rblas::Gemm::gemm( + &read!(alpha, $t, self)[0], + at.to_rblas(), + &a_matrix, + bt.to_rblas(), + &b_matrix, + &read!(beta, $t, self)[0], + &mut c_matrix); + read_from_matrix(&c_matrix, c_slice); + Ok(()) } ); } macro_rules! 
impl_iblas_for { ($t:ident, $b:ty) => ( - impl_asum_for!($t, $b); - impl_axpy_for!($t, $b); - impl_copy_for!($t, $b); - impl_dot_for!($t, $b); - impl_nrm2_for!($t, $b); - impl_scale_for!($t, $b); - impl_swap_for!($t, $b); - - impl_gemm_for!($t, $b); - impl IBlas<$t> for $b { } // Level 1 impl Asum<$t> for $b { - iblas_asum_for!($t, $b); + iblas_asum_for_native!($t); } impl Axpy<$t> for $b { - iblas_axpy_for!($t, $b); + iblas_axpy_for_native!($t); } impl Copy<$t> for $b { - iblas_copy_for!($t, $b); + iblas_copy_for_native!($t); } impl Dot<$t> for $b { - iblas_dot_for!($t, $b); + iblas_dot_for_native!($t); } impl Nrm2<$t> for $b { - iblas_nrm2_for!($t, $b); + iblas_nrm2_for_native!($t); } impl Scal<$t> for $b { - iblas_scale_for!($t, $b); + iblas_scal_for_native!($t); } impl Swap<$t> for $b { - iblas_swap_for!($t, $b); + iblas_swap_for_native!($t); } impl Gemm<$t> for $b { - iblas_gemm_for!($t, $b); + iblas_gemm_for_native!($t); } ); } @@ -239,14 +254,15 @@ mod test { #[test] fn it_converts_correctly_to_and_from_matrix() { let backend = get_native_backend(); - let mut a = SharedTensor::::new(backend.device(), &vec![3, 2]).unwrap(); - write_to_memory(a.get_mut(backend.device()).unwrap(), + let mut a = SharedTensor::::new(&vec![3, 2]).unwrap(); + write_to_memory(a.write_only(backend.device()).unwrap(), &[2f32, 5f32, 2f32, 5f32, 2f32, 5f32]); { - let a_slice_in = a.get(backend.device()).unwrap().as_native().unwrap().as_slice::(); + let a_slice_in = a.read(backend.device()).unwrap() + .as_native().unwrap().as_slice::(); let a_mat = as_matrix(a_slice_in, &[3, 2]); // right assert_eq!(a_mat[0][0], 2f32); diff --git a/src/helper.rs b/src/helper.rs deleted file mode 100644 index b2b6e46..0000000 --- a/src/helper.rs +++ /dev/null @@ -1,289 +0,0 @@ -//! Provides macros for convenient implementation of BLAS operations. - -#[macro_export] -macro_rules! iblas_asum_for { - ($t:ident, $b:ty) => ( - fn asum(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - Ok(try!( - <$b as IOperationAsum<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))), - ) - )) - } - - fn asum_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationAsum<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! 
iblas_axpy_for { - ($t:ident, $b:ty) => ( - fn axpy(&self, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - Ok(try!( - <$b as IOperationAxpy<$t>>::compute(&self, - try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - - fn axpy_plain(&self, - a: &::collenchyma::tensor::SharedTensor<$t>, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationAxpy<$t>>::compute(&self, - try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! iblas_copy_for { - ($t:ident, $b:ty) => ( - fn copy(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => () } - Ok(try!( - <$b as IOperationCopy<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - - fn copy_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationCopy<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! 
iblas_dot_for { - ($t:ident, $b:ty) => ( - fn dot(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - Ok(try!( - <$b as IOperationDot<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))) - ) - )) - } - - fn dot_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationDot<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))) - ) - )) - } - ); -} - -#[macro_export] -macro_rules! iblas_nrm2_for { - ($t:ident, $b:ty) => ( - fn nrm2(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - Ok(try!( - <$b as IOperationNrm2<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))), - ) - )) - } - - fn nrm2_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationNrm2<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! 
iblas_scale_for { - ($t:ident, $b:ty) => ( - fn scal(&self, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - Ok(try!( - <$b as IOperationScale<$t>>::compute(&self, - try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - ) - )) - } - - fn scal_plain(&self, - a: &::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationScale<$t>>::compute(&self, - try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! iblas_swap_for { - ($t:ident, $b:ty) => ( - fn swap(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - Ok(try!( - <$b as IOperationSwap<$t>>::compute(&self, - try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - - fn swap_plain(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationSwap<$t>>::compute(&self, - try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! 
iblas_gemm_for { - ($t:ident, $b:ty) => ( - fn gemm(&self, - alpha: &mut ::collenchyma::tensor::SharedTensor<$t>, - at: ::transpose::Transpose, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - bt: ::transpose::Transpose, - b: &mut ::collenchyma::tensor::SharedTensor<$t>, - beta: &mut ::collenchyma::tensor::SharedTensor<$t>, - c: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match alpha.add_device(self.device()) { _ => try!(alpha.sync(self.device())) } - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match beta.add_device(self.device()) { _ => try!(beta.sync(self.device())) } - match b.add_device(self.device()) { _ => try!(b.sync(self.device())) } - match c.add_device(self.device()) { _ => try!(c.sync(self.device())) } - - Ok(try!( - <$b as IOperationGemm<$t>>::compute(&self, - try!(alpha.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `alpha`"))), - at, - &a.desc().clone(), - try!(a.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - bt, - &b.desc().clone(), - try!(b.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `b`"))), - try!(beta.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `beta`"))), - &c.desc().clone(), - try!(c.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `c`"))), - ) - )) - } - - fn gemm_plain(&self, - alpha: &::collenchyma::tensor::SharedTensor<$t>, - at: ::transpose::Transpose, - a: &::collenchyma::tensor::SharedTensor<$t>, - bt: ::transpose::Transpose, - b: &::collenchyma::tensor::SharedTensor<$t>, - beta: &::collenchyma::tensor::SharedTensor<$t>, - c: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationGemm<$t>>::compute(&self, - try!(alpha.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `alpha`"))), - at, - &a.desc().clone(), - try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - bt, - &b.desc().clone(), - try!(b.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `b`"))), - try!(beta.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `beta`"))), - &c.desc().clone(), - try!(c.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `c`"))), - ) - )) - } - ); -} diff --git a/src/lib.rs b/src/lib.rs index e3c4245..82407ef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -61,6 +61,4 @@ pub mod plugin; pub mod binary; pub mod operation; pub mod transpose; -#[macro_use] -pub mod helper; pub mod frameworks; diff --git a/src/plugin.rs b/src/plugin.rs index 08fb4e5..619face 100644 --- a/src/plugin.rs +++ b/src/plugin.rs @@ -11,130 +11,64 @@ pub trait IBlas { } /// Provides the asum operation. pub trait Asum { - /// Computes the absolute sum of vector `x` with complete memory management. + /// Computes the absolute sum of vector `x`. /// /// Saves the result to `result`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `asum_plain`. 
- fn asum(&self, x: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Computes the absolute sum of vector `x` without any memory management. - /// - /// Saves the result to `result`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `asum`. - fn asum_plain(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn asum(&self, x: &SharedTensor, result: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the axpy operation. pub trait Axpy { - /// Computes a vector `x` times a constant `a` plus a vector `y` aka. `a * x + y` with complete memory management. + /// Computes a vector `x` times a constant `a` plus a vector `y` aka. `a * x + y`. /// /// Saves the resulting vector back into `y`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `axpy_plain`. - fn axpy(&self, a: &mut SharedTensor, x: &mut SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Computes a vector `x` times a constant `a` plus a vector `y` aka. `a * x + y` without any memory management. - /// - /// Saves the resulting vector back into `y`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `axpy`. - fn axpy_plain(&self, a: &SharedTensor, x: &SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn axpy(&self, a: &SharedTensor, x: &SharedTensor, y: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the copy operation. pub trait Copy { - /// Copies `x.len()` elements of vector `x` into vector `y` with complete memory management. + /// Copies `x.len()` elements of vector `x` into vector `y`. /// /// Saves the result to `y`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `copy_plain`. - fn copy(&self, x: &mut SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Copies `x.len()` elements of vector `x` into vector `y` without any memory management. - /// - /// Saves the result to `y`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `copy`. - fn copy_plain(&self, x: &SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn copy(&self, x: &SharedTensor, y: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the dot operation. pub trait Dot { - /// Computes the [dot product][dot-product] over x and y with complete memory management. - /// [dot-product]: https://en.wikipedia.org/wiki/Dot_product - /// - /// Saves the resulting value into `result`. - /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `dot_plain`. - fn dot(&self, x: &mut SharedTensor, y: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Computes the [dot product][dot-product] over x and y without any memory management. + /// Computes the [dot product][dot-product] over x and y. /// [dot-product]: https://en.wikipedia.org/wiki/Dot_product /// /// Saves the resulting value into `result`. /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `dot`. - fn dot_plain(&self, x: &SharedTensor, y: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn dot(&self, x: &SharedTensor, y: &SharedTensor, + result: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the nrm2 operation. pub trait Nrm2 { - /// Computes the L2 norm aka. euclidean length of vector `x` with complete memory management. + /// Computes the L2 norm aka. euclidean length of vector `x`. /// /// Saves the result to `result`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `nrm2_plain`. - fn nrm2(&self, x: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Computes the L2 norm aka. euclidean length of vector `x` without any memory management. - /// - /// Saves the result to `result`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `nrm2`. - fn nrm2_plain(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn nrm2(&self, x: &SharedTensor, result: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the scal operation. pub trait Scal { - /// Scales a vector `x` by a constant `a` aka. `a * x` with complete memory management. + /// Scales a vector `x` by a constant `a` aka. `a * x`. /// /// Saves the resulting vector back into `x`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `scale_plain`. - fn scal(&self, a: &mut SharedTensor, x: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Scales a vector `x` by a constant `a` aka. `a * x` without any memory management. - /// - /// Saves the resulting vector back into `x`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `scale`. - fn scal_plain(&self, a: &SharedTensor, x: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn scal(&self, a: &SharedTensor, x: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the swap operation. @@ -143,19 +77,8 @@ pub trait Swap { /// /// Saves the resulting vector back into `x`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `swap_plain`. - fn swap(&self, x: &mut SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Swaps the content of vector `x` and vector `y` without any memory management. - /// - /// Saves the resulting vector back into `x`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `swap`. - fn swap_plain(&self, x: &mut SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn swap(&self, x: &mut SharedTensor, y: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the gemm operation. @@ -164,19 +87,11 @@ pub trait Gemm { /// Computes a matrix-matrix product with general matrices. /// /// Saves the result into `c`. /// This is a Level 3 BLAS operation. - /// - /// For a no-memory managed version see `gemm_plain`. - fn gemm(&self, alpha: &mut SharedTensor, at: Transpose, a: &mut SharedTensor, bt: Transpose, b: &mut SharedTensor, beta: &mut SharedTensor, c: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Computes a matrix-matrix product with general matrices. - /// - /// Saves the result into `c`. - /// This is a Level 3 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `gemm`. - fn gemm_plain(&self, alpha: &SharedTensor, at: Transpose, a: &SharedTensor, bt: Transpose, b: &SharedTensor, beta: &SharedTensor, c: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn gemm(&self, alpha: &SharedTensor, + at: Transpose, a: &SharedTensor, + bt: Transpose, b: &SharedTensor, + beta: &SharedTensor, + c: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; } /// Allows a BlasBinary to be provided which is used for an IBlas implementation.
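
Usage note (not part of the patch): the sketch below illustrates how the refactored API is meant to be called after this change, based on the new trait signatures and the test helpers that appear in the diff above. It is a minimal, unverified sketch; `get_native_backend` and `write_to_memory` are the test helpers referenced in the native test module, and the tensor shapes, return types, and required `use` items are assumptions rather than guaranteed collenchyma API.

    // Hypothetical caller of the new single-method Asum API. The old
    // add_device()/sync()/asum_plain() dance is gone: read()/write_only()
    // inside the implementation handle allocation and synchronization.
    // Assumes `use collenchyma::tensor::SharedTensor;` and the Asum trait in scope.
    fn asum_usage() -> Result<(), ::collenchyma::error::Error> {
        let backend = get_native_backend();               // test helper from this diff (assumed)

        let mut x = SharedTensor::<f32>::new(&vec![3]).unwrap();
        let mut result = SharedTensor::<f32>::new(&vec![1]).unwrap();
        write_to_memory(x.write_only(backend.device()).unwrap(),
                        &[1f32, -2f32, 3f32]);            // test helper from this diff (assumed)

        try!(backend.asum(&x, &mut result));              // one call, no manual syncing

        let r = result.read(backend.device()).unwrap()
            .as_native().unwrap().as_slice::<f32>();
        assert_eq!(r[0], 6f32);                           // |1| + |-2| + |3|
        Ok(())
    }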