diff --git a/src/frameworks/cuda/helper.rs b/src/frameworks/cuda/helper.rs index ac44ea5..4e088a5 100644 --- a/src/frameworks/cuda/helper.rs +++ b/src/frameworks/cuda/helper.rs @@ -1,34 +1,56 @@ +// Those macros should be removed when read()/read_only()/write() are refactored +// to return typed memory. For now they remove a lot of visual clutter and +// lessen probability of stupid mistakes. +macro_rules! read { + ($x:ident, $slf:ident) => ( + try!($x.read($slf.device())).as_cuda() + .expect("Broken invariant: not a CUDA memory") + ) +} + +macro_rules! read_write { + ($x:ident, $slf:ident) => ( + try!($x.read_write($slf.device())).as_cuda() + .expect("Broken invariant: not a CUDA memory") + ) +} + +macro_rules! write_only { + ($x:ident, $slf:ident) => ( + try!($x.write_only($slf.device())).as_cuda() + .expect("Broken invariant: not a CUDA memory") + ) +} + +// trans! cannot be inlined into the macros above, because `$mem` would become +// an intermediate variable and `*mut $t` would outlive it. +macro_rules! trans { + ($mem:ident, $t:ident) => ( + unsafe { ::std::mem::transmute::<u64, *mut $t>(*$mem.id_c()) } + ) +} + +macro_rules! exec { + ($name:ident, $f:expr) => ({ + let res = $f; + res.map_err(|_| PluginError::Operation( + stringify!(Unable to execute operation $name)).into()) + }) +} + + #[macro_export] macro_rules! iblas_asum_for_cuda { ($t:ident) => ( - fn asum(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } - self.asum_plain(x, result) - } - - fn asum_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - let x_get = try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let r_get = try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))); - let x_mem = try!(x_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let r_mem = try!(r_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `result`."))); - unsafe { - let res = CONTEXT.asum(::std::mem::transmute::<u64, *mut $t>(*x_mem.id_c()), - ::std::mem::transmute::<u64, *mut $t>(*r_mem.id_c()), - x.desc().size() as i32, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation asum."))) - } - } + fn asum(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + let n = x.desc().size() as i32; + let x_mem = read!(x, self); + let r_mem = write_only!(result, self); + exec!(asum, CONTEXT.asum( + trans!(x_mem, $t), + trans!(r_mem, $t), + n, None)) } ); } @@ -36,42 +58,18 @@ macro_rules! iblas_asum_for_cuda { #[macro_export] macro_rules!
iblas_axpy_for_cuda { ($t:ident) => ( - fn axpy(&self, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - self.axpy_plain(a, x, y) - } - - fn axpy_plain(&self, - a: &::collenchyma::tensor::SharedTensor<$t>, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn axpy(&self, a: &SharedTensor<$t>, x: &SharedTensor<$t>, + y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let a_get = try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))); - let x_get = try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let y_get = try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))); - let a_mem = try!(a_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `a`."))); - let x_mem = try!(x_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let y_mem = try!(y_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `y`."))); - unsafe { - let res = CONTEXT.axpy(::std::mem::transmute::(*a_mem.id_c()), - ::std::mem::transmute::(*x_mem.id_c()), - ::std::mem::transmute::(*y_mem.id_c()), - n, - None, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation axpy."))) - } - } + let a_mem = read!(a, self); + let x_mem = read!(x, self); + let y_mem = read_write!(y, self); + exec!(axpy, CONTEXT.axpy( + trans!(a_mem, $t), + trans!(x_mem, $t), + trans!(y_mem, $t), + n, None, None)) } ); } @@ -79,36 +77,15 @@ macro_rules! iblas_axpy_for_cuda { #[macro_export] macro_rules! 
iblas_copy_for_cuda { ($t:ident) => ( - fn copy(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - self.copy_plain(x, y) - } - - fn copy_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn copy(&self, x: &SharedTensor<$t>, y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let x_get = try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let y_get = try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))); - let x_mem = try!(x_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let y_mem = try!(y_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `y`."))); - unsafe { - let res = CONTEXT.copy(::std::mem::transmute::(*x_mem.id_c()), - ::std::mem::transmute::(*y_mem.id_c()), - n, - None, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation copy."))) - } - } + let x_mem = read!(x, self); + let y_mem = write_only!(y, self); + exec!(copy, CONTEXT.copy( + trans!(x_mem, $t), + trans!(y_mem, $t), + n, None, None)) } ); } @@ -116,42 +93,18 @@ macro_rules! iblas_copy_for_cuda { #[macro_export] macro_rules! iblas_dot_for_cuda { ($t:ident) => ( - fn dot(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } - self.dot_plain(x, y, result) - } - - fn dot_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn dot(&self, x: &SharedTensor<$t>, y: &SharedTensor<$t>, + result: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let x_get = try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let y_get = try!(y.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))); - let r_get = try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))); - let x_mem = try!(x_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let y_mem = try!(y_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `y`."))); - let r_mem = try!(r_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `result`."))); - unsafe { - let res = CONTEXT.dot( 
::std::mem::transmute::(*x_mem.id_c()), - ::std::mem::transmute::(*y_mem.id_c()), - ::std::mem::transmute::(*r_mem.id_c()), - n, - None, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation dot."))) - } - } + let x_mem = read!(x, self); + let y_mem = read!(y, self); + let r_mem = write_only!(result, self); + exec!(dot, CONTEXT.dot( + trans!(x_mem, $t), + trans!(y_mem, $t), + trans!(r_mem, $t), + n, None, None)) } ); } @@ -159,35 +112,15 @@ macro_rules! iblas_dot_for_cuda { #[macro_export] macro_rules! iblas_nrm2_for_cuda { ($t:ident) => ( - fn nrm2(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } - self.nrm2_plain(x, result) - } - - fn nrm2_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn nrm2(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let x_get = try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let r_get = try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))); - let x_mem = try!(x_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let r_mem = try!(r_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `result`."))); - unsafe { - let res = CONTEXT.nrm2(::std::mem::transmute::(*x_mem.id_c()), - ::std::mem::transmute::(*r_mem.id_c()), - n, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation nrm2."))) - } - } + let x_mem = read!(x, self); + let r_mem = write_only!(result, self); + exec!(nrm2, CONTEXT.nrm2( + trans!(x_mem, $t), + trans!(r_mem, $t), + n, None)) } ); } @@ -195,35 +128,15 @@ macro_rules! iblas_nrm2_for_cuda { #[macro_export] macro_rules! 
iblas_scal_for_cuda { ($t:ident) => ( - fn scal(&self, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - self.scal_plain(a, x) - } - - fn scal_plain(&self, - a: &::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn scal(&self, a: &SharedTensor<$t>, x: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let a_get = try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))); - let x_get = try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let a_mem = try!(a_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `a`."))); - let x_mem = try!(x_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - unsafe { - let res = CONTEXT.scal(::std::mem::transmute::(*a_mem.id_c()), - ::std::mem::transmute::(*x_mem.id_c()), - n, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation scal."))) - } - } + let a_mem = read!(a, self); + let x_mem = read_write!(x, self); + exec!(scal, CONTEXT.scal( + trans!(a_mem, $t), + trans!(x_mem, $t), + n, None)) } ); } @@ -231,36 +144,15 @@ macro_rules! iblas_scal_for_cuda { #[macro_export] macro_rules! iblas_swap_for_cuda { ($t:ident) => ( - fn swap(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - self.swap_plain(x, y) - } - - fn swap_plain(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { + fn swap(&self, x: &mut SharedTensor<$t>, y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { let n = x.desc().size() as i32; - let x_get = try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))); - let y_get = try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))); - let x_mem = try!(x_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `x`."))); - let y_mem = try!(y_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `y`."))); - unsafe { - let res = CONTEXT.swap(::std::mem::transmute::(*x_mem.id_c()), - ::std::mem::transmute::(*y_mem.id_c()), - n, - None, - None); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation swap."))) - } - } + let x_mem = read_write!(x, self); + let y_mem = read_write!(y, self); + exec!(swap, CONTEXT.swap( + trans!(x_mem, $t), + trans!(y_mem, $t), + n, None, None)) } ); } @@ -269,76 +161,49 @@ macro_rules! iblas_swap_for_cuda { macro_rules! 
iblas_gemm_for_cuda { ($t:ident) => ( fn gemm(&self, - alpha: &mut ::collenchyma::tensor::SharedTensor<$t>, - at: ::transpose::Transpose, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - bt: ::transpose::Transpose, - b: &mut ::collenchyma::tensor::SharedTensor<$t>, - beta: &mut ::collenchyma::tensor::SharedTensor<$t>, - c: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match alpha.add_device(self.device()) { _ => try!(alpha.sync(self.device())) } - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match beta.add_device(self.device()) { _ => try!(beta.sync(self.device())) } - match b.add_device(self.device()) { _ => try!(b.sync(self.device())) } - match c.add_device(self.device()) { _ => try!(c.sync(self.device())) } - self.gemm_plain(alpha, at, a, bt, b, beta, c) - } - - fn gemm_plain(&self, - alpha: &::collenchyma::tensor::SharedTensor<$t>, - at: ::transpose::Transpose, - a: &::collenchyma::tensor::SharedTensor<$t>, - bt: ::transpose::Transpose, - b: &::collenchyma::tensor::SharedTensor<$t>, - beta: &::collenchyma::tensor::SharedTensor<$t>, - c: &mut ::collenchyma::tensor::SharedTensor<$t> + alpha: &SharedTensor<$t>, + at: Transpose, + a: &SharedTensor<$t>, + bt: Transpose, + b: &SharedTensor<$t>, + beta: &SharedTensor<$t>, + c: &mut SharedTensor<$t> ) -> Result<(), ::collenchyma::error::Error> { let c_desc = c.desc().clone(); - let alpha_get = try!(alpha.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `alpha`"))); - let alpha_mem = try!(alpha_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `alpha`."))); - let a_get = try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))); - let a_mem = try!(a_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `a`."))); - let b_get = try!(b.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `b`"))); - let b_mem = try!(b_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `b`."))); - let beta_get = try!(beta.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `beta`"))); - let beta_mem = try!(beta_get.as_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `beta`."))); - let c_get = try!(c.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `c`"))); - let c_mem = try!(c_get.as_mut_cuda().ok_or(PluginError::MissingMemoryForDevice("Unable to receive CUDA memory for `c`."))); - unsafe { - let a_0 = a.desc()[0] as i32; - let a_1 = a.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32; - let b_0 = b.desc()[0] as i32; - let b_1 = b.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32; - let c_1 = c_desc.iter().skip(1).fold(1, |prod, i| prod * i) as i32; - let n = match bt { - ::transpose::Transpose::NoTrans => b_1, - _ => b_0 - }; - let (m, k) = match at { - ::transpose::Transpose::NoTrans => (a_0, a_1), - _ => (a_1, a_0) - }; - let lda = a_1; - let ldb = b_1; - let ldc = c_1; - let res = CONTEXT.gemm(::cublas::api::Operation::from(bt), - ::cublas::api::Operation::from(at), - n, m, k, - ::std::mem::transmute::(*alpha_mem.id_c()), - ::std::mem::transmute::(*b_mem.id_c()), // matrix a and b are switched to make it work with row-major 
memory layout. - ldb, - ::std::mem::transmute::(*a_mem.id_c()), - lda, - ::std::mem::transmute::(*beta_mem.id_c()), - ::std::mem::transmute::(*c_mem.id_c()), - ldc); - if res.is_ok() { - Ok(()) - } else { - Err(::collenchyma::error::Error::Plugin(::collenchyma::plugin::Error::Operation("Unable to execute operation gemm."))) - } - } + let alpha_mem = read!(alpha, self); + let beta_mem = read!(beta, self); + let a_mem = read!(a, self); + let b_mem = read!(b, self); + let c_mem = write_only!(c, self); + + let a_0 = a.desc()[0] as i32; + let a_1 = a.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32; + let b_0 = b.desc()[0] as i32; + let b_1 = b.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32; + let c_1 = c_desc.iter().skip(1).fold(1, |prod, i| prod * i) as i32; + let n = match bt { + Transpose::NoTrans => b_1, + _ => b_0 + }; + let (m, k) = match at { + Transpose::NoTrans => (a_0, a_1), + _ => (a_1, a_0) + }; + let lda = a_1; + let ldb = b_1; + let ldc = c_1; + exec!(gemm, CONTEXT.gemm( + ::cublas::api::Operation::from(bt), + ::cublas::api::Operation::from(at), + n, m, k, + trans!(alpha_mem, $t), + trans!(b_mem, $t), // matrix a and b are switched to make it work with row-major memory layout. + ldb, + trans!(a_mem, $t), + lda, + trans!(beta_mem, $t), + trans!(c_mem, $t), + ldc)) } ); } diff --git a/src/frameworks/cuda/mod.rs b/src/frameworks/cuda/mod.rs index 32e810d..ba11068 100644 --- a/src/frameworks/cuda/mod.rs +++ b/src/frameworks/cuda/mod.rs @@ -1,11 +1,12 @@ //! Provides BLAS for a CUDA backend. #![allow(missing_docs)] -use ::plugin::*; use collenchyma::backend::Backend; -use collenchyma::tensor::ITensorDesc; +use collenchyma::tensor::{SharedTensor, ITensorDesc}; use collenchyma::plugin::Error as PluginError; use collenchyma::frameworks::cuda::Cuda; use cublas; +use ::plugin::*; +use ::transpose::Transpose; #[macro_use] pub mod helper; diff --git a/src/frameworks/native.rs b/src/frameworks/native.rs index 604f9bb..d803205 100644 --- a/src/frameworks/native.rs +++ b/src/frameworks/native.rs @@ -1,176 +1,191 @@ //! Provides BLAS for a Native backend. -use ::operation::*; use ::plugin::*; use ::transpose::*; use collenchyma::backend::Backend; -use collenchyma::memory::MemoryType; use collenchyma::frameworks::native::Native; -use collenchyma::plugin::Error; +use collenchyma::tensor::{SharedTensor, ITensorDesc}; use rblas::math::mat::Mat; use rblas::matrix::Matrix; use rblas; -macro_rules! impl_asum_for { - ($t:ident, $b:ty) => ( - impl IOperationAsum<$t> for $b { - fn compute(&self, x: &MemoryType, result: &mut MemoryType) -> Result<(), Error> { - let x_slice = try!(x.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_slice::<$t>(); - let mut r_slice = try!(result.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `result`."))).as_mut_slice::<$t>(); - r_slice[0] = rblas::Asum::asum(x_slice); - Ok(()) - } +macro_rules! read { + ($x:ident, $t:ident, $slf:ident) => ( + try!($x.read($slf.device())).as_native() + .expect("Broken invariant: not a CUDA memory") + .as_slice::<$t>(); + ) +} + +macro_rules! read_write { + ($x:ident, $t: ident, $slf:ident) => ( + try!($x.read_write($slf.device())).as_mut_native() + .expect("Broken invariant: not a CUDA memory") + .as_mut_slice::<$t>(); + ) +} + +macro_rules! 
write_only { + ($x:ident, $t: ident, $slf:ident) => ( + try!($x.write_only($slf.device())).as_mut_native() + .expect("Broken invariant: not a CUDA memory") + .as_mut_slice::<$t>(); + ) +} + + +macro_rules! iblas_asum_for_native { + ($t:ident) => ( + fn asum(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + let r_slice = write_only!(result, $t, self); + r_slice[0] = rblas::Asum::asum(read!(x, $t, self)); + Ok(()) } ); } -macro_rules! impl_axpy_for { - ($t:ident, $b:ty) => ( - impl IOperationAxpy<$t> for $b { - fn compute(&self, a: &MemoryType, x: &MemoryType, y: &mut MemoryType) -> Result<(), Error> { - let a_slice = try!(a.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `a`."))).as_slice::<$t>(); - let x_slice = try!(x.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_slice::<$t>(); - let y_slice = try!(y.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `y`."))).as_mut_slice::<$t>(); - rblas::Axpy::axpy(&a_slice[0], x_slice, y_slice); - Ok(()) - } +macro_rules! iblas_axpy_for_native { + ($t:ident) => ( + fn axpy(&self, a: &SharedTensor<$t>, x: &SharedTensor<$t>, + y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + rblas::Axpy::axpy( + &read!(a, $t, self)[0], + read!(x, $t, self), + read_write!(y, $t, self)); + Ok(()) } ); } -macro_rules! impl_copy_for { - ($t:ident, $b:ty) => ( - impl IOperationCopy<$t> for $b { - fn compute(&self, x: &MemoryType, y: &mut MemoryType) -> Result<(), Error> { - let x_slice = try!(x.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_slice::<$t>(); - let y_slice = try!(y.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `y`."))).as_mut_slice::<$t>(); - rblas::Copy::copy(x_slice, y_slice); - Ok(()) - } +macro_rules! iblas_copy_for_native { + ($t:ident) => ( + fn copy(&self, x: &SharedTensor<$t>, y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + rblas::Copy::copy( + read!(x, $t, self), + write_only!(y, $t, self)); + Ok(()) } ); } -macro_rules! impl_dot_for { - ($t:ident, $b:ty) => ( - impl IOperationDot<$t> for $b { - fn compute(&self, x: &MemoryType, y: &MemoryType, result: &mut MemoryType) -> Result<(), Error> { - let x_slice = try!(x.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_slice::<$t>(); - let y_slice = try!(y.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `y`."))).as_slice::<$t>(); - let mut r_slice = try!(result.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `result`."))).as_mut_slice::<$t>(); - r_slice[0] = rblas::Dot::dot(x_slice, y_slice); - Ok(()) - } +macro_rules! iblas_dot_for_native { + ($t:ident) => ( + fn dot(&self, x: &SharedTensor<$t>, y: &SharedTensor<$t>, + result: &mut SharedTensor<$t> + ) -> Result<(), ::collenchyma::error::Error> { + let r_slice = write_only!(result, $t, self); + r_slice[0] = rblas::Dot::dot(read!(x, $t, self), read!(y, $t, self)); + Ok(()) } ); } -macro_rules! 
impl_nrm2_for { - ($t:ident, $b:ty) => ( - impl IOperationNrm2<$t> for $b { - fn compute(&self, x: &MemoryType, result: &mut MemoryType) -> Result<(), Error> { - let x_slice = try!(x.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_slice::<$t>(); - let mut r_slice = try!(result.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `result`."))).as_mut_slice::<$t>(); - r_slice[0] = rblas::Nrm2::nrm2(x_slice); - Ok(()) - } +macro_rules! iblas_nrm2_for_native { + ($t:ident) => ( + fn nrm2(&self, x: &SharedTensor<$t>, result: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + let r_slice = write_only!(result, $t, self); + r_slice[0] = rblas::Nrm2::nrm2(read!(x, $t, self)); + Ok(()) } ); } -macro_rules! impl_scale_for { - ($t:ident, $b:ty) => ( - impl IOperationScale<$t> for $b { - fn compute(&self, a: &MemoryType, x: &mut MemoryType) -> Result<(), Error> { - let a_slice = try!(a.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `a`."))).as_slice::<$t>(); - let mut x_slice = try!(x.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_mut_slice::<$t>(); - rblas::Scal::scal(&a_slice[0], x_slice); - Ok(()) - } +macro_rules! iblas_scal_for_native { + ($t:ident) => ( + fn scal(&self, a: &SharedTensor<$t>, x: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + rblas::Scal::scal( + &read!(a, $t, self)[0], + read_write!(x, $t, self)); + Ok(()) } ); } -macro_rules! impl_swap_for { - ($t:ident, $b:ty) => ( - impl IOperationSwap<$t> for $b { - fn compute(&self, x: &mut MemoryType, y: &mut MemoryType) -> Result<(), Error> { - let mut x_slice = try!(x.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `x`."))).as_mut_slice::<$t>(); - let mut y_slice = try!(y.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `y`."))).as_mut_slice::<$t>(); - rblas::Swap::swap(x_slice, y_slice); - Ok(()) - } +macro_rules! iblas_swap_for_native { + ($t:ident) => ( + fn swap(&self, x: &mut SharedTensor<$t>, y: &mut SharedTensor<$t>) + -> Result<(), ::collenchyma::error::Error> { + rblas::Swap::swap(read_write!(x, $t, self), read_write!(y, $t, self)); + Ok(()) } ); } -macro_rules! 
impl_gemm_for { - ($t:ident, $b:ty) => ( - impl IOperationGemm<$t> for $b { - fn compute(&self, alpha: &MemoryType, at: Transpose, a_dims: &[usize], a: &MemoryType, bt: Transpose, b_dims: &[usize], b: &MemoryType, beta: &MemoryType, c_dims: &[usize], c: &mut MemoryType) -> Result<(), ::collenchyma::error::Error> { - let alpha_slice = try!(alpha.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `alpha`."))).as_slice::<$t>(); - let a_slice = try!(a.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `a`."))).as_slice::<$t>(); - let beta_slice = try!(beta.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `beta`."))).as_slice::<$t>(); - let b_slice = try!(b.as_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `b`."))).as_slice::<$t>(); - let mut c_slice = try!(c.as_mut_native().ok_or(Error::MissingMemoryForDevice("Unable to receive native memory for `c`."))).as_mut_slice::<$t>(); - - let a_matrix = as_matrix(a_slice, a_dims); - let b_matrix = as_matrix(b_slice, b_dims); - let mut c_matrix = as_matrix(c_slice, c_dims); - rblas::Gemm::gemm(&alpha_slice[0], at.to_rblas(), &a_matrix, bt.to_rblas(), &b_matrix, &beta_slice[0], &mut c_matrix); - read_from_matrix(&c_matrix, c_slice); - Ok(()) - } +macro_rules! iblas_gemm_for_native { + ($t:ident) => ( + fn gemm(&self, + alpha: &SharedTensor<$t>, + at: Transpose, + a: &SharedTensor<$t>, + bt: Transpose, + b: &SharedTensor<$t>, + beta: &SharedTensor<$t>, + c: &mut SharedTensor<$t> + ) -> Result<(), ::collenchyma::error::Error> { + let c_dims = c.desc().clone(); // FIXME: clone() can be removed + + let a_slice = read!(a, $t, self); + let b_slice = read!(b, $t, self); + let c_slice = write_only!(c, $t, self); + + let a_matrix = as_matrix(a_slice, a.desc().dims()); + let b_matrix = as_matrix(b_slice, b.desc().dims()); + let mut c_matrix = as_matrix(c_slice, &c_dims); + rblas::Gemm::gemm( + &read!(alpha, $t, self)[0], + at.to_rblas(), + &a_matrix, + bt.to_rblas(), + &b_matrix, + &read!(beta, $t, self)[0], + &mut c_matrix); + read_from_matrix(&c_matrix, c_slice); + Ok(()) } ); } macro_rules! 
impl_iblas_for { ($t:ident, $b:ty) => ( - impl_asum_for!($t, $b); - impl_axpy_for!($t, $b); - impl_copy_for!($t, $b); - impl_dot_for!($t, $b); - impl_nrm2_for!($t, $b); - impl_scale_for!($t, $b); - impl_swap_for!($t, $b); - - impl_gemm_for!($t, $b); - impl IBlas<$t> for $b { } // Level 1 impl Asum<$t> for $b { - iblas_asum_for!($t, $b); + iblas_asum_for_native!($t); } impl Axpy<$t> for $b { - iblas_axpy_for!($t, $b); + iblas_axpy_for_native!($t); } impl Copy<$t> for $b { - iblas_copy_for!($t, $b); + iblas_copy_for_native!($t); } impl Dot<$t> for $b { - iblas_dot_for!($t, $b); + iblas_dot_for_native!($t); } impl Nrm2<$t> for $b { - iblas_nrm2_for!($t, $b); + iblas_nrm2_for_native!($t); } impl Scal<$t> for $b { - iblas_scale_for!($t, $b); + iblas_scal_for_native!($t); } impl Swap<$t> for $b { - iblas_swap_for!($t, $b); + iblas_swap_for_native!($t); } impl Gemm<$t> for $b { - iblas_gemm_for!($t, $b); + iblas_gemm_for_native!($t); } ); } @@ -239,14 +254,15 @@ mod test { #[test] fn it_converts_correctly_to_and_from_matrix() { let backend = get_native_backend(); - let mut a = SharedTensor::::new(backend.device(), &vec![3, 2]).unwrap(); - write_to_memory(a.get_mut(backend.device()).unwrap(), + let mut a = SharedTensor::::new(&vec![3, 2]).unwrap(); + write_to_memory(a.write_only(backend.device()).unwrap(), &[2f32, 5f32, 2f32, 5f32, 2f32, 5f32]); { - let a_slice_in = a.get(backend.device()).unwrap().as_native().unwrap().as_slice::(); + let a_slice_in = a.read(backend.device()).unwrap() + .as_native().unwrap().as_slice::(); let a_mat = as_matrix(a_slice_in, &[3, 2]); // right assert_eq!(a_mat[0][0], 2f32); diff --git a/src/helper.rs b/src/helper.rs deleted file mode 100644 index b2b6e46..0000000 --- a/src/helper.rs +++ /dev/null @@ -1,289 +0,0 @@ -//! Provides macros for convenient implementation of BLAS operations. - -#[macro_export] -macro_rules! iblas_asum_for { - ($t:ident, $b:ty) => ( - fn asum(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - Ok(try!( - <$b as IOperationAsum<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))), - ) - )) - } - - fn asum_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationAsum<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! 
iblas_axpy_for { - ($t:ident, $b:ty) => ( - fn axpy(&self, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - Ok(try!( - <$b as IOperationAxpy<$t>>::compute(&self, - try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - - fn axpy_plain(&self, - a: &::collenchyma::tensor::SharedTensor<$t>, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationAxpy<$t>>::compute(&self, - try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! iblas_copy_for { - ($t:ident, $b:ty) => ( - fn copy(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => () } - Ok(try!( - <$b as IOperationCopy<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - - fn copy_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationCopy<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! 
iblas_dot_for { - ($t:ident, $b:ty) => ( - fn dot(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - Ok(try!( - <$b as IOperationDot<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))) - ) - )) - } - - fn dot_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - y: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationDot<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))) - ) - )) - } - ); -} - -#[macro_export] -macro_rules! iblas_nrm2_for { - ($t:ident, $b:ty) => ( - fn nrm2(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - Ok(try!( - <$b as IOperationNrm2<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))), - ) - )) - } - - fn nrm2_plain(&self, - x: &::collenchyma::tensor::SharedTensor<$t>, - result: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationNrm2<$t>>::compute(&self, - try!(x.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(result.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `result`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! 
iblas_scale_for { - ($t:ident, $b:ty) => ( - fn scal(&self, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - Ok(try!( - <$b as IOperationScale<$t>>::compute(&self, - try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - ) - )) - } - - fn scal_plain(&self, - a: &::collenchyma::tensor::SharedTensor<$t>, - x: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationScale<$t>>::compute(&self, - try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! iblas_swap_for { - ($t:ident, $b:ty) => ( - fn swap(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match y.add_device(self.device()) { _ => try!(y.sync(self.device())) } - Ok(try!( - <$b as IOperationSwap<$t>>::compute(&self, - try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - - fn swap_plain(&self, - x: &mut ::collenchyma::tensor::SharedTensor<$t>, - y: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationSwap<$t>>::compute(&self, - try!(x.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `x`"))), - try!(y.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `y`"))), - ) - )) - } - ); -} - -#[macro_export] -macro_rules! 
iblas_gemm_for { - ($t:ident, $b:ty) => ( - fn gemm(&self, - alpha: &mut ::collenchyma::tensor::SharedTensor<$t>, - at: ::transpose::Transpose, - a: &mut ::collenchyma::tensor::SharedTensor<$t>, - bt: ::transpose::Transpose, - b: &mut ::collenchyma::tensor::SharedTensor<$t>, - beta: &mut ::collenchyma::tensor::SharedTensor<$t>, - c: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - match alpha.add_device(self.device()) { _ => try!(alpha.sync(self.device())) } - match a.add_device(self.device()) { _ => try!(a.sync(self.device())) } - match beta.add_device(self.device()) { _ => try!(beta.sync(self.device())) } - match b.add_device(self.device()) { _ => try!(b.sync(self.device())) } - match c.add_device(self.device()) { _ => try!(c.sync(self.device())) } - - Ok(try!( - <$b as IOperationGemm<$t>>::compute(&self, - try!(alpha.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `alpha`"))), - at, - &a.desc().clone(), - try!(a.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - bt, - &b.desc().clone(), - try!(b.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `b`"))), - try!(beta.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `beta`"))), - &c.desc().clone(), - try!(c.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `c`"))), - ) - )) - } - - fn gemm_plain(&self, - alpha: &::collenchyma::tensor::SharedTensor<$t>, - at: ::transpose::Transpose, - a: &::collenchyma::tensor::SharedTensor<$t>, - bt: ::transpose::Transpose, - b: &::collenchyma::tensor::SharedTensor<$t>, - beta: &::collenchyma::tensor::SharedTensor<$t>, - c: &mut ::collenchyma::tensor::SharedTensor<$t> - ) -> Result<(), ::collenchyma::error::Error> { - Ok(try!( - <$b as IOperationGemm<$t>>::compute(&self, - try!(alpha.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `alpha`"))), - at, - &a.desc().clone(), - try!(a.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `a`"))), - bt, - &b.desc().clone(), - try!(b.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `b`"))), - try!(beta.get(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `beta`"))), - &c.desc().clone(), - try!(c.get_mut(self.device()).ok_or(::collenchyma::plugin::Error::MissingMemoryForDevice("Unable to resolve memory for `c`"))), - ) - )) - } - ); -} diff --git a/src/lib.rs b/src/lib.rs index e3c4245..82407ef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -61,6 +61,4 @@ pub mod plugin; pub mod binary; pub mod operation; pub mod transpose; -#[macro_use] -pub mod helper; pub mod frameworks; diff --git a/src/plugin.rs b/src/plugin.rs index 08fb4e5..619face 100644 --- a/src/plugin.rs +++ b/src/plugin.rs @@ -11,130 +11,64 @@ pub trait IBlas { } /// Provides the asum operation. pub trait Asum { - /// Computes the absolute sum of vector `x` with complete memory management. + /// Computes the absolute sum of vector `x`. /// /// Saves the result to `result`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `asum_plain`. 
- fn asum(&self, x: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Computes the absolute sum of vector `x` without any memory management. - /// - /// Saves the result to `result`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `asum`. - fn asum_plain(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn asum(&self, x: &SharedTensor, result: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the axpy operation. pub trait Axpy { - /// Computes a vector `x` times a constant `a` plus a vector `y` aka. `a * x + y` with complete memory management. + /// Computes a vector `x` times a constant `a` plus a vector `y` aka. `a * x + y`. /// /// Saves the resulting vector back into `y`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `axpy_plain`. - fn axpy(&self, a: &mut SharedTensor, x: &mut SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Computes a vector `x` times a constant `a` plus a vector `y` aka. `a * x + y` without any memory management. - /// - /// Saves the resulting vector back into `y`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `axpy`. - fn axpy_plain(&self, a: &SharedTensor, x: &SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn axpy(&self, a: &SharedTensor, x: &SharedTensor, y: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the copy operation. pub trait Copy { - /// Copies `x.len()` elements of vector `x` into vector `y` with complete memory management. + /// Copies `x.len()` elements of vector `x` into vector `y`. /// /// Saves the result to `y`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `copy_plain`. - fn copy(&self, x: &mut SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Copies `x.len()` elements of vector `x` into vector `y` without any memory management. - /// - /// Saves the result to `y`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `copy`. - fn copy_plain(&self, x: &SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn copy(&self, x: &SharedTensor, y: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the dot operation. pub trait Dot { - /// Computes the [dot product][dot-product] over x and y with complete memory management. - /// [dot-product]: https://en.wikipedia.org/wiki/Dot_product - /// - /// Saves the resulting value into `result`. - /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `dot_plain`. - fn dot(&self, x: &mut SharedTensor, y: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Computes the [dot product][dot-product] over x and y without any memory management. + /// Computes the [dot product][dot-product] over x and y. /// [dot-product]: https://en.wikipedia.org/wiki/Dot_product /// /// Saves the resulting value into `result`. /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `dot`. - fn dot_plain(&self, x: &SharedTensor, y: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn dot(&self, x: &SharedTensor, y: &SharedTensor, + result: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the nrm2 operation. pub trait Nrm2 { - /// Computes the L2 norm aka. euclidean length of vector `x` with complete memory management. + /// Computes the L2 norm aka. euclidean length of vector `x`. /// /// Saves the result to `result`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `nrm2_plain`. - fn nrm2(&self, x: &mut SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Computes the L2 norm aka. euclidean length of vector `x` without any memory management. - /// - /// Saves the result to `result`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `nrm2`. - fn nrm2_plain(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn nrm2(&self, x: &SharedTensor, result: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the scal operation. pub trait Scal { - /// Scales a vector `x` by a constant `a` aka. `a * x` with complete memory management. + /// Scales a vector `x` by a constant `a` aka. `a * x`. /// /// Saves the resulting vector back into `x`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `scale_plain`. - fn scal(&self, a: &mut SharedTensor, x: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Scales a vector `x` by a constant `a` aka. `a * x` without any memory management. - /// - /// Saves the resulting vector back into `x`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `scale`. - fn scal_plain(&self, a: &SharedTensor, x: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn scal(&self, a: &SharedTensor, x: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the swap operation. @@ -143,19 +77,8 @@ pub trait Swap { /// /// Saves the resulting vector back into `x`. /// This is a Level 1 BLAS operation. - /// - /// For a no-memory managed version see `swap_plain`. - fn swap(&self, x: &mut SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Swaps the content of vector `x` and vector `y` without any memory management. - /// - /// Saves the resulting vector back into `x`. - /// This is a Level 1 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `swap`. - fn swap_plain(&self, x: &mut SharedTensor, y: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn swap(&self, x: &mut SharedTensor, y: &mut SharedTensor) + -> Result<(), ::collenchyma::error::Error>; } /// Provides the gemm operation. @@ -164,19 +87,11 @@ pub trait Gemm { /// Computes a matrix-matrix product with general matrices. /// /// Saves the result into `c`. /// This is a Level 3 BLAS operation. - /// - /// For a no-memory managed version see `gemm_plain`. - fn gemm(&self, alpha: &mut SharedTensor, at: Transpose, a: &mut SharedTensor, bt: Transpose, b: &mut SharedTensor, beta: &mut SharedTensor, c: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; - - /// Computes a matrix-matrix product with general matrices. - /// - /// Saves the result into `c`. - /// This is a Level 3 BLAS operation. - /// - /// *Attention*:
- /// For a correct computation result, you need to manage the memory allocation and synchronization yourself.
- /// For a memory managed version see `gemm`. - fn gemm_plain(&self, alpha: &SharedTensor, at: Transpose, a: &SharedTensor, bt: Transpose, b: &SharedTensor, beta: &SharedTensor, c: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; + fn gemm(&self, alpha: &SharedTensor, + at: Transpose, a: &SharedTensor, + bt: Transpose, b: &SharedTensor, + beta: &SharedTensor, + c: &mut SharedTensor) -> Result<(), ::collenchyma::error::Error>; } /// Allows a BlasBinary to be provided which is used for an IBlas implementation.
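
Usage note (not part of the patch): the sketch below illustrates how the refactored API is meant to be called after this change, based on the new trait signatures and the test helpers that appear in the diff above. It is a minimal, unverified sketch; `get_native_backend` and `write_to_memory` are the test helpers referenced in the native test module, and the tensor shapes, return types, and required `use` items are assumptions rather than guaranteed collenchyma API.

    // Hypothetical caller of the new single-method Asum API. The old
    // add_device()/sync()/asum_plain() dance is gone: read()/write_only()
    // inside the implementation handle allocation and synchronization.
    // Assumes `use collenchyma::tensor::SharedTensor;` and the Asum trait in scope.
    fn asum_usage() -> Result<(), ::collenchyma::error::Error> {
        let backend = get_native_backend();               // test helper from this diff (assumed)

        let mut x = SharedTensor::<f32>::new(&vec![3]).unwrap();
        let mut result = SharedTensor::<f32>::new(&vec![1]).unwrap();
        write_to_memory(x.write_only(backend.device()).unwrap(),
                        &[1f32, -2f32, 3f32]);            // test helper from this diff (assumed)

        try!(backend.asum(&x, &mut result));              // one call, no manual syncing

        let r = result.read(backend.device()).unwrap()
            .as_native().unwrap().as_slice::<f32>();
        assert_eq!(r[0], 6f32);                           // |1| + |-2| + |3|
        Ok(())
    }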