Commit
Merge pull request #147 from ComputationalRadiationPhysics/dev
Release 2.3.0crp
ax3l authored Jun 11, 2018
2 parents 80bf2b0 + 1ca54d6 commit 4b779a3
Showing 26 changed files with 982 additions and 712 deletions.
24 changes: 16 additions & 8 deletions .travis.yml
@@ -1,31 +1,39 @@
language: cpp

sudo: required

dist: trusty

compiler:
- gcc

env:
global:
- INSTALL_DIR=~/mylibs
- CXXFLAGS="-Werror"

script:
- mkdir build_tmp && cd build_tmp
- CXXFLAGS="-Werror" cmake -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR $TRAVIS_BUILD_DIR
- cmake -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR $TRAVIS_BUILD_DIR
- make
- make install
- make examples

before_script:
- sudo add-apt-repository --yes ppa:smspillaz/cmake-2.8.12
- cat /etc/apt/sources.list
- cat /etc/apt/sources.list.d/*
- sudo apt-add-repository multiverse
- sudo apt-get update -qq
- sudo dpkg --configure -a
- sudo apt-get install -f -qq
- sudo dpkg --get-selections | grep hold || { echo "All packages OK."; }
- sudo apt-get install -q -y cmake-data cmake
- sudo apt-get install -qq build-essential
- sudo apt-get install -qq gcc-4.4 g++-4.4
- sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.4 60 --slave /usr/bin/g++ g++ /usr/bin/g++-4.4
- gcc --version && g++ --version
- gcc --version && g++ --version # 4.8
- apt-cache search nvidia-*
- sudo apt-get install -qq nvidia-common
- sudo apt-get install -qq nvidia-current
- sudo apt-get install -qq nvidia-cuda-toolkit nvidia-cuda-dev
- sudo apt-get install -qq libboost1.48-dev
- sudo apt-get install -qq nvidia-cuda-dev nvidia-cuda-toolkit # 5.5
- sudo apt-get install -qq libboost-dev # 1.54.0
- sudo find /usr/ -name libcuda*.so

after_script:
36 changes: 36 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,42 @@
Change Log / Release Log for mallocMC
================================================================

2.3.0crp
--------
**Date:** 2018-06-11

This release adds support for CUDA 9 and clang's -x cuda frontend and fixes several bugs.
Global objects have been refactored to separate objects on host and device.
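In practice this means the heap is now a regular host-side object whose handle is passed into kernels instead of being reached through global free functions. A minimal sketch of the new pattern, assuming the `ScatterAllocator` typedef from [Usage.md](Usage.md) (kernel and variable names are illustrative only):

```c++
__global__ void exampleKernel( ScatterAllocator::AllocatorHandle sah )
{
    // device-side allocation through the handle instead of mallocMC::malloc()
    int* p = (int*) sah.malloc( 42 * sizeof(int) );
    // ... use p ...
    sah.free( p );
}

int main()
{
    // host-side heap object (512 MiB), replaces mallocMC::initHeap()
    ScatterAllocator sa( 512U * 1024U * 1024U );
    exampleKernel<<< 32, 32 >>>( sa ); // converts to a device-side handle
    return 0;
}
```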

### Changes to mallocMC 2.2.0crp

**Features**
- CUDA 9 support #144 #145
- clang++ -x cuda support #133
- add `destructiveResize` method #136
- heap as separate object on host and device, no more globals #116
- use `BOOST_STATIC_CONSTEXPR` where possible #109

**Bug fixes**
- fix uninitialized pointers #110 #112
- fix crash in getAvailableSlots #106 #107
- Fix `uint32_t` cstdint #104 #105
- fix missing boost include #142
- fix includes from C headers #121
- fix missing local size change in `finalizeHeap()` #135
- check heap pointer in Scatter creation policy #126

**Misc:**
- better link usage and install docs #141
- self consistent allocator #140
- rename some shadowed variables in C++11 mode #108
- properly enforce `-Werror` in Travis-CI #128
- update Travis-CI image #119
- improved docs #125 #127

Thanks to Carlchristian Eckert, René Widera, Axel Huebl and Alexander Grund for contributing to this release!


2.2.0crp
-------------
**Date:** 2015-09-25
6 changes: 5 additions & 1 deletion CMakeLists.txt
@@ -81,18 +81,22 @@ INSTALL(
###############################################################################
# Executables
###############################################################################
add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example02 VerifyHeap)
add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example02 mallocMC_Example03 VerifyHeap)

cuda_add_executable(mallocMC_Example01
EXCLUDE_FROM_ALL
examples/mallocMC_example01.cu )
cuda_add_executable(mallocMC_Example02
EXCLUDE_FROM_ALL
examples/mallocMC_example02.cu )
cuda_add_executable(mallocMC_Example03
EXCLUDE_FROM_ALL
examples/mallocMC_example03.cu )
cuda_add_executable(VerifyHeap
EXCLUDE_FROM_ALL
tests/verify_heap.cu )

target_link_libraries(mallocMC_Example01 ${LIBS})
target_link_libraries(mallocMC_Example02 ${LIBS})
target_link_libraries(mallocMC_Example03 ${LIBS})
target_link_libraries(VerifyHeap ${LIBS})
24 changes: 18 additions & 6 deletions README.md
@@ -8,6 +8,20 @@ accelerators**. Currently, it supports **NVIDIA GPUs** of compute capability
`sm_20` or higher through the *ScatterAlloc* algorithm.


Usage
-------

Follow the step-by-step instructions in [Usage.md](Usage.md) to replace your
`new`/`malloc` calls with a *blazingly fast* mallocMC heap! :rocket:


Install
-------

mallocMC is header-only, but requires a few other C++ libraries to be
available. Our installation notes can be found in [INSTALL.md](INSTALL.md).


On the ScatterAlloc Algorithm
-----------------------------

@@ -42,16 +56,11 @@ Branches

| *branch* | *state* | *description* |
| ----------- | ------- | ----------------------- |
| **master** | [![Build Status Master](https://travis-ci.org/ComputationalRadiationPhysics/mallocMC.png?branch=master)](https://travis-ci.org/ComputationalRadiationPhysics/mallocMC "master") | our stable new releases |
| **master** | [![Build Status Master](https://travis-ci.org/ComputationalRadiationPhysics/mallocMC.png?branch=master)](https://travis-ci.org/ComputationalRadiationPhysics/mallocMC "master") | our latest stable release |
| **dev** | [![Build Status Development](https://travis-ci.org/ComputationalRadiationPhysics/mallocMC.png?branch=dev)](https://travis-ci.org/ComputationalRadiationPhysics/mallocMC "dev") | our development branch - start and merge new branches here |
| **tugraz** | n/a | *ScatterAlloc* "upstream" branch: not backwards compatible mirror for algorithmic changes |


Install
-------

Installation notes can be found in [INSTALL.md](INSTALL.md).


Literature
----------
@@ -64,6 +73,9 @@ Just an incomplete link collection for now:
- 2012, May 5th: [Presentation](http://innovativeparallel.org/Presentations/inPar_kainz.pdf)
at *Innovative Parallel Computing 2012* by *Bernhard Kainz*

- Junior Thesis [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.34461.svg)](http://dx.doi.org/10.5281/zenodo.34461) by
Carlchristian Eckert (2014)


License
-------
60 changes: 32 additions & 28 deletions Usage.md
@@ -13,7 +13,7 @@ There is one header file that will include *all* necessary files:
Step 2a: choose policies
-----------------------

Each instance of a policy based allocator is composed through 5 **policies**. Each policy is expressed as a **policy class**.
Each instance of a policy based allocator is composed through 5 **policies**. Each policy is expressed as a **policy class**.

Currently, there are the following policy classes available:

@@ -73,7 +73,7 @@ could create the following typedef instead:
```c++
using namespace mallocMC;

typedef mallocMC::Allocator<
typedef mallocMC::Allocator<
CreationPolicies::Scatter<>,
DistributionPolicies::XMallocSIMD<>,
OOMPolicies::ReturnNull,
@@ -90,59 +90,63 @@ configuration struct defined above.
Step 3: instantiate allocator
-----------------------------

To create a default instance of the ScatterAllocator type and add the necessary
functions, the following Macro has to be executed:
To use the defined allocator type, create an instance with the desired heap size:

```c++
MALLOCMC_SET_ALLOCATOR_TYPE(ScatterAllocator)
ScatterAllocator sa( 512U * 1024U * 1024U ); // heap size of 512MiB
```
This will set up the following functions in the namespace `mallocMC`:
The allocator object offers the following methods:
| Name | description |
|---------------------- |-------------------------|
| getAvailableSlots(size_t) | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<ScatterAllocator>::providesAvailableSlots`) |
| Name | description |
|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| mallocMC::initHeap() | Initializes the heap. Must be called before any other calls to the allocator are permitted. Can take the desired size of the heap as a parameter |
| mallocMC::finalizeHeap() | Destroys the heap again |
| mallocMC::malloc() | Allocates memory on the accelerator |
| mallocMC::free() | Frees memory on the accelerator |
| mallocMC::getAvailableSlots() | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<ScatterAllocator>::providesAvailableSlots`) |
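As a host-side illustration of the `getAvailableSlots` entry above (a sketch only, assuming the `ScatterAllocator` typedef from step 2a and an `#include <iostream>`; the 512 MiB heap and the 1 MiB slot size are arbitrary example values):

```c++
ScatterAllocator sa( 512U * 1024U * 1024U ); // 512 MiB device heap

// only meaningful if the chosen CreationPolicy supports slot counting
if( mallocMC::Traits< ScatterAllocator >::providesAvailableSlots )
{
    // how many 1 MiB allocations would still fit into the heap
    auto slots = sa.getAvailableSlots( 1024U * 1024U );
    std::cout << "allocatable 1 MiB slots: " << slots << std::endl;
}
```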
Step 4: use dynamic memory allocation in a kernel
-------------------------------------------------
A handle to the allocator object must be passed to each kernel. The handle type is defined as an internal type of the allocator. Inside the kernel, this handle can be used to request memory.
The handle offers the following methods:
| Name | description |
|---------------------- |-------------------------|
| malloc(size_t) | Allocates memory on the accelerator |
| free(void*) | Frees memory on the accelerator |
| getAvailableSlots() | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<ScatterAllocator>::providesAvailableSlots`) |
Step 4: use dynamic memory allocation
-------------------------------------
A simplistic example would look like this:
```c++
#include <mallocMC/mallocMC.hpp>
namespace MC = mallocMC;
typedef MC::Allocator<
typedef MC::Allocator<
MC::CreationPolicies::Scatter<>,
MC::DistributionPolicies::XMallocSIMD<>,
MC::OOMPolicies::ReturnNull,
MC::ReservePoolPolicies::SimpleCudaMalloc,
MC::AlignmentPolicies::Shrink<ShrinkConfig>
> ScatterAllocator;
MALLOCMC_SET_ALLOCATOR_TYPE(ScatterAllocator)
__global__ void exampleKernel()
__global__ void exampleKernel(ScatterAllocator::AllocatorHandle sah)
{
// some code ...
int* a = (int*) MC::malloc(sizeof(int)*42);
int* a = (int*) sah.malloc(sizeof(int)*42);
// some more code, using *a
MC::free(a);
sah.free(a);
}
int main(){
MC::initHeap(512); // heapsize of 512MB
ScatterAllocator sa( 1U * 512U * 1024U * 1024U ); // heap size of 512MiB
exampleKernel<<< 32, 32 >>>(sa);
exampleKernel<<<32,32>>>();
MC::finalizeHeap();
return 0;
}
```

For more usage examples, have a look at the [examples](examples).
63 changes: 35 additions & 28 deletions examples/mallocMC_example01.cu
@@ -27,7 +27,7 @@
*/

#include <iostream>
#include <assert.h>
#include <cassert>
#include <vector>
#include <numeric>

@@ -55,28 +55,28 @@ int main()
}


__device__ int** a;
__device__ int** b;
__device__ int** c;
__device__ int** arA;
__device__ int** arB;
__device__ int** arC;


__global__ void createArrays(int x, int y){
a = (int**) mallocMC::malloc(sizeof(int*) * x*y);
b = (int**) mallocMC::malloc(sizeof(int*) * x*y);
c = (int**) mallocMC::malloc(sizeof(int*) * x*y);
__global__ void createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle mMC){
arA = (int**) mMC.malloc(sizeof(int*) * x*y);
arB = (int**) mMC.malloc(sizeof(int*) * x*y);
arC = (int**) mMC.malloc(sizeof(int*) * x*y);
}


__global__ void fillArrays(int length, int* d){
__global__ void fillArrays(int length, int* d, ScatterAllocator::AllocatorHandle mMC){
int id = threadIdx.x + blockIdx.x*blockDim.x;

a[id] = (int*) mallocMC::malloc(length*sizeof(int));
b[id] = (int*) mallocMC::malloc(length*sizeof(int));
c[id] = (int*) mallocMC::malloc(sizeof(int)*length);
arA[id] = (int*) mMC.malloc(length*sizeof(int));
arB[id] = (int*) mMC.malloc(length*sizeof(int));
arC[id] = (int*) mMC.malloc(sizeof(int)*length);

for(int i=0 ; i<length; ++i){
a[id][i] = id*length+i;
b[id][i] = id*length+i;
arA[id][i] = id*length+i;
arB[id][i] = id*length+i;
}
}

@@ -86,17 +86,24 @@ __global__ void addArrays(int length, int* d){

d[id] = 0;
for(int i=0 ; i<length; ++i){
c[id][i] = a[id][i] + b[id][i];
d[id] += c[id][i];
arC[id][i] = arA[id][i] + arB[id][i];
d[id] += arC[id][i];
}
}


__global__ void freeArrays(){
__global__ void freeArrays(ScatterAllocator::AllocatorHandle mMC){
int id = threadIdx.x + blockIdx.x*blockDim.x;
mallocMC::free(a[id]);
mallocMC::free(b[id]);
mallocMC::free(c[id]);
mMC.free(arA[id]);
mMC.free(arB[id]);
mMC.free(arC[id]);
}


__global__ void freeArrayPointers(ScatterAllocator::AllocatorHandle mMC){
mMC.free(arA);
mMC.free(arB);
mMC.free(arC);
}


@@ -109,7 +116,7 @@ void run()

//init the heap
std::cerr << "initHeap...";
mallocMC::initHeap(1U*1024U*1024U*1024U); //1GB for device-side malloc
ScatterAllocator mMC(1U*1024U*1024U*1024U); //1GB for device-side malloc
std::cerr << "done" << std::endl;

std::cout << ScatterAllocator::info("\n") << std::endl;
@@ -122,18 +129,18 @@
std::vector<int> array_sums(block*grid,0);

// create arrays of arrays on the device
createArrays<<<1,1>>>(grid,block);
createArrayPointers<<<1,1>>>(grid,block, mMC );

// fill 2 of them all with ascending values
fillArrays<<<grid,block>>>(length, d);
fillArrays<<<grid,block>>>(length, d, mMC );

// add the 2 arrays (vector addition within each thread)
// and do a thread-wise reduce to d
addArrays<<<grid,block>>>(length, d);

cudaMemcpy(&array_sums[0],d,sizeof(int)*block*grid,cudaMemcpyDeviceToHost);

mallocMC::getAvailableSlots(1024U*1024U); //get available megabyte-sized slots
mMC.getAvailableSlots(1024U*1024U); //get available megabyte-sized slots

int sum = std::accumulate(array_sums.begin(),array_sums.end(),0);
std::cout << "The sum of the arrays on GPU is " << sum << std::endl;
@@ -142,8 +149,8 @@
int gaussian = n*(n-1);
std::cout << "The gaussian sum as comparison: " << gaussian << std::endl;

freeArrays<<<grid,block>>>();
freeArrays<<<grid,block>>>( mMC );
freeArrayPointers<<<1,1>>>( mMC );
cudaFree(d);
//finalize the heap again
mallocMC::finalizeHeap();

}