Merge pull request #20 from ax3l/release-1.0.2crp

Release 1.0.2crp
alpaka-group · Jan 7, 2014 · ec9c8e0 · ec9c8e0
2 parents bbf1a38 + cdef072
commit ec9c8e0
Show file tree

Hide file tree

Showing 7 changed files with 89 additions and 22 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,20 @@
+language: cpp
+
+compiler:
+  - gcc
+
+script:
+  - mkdir build_tmp && cd build_tmp
+  - cmake $TRAVIS_BUILD_DIR
+  - make
+
+before_script:
+  - sudo apt-get update -qq
+  - sudo apt-get install -qq build-essential
+  - sudo apt-get install -qq gcc-4.4 g++-4.4
+  - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.4 60 --slave /usr/bin/g++ g++ /usr/bin/g++-4.4
+  - gcc --version && g++ --version
+  - sudo apt-get install -qq nvidia-common
+  - sudo apt-get install -qq nvidia-current
+  - sudo apt-get install -qq nvidia-cuda-toolkit nvidia-cuda-dev
+  - sudo find /usr/ -name libcuda*.so
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,25 @@
+Change Log / Release Log for ScatterAlloc
+================================================================
+
+1.0.2crp
+-------------
+**Date:** 2014-01-07
+
+This is our first bug fix release.
+We closed all issues documented in
+[Milestone *Bug fixes*](https://github.com/ComputationalRadiationPhysics/scatteralloc/issues?milestone=1&state=closed)
+
+### Changes to 1.0.1
+
+**Features:**
+  - added travis-ci.org support for compile tests #7
+
+**Bug fixes:**
+  - broken cmake/compile #1
+  - g++ warnings #10
+  - only N-1 access blocks used instead of N #2
+  - 32bit bug: allocate more than 4GB #12
+
+**Misc:**
+  See the full changes at
+  https://github.com/ComputationalRadiationPhysics/scatteralloc/compare/1.0.1...1.0.2crp
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,7 +1,9 @@
 project(ScatterAlloc)
 cmake_minimum_required(VERSION 2.6)
-set(CUDA_NVCC_FLAGS "-arch=sm_20;-use_fast_math;")
+
 find_package(CUDA REQUIRED)
+set(CUDA_NVCC_FLAGS "-arch=sm_20;-use_fast_math;")
+set(CUDA_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR})
 include_directories(${CUDA_INCLUDE_DIRS})
 cuda_include_directories(${CUDA_INCLUDE_DIRS})
 
@@ -17,7 +19,4 @@ if(NOT ${CUDA_OPTIMIZATION_TYPE} STREQUAL  "unset")
 endif()
 
 cuda_add_executable(ScatterAllocExample 
-                    example.cu 
-                    ${CMAKE_CURRENT_SOURCE_DIR}/tools/heap.cuh
-					${CMAKE_CURRENT_SOURCE_DIR}/tools/heap_impl.cuh 
-					${CMAKE_CURRENT_SOURCE_DIR}/tools/utils.h)
+                    example.cu )
diff --git a/README.md b/README.md
@@ -3,6 +3,23 @@ ScatterAlloc
 
 ScatterAlloc: Massively Parallel Dynamic Memory Allocation for the GPU
 
+This project provides a **fast memory manager** for **Nvidia GPUs** with
+compute capability `sm_20` or higher.
+
+From http://www.icg.tugraz.at/project/mvp/downloads :
+```quote
+ScatterAlloc is a dynamic memory allocator for the GPU. It is
+designed concerning the requirements of massively parallel
+execution.
+
+ScatterAlloc greatly reduces collisions and congestion by
+scattering memory requests based on hashing. It can deal with
+thousands of GPU-threads concurrently allocating memory and its
+execution time is almost independent of the thread count.
+
+ScatterAlloc is open source and easy to use in your CUDA projects.
+```
+
 Original Homepage: http://www.icg.tugraz.at/project/mvp
 
 Our Homepage: https://www.hzdr.de/crp
@@ -15,8 +32,8 @@ This repository is a
 [fork](https://en.wikipedia.org/wiki/Fork_%28software_development%29)
 of the **ScatterAlloc** project from the
 [Managed Volume Processing](http://www.icg.tugraz.at/project/mvp)
-group at [Institute for Computer Graphics](http://www.icg.tugraz.at) and
-Vision, TU Graz (kudos!).
+group at [Institute for Computer Graphics and Vision](http://www.icg.tugraz.at),
+TU Graz (kudos!).
 
 Our aim is to improve the implementation, add new features and to fix some
 minor bugs.

diff --git a/example.cu b/example.cu
@@ -128,4 +128,4 @@ void runexample(int cuda_device)
   CUDA_CHECKED_CALL(cudaDeviceSynchronize());
   freeSomething<<<grid,block>>>(data);
   CUDA_CHECKED_CALL(cudaDeviceSynchronize());
-}
+}
diff --git a/tools/heap.cuh b/tools/heap.cuh
@@ -4,8 +4,11 @@
 
   Copyright (C) 2012 Institute for Computer Graphics and Vision,
                      Graz University of Technology
+  Copyright (C) 2014 Institute of Radiation Physics,
+                     Helmholtz-Zentrum Dresden - Rossendorf
 
   Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
+              Rene Widera - r.widera ( at ) hzdr.de
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -29,6 +32,7 @@
 #ifndef HEAP_CUH
 #define HEAP_CUH
 
+#include <stdio.h>
 #include "tools/utils.h"
 
 namespace GPUTools
@@ -114,7 +118,7 @@ namespace GPUTools
     volatile uint* _regions;
     PAGE* _page;
     uint _numpages;
-    uint _memsize;
+    size_t _memsize;
     uint _pagebasedMutex;
     volatile uint _firstFreePageBased;
     volatile uint _firstfreeblock;
@@ -171,7 +175,6 @@ namespace GPUTools
           return -1;
         spot = nextspot(old, spot, spots);
       }
-      return -1;
     }
 
     /**
@@ -273,7 +276,7 @@ namespace GPUTools
       {
         for(uint b = startblock; b < accessblocks; ++b)
         {
-          while(ptetry < b*pagesperblock)
+          while(ptetry < (b+1)*pagesperblock)
           {
             uint region = ptetry/regionsize;
             uint regionfilllevel = _regions[region];
@@ -579,7 +582,7 @@ namespace GPUTools
       //take care of padding
       bytes = (bytes + dataAlignment - 1) & ~(dataAlignment-1);
 
-      bool use_coalescing = false; 
+      bool can_use_coalescing = false; 
       uint myoffset = 0;
       uint warpid = GPUTools::warpid();
 
@@ -592,15 +595,15 @@ namespace GPUTools
       if (coalescible && threadcount > 1) 
       {
         myoffset = atomicAdd(&warp_sizecounter[warpid], bytes);
-        use_coalescing = true;
+        can_use_coalescing = true;
       }
 
       uint req_size = bytes;
-      if (use_coalescing)
+      if (can_use_coalescing)
         req_size = (myoffset == 16) ? warp_sizecounter[warpid] : 0;
 
       char* myalloc = (char*)alloc_internal_direct(req_size);
-      if (req_size && use_coalescing) 
+      if (req_size && can_use_coalescing) 
       {
         warp_res[warpid] = myalloc;
         if (myalloc != 0)
@@ -609,7 +612,7 @@ namespace GPUTools
       __threadfence_block();
 
       void *myres = myalloc;
-      if(use_coalescing) 
+      if(can_use_coalescing) 
       {
         if(warp_res[warpid] != 0)
           myres = warp_res[warpid] + myoffset;
@@ -661,7 +664,7 @@ namespace GPUTools
      * @param memory pointer to the memory used for the heap
      * @param memsize size of the memory in bytes
      */
-    __device__ void init(void* memory, uint memsize)
+    __device__ void init(void* memory, size_t memsize)
     {
       uint linid = threadIdx.x + blockDim.x*(threadIdx.y + threadIdx.z*blockDim.y);
       uint threads = blockDim.x*blockDim.y*blockDim.z;
@@ -711,7 +714,7 @@ namespace GPUTools
         _pagebasedMutex = 0;
         _firstFreePageBased = numpages-1;
 
-        if(_page[numpages].data - 1 >= (char*)(memory) + memsize)
+        if( (char*) (_page+numpages) > (char*)(memory) + memsize)
           printf("error in heap alloc: numpages too high\n");
       }
 
@@ -748,7 +751,7 @@ namespace GPUTools
     * global init heap method
     */
   template<uint pagesize, uint accessblocks, uint regionsize, uint wastefactor,  bool use_coalescing, bool resetfreedpages>
-  __global__ void initHeap(DeviceHeap<pagesize, accessblocks, regionsize, wastefactor, use_coalescing, resetfreedpages>* heap, void* heapmem, uint memsize)
+  __global__ void initHeap(DeviceHeap<pagesize, accessblocks, regionsize, wastefactor, use_coalescing, resetfreedpages>* heap, void* heapmem, size_t memsize)
   {
     heap->init(heapmem, memsize);
   }

diff --git a/tools/heap_impl.cuh b/tools/heap_impl.cuh
@@ -4,8 +4,11 @@
 
   Copyright (C) 2012 Institute for Computer Graphics and Vision,
                      Graz University of Technology
+  Copyright (C) 2014 Institute of Radiation Physics,
+                     Helmholtz-Zentrum Dresden - Rossendorf
 
   Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
+              Rene Widera - r.widera ( at ) hzdr.de
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -54,11 +57,11 @@ void* initHeap(size_t memsize = 8*1024U*1024U)
 #ifdef __CUDACC__
 #ifdef OVERWRITE_MALLOC
 #if __CUDA_ARCH__ >= 200
-__device__ void* malloc(size_t t)
+__device__ void* malloc(size_t t) __THROW
 {
   return theHeap.alloc(t);
 }
-__device__ void  free(void* p)
+__device__ void  free(void* p) __THROW
 {
   theHeap.dealloc(p);
 }
@@ -96,4 +99,4 @@ __device__ void operator delete[](void* mem, GPUTools::DeviceHeap<pagesize, acce
 
 #endif //__CUDACC__
 
-#endif //HEAP_IMPL_CUH
+#endif //HEAP_IMPL_CUH