CIS565-Fall-2022 · zedward23 · Sep 16, 2022 · Sep 17, 2022 · Sep 18, 2022 · Sep 18, 2022
diff --git a/README.md b/README.md
@@ -1,14 +1,91 @@
 CUDA Stream Compaction
 ======================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
+Project 2 - CUDA Stream Compaction**
+
+* Edward Zhang
+  * https://www.linkedin.com/in/edwardjczhang/
+  * https://zedward23.github.io/personal_Website/
+
+* Tested on: Windows 10 Home, i7-11800H @ 2.3GHz, 16.0GB, NVIDIA GeForce RTX 3060 Laptop GPU
+
+## Background
+The project contains an implementation of the Scan and Compaction Algorithms.
+
+### Scan 
+Description: 
+Element i of the scan output array is the sum of the input elements at all indices before i. This algorithm was implemented in the following ways:
+
+1. CPU - Non-parallel Scan
+2. Naive - Naively Parallel Scan
+3. Efficient - Parallel Scan using Upsweep and Downsweep on a binary tree representation of an array
+4. Thrust - Scan using Thrust API
+
+### Compaction
+Description: 
+Condenses an array into just its non-zero elements without changing their relative order.
+
+1. CPU - Non-parallel Compact
+2. CPU with Scan - Non-parallel Compact while using Scan
+3. GPU - Parallel Compaction using Efficient Parallel Scan
+
+## Block Size Performance Analysis
+
+![](img/Graph0.png)
+
+A blocksize of 256 seems to yield the best results since it was the first size large enough to take advantage of the parallelism offered by the GPU.
+
+## Scan Performance
+### Powers of 2
+
+![](img/Graph1.png)
+
+Observations:
+- CPU Scan is our baseline
+- Thrust Scan is the fastest; this is expected since it is a library provided to us.
+- Efficient and Naive GPU scan were actually fairly inefficient; this is likely due to suboptimal thread allocation.
+
+### Non-Powers of 2
+
+![](img/Graph2.png)
+
+Observations:
+- The same observations hold as for array lengths that are powers of 2.
+
+## Compact
+
+![](img/Graph3.png)
+
+Observations:
+- Compaction without Scan on the CPU is actually faster than with Scan
+- GPU implementations are still slower than the CPU implementations
+
+## Why is My GPU Approach So Slow? (Extra Credit) (+5)
+
+If you implement your efficient scan version following the slides closely, there's a good chance
+that you are getting an "efficient" gpu scan that is actually not that efficient -- it is slower than the cpu approach?
+
+Though it is totally acceptable for this assignment,
+in addition to explaining the reason for this phenomenon, you are encouraged to try to upgrade your work-efficient GPU scan.
+
+Thinking about these may lead you to an aha moment:
+- What is the occupancy at a deeper level in the upper/down sweep? Are most threads actually working?
+
+  Most threads are just idling, since at each level fewer and fewer indices need to be written to.
+
+- Are you always launching the same number of blocks throughout each level of the upper/down sweep?
+
+  I am always launching the same number of blocks regardless of how many indices should actually be written to.
+
+- If some threads are being lazy, can we do an early termination on them?
+
+  Even if we terminate them early, we cannot move on to the next iteration until the threads that still have indices to write have finished.
+
+- How can I compact the threads? What should I modify to keep the remaining threads still working correctly?
+
+  On each iteration, dynamically dispatch the optimal number of threads and blocks that operate only on the specific indices that need to be modified.
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
 
-### (TODO: Your README)
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
 
diff --git a/img/Graph0.png b/img/Graph0.png
diff --git a/img/Graph1.png b/img/Graph1.png
diff --git a/img/Graph2.png b/img/Graph2.png
diff --git a/img/Graph3.png b/img/Graph3.png
diff --git a/src/main.cpp b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 26; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -51,48 +51,49 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-    onesArray(SIZE, c);
+    //For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+    /*onesArray(SIZE, c);
     printDesc("1s array for finding bugs");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
+    printArray(SIZE, c, true);*/
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
+    printArray(SIZE, a, true);
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
-    zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, a, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -131,20 +132,21 @@ int main(int argc, char* argv[]) {
     count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     printArray(count, c, true);
+
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     system("pause"); // stop Win32 console from closing on exit

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
@@ -2,6 +2,7 @@
 #include "cpu.h"
 
 #include "common.h"
+#define IDENTITY 0
 
 namespace StreamCompaction {
     namespace CPU {
@@ -20,6 +21,12 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            // Inclusive running sum: odata[i] = idata[0] + ... + idata[i].
+            // NOTE(review): README describes sums over indices *before* i; confirm intent.
+            if (n > 0) odata[0] = idata[0];  // guard: never touch index 0 when n <= 0
+            for (int i = 1; i < n; i++) {
+                odata[i] = idata[i] + odata[i-1];
+            }
             timer().endCpuTimer();
         }
 
@@ -31,8 +38,18 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            // Single pass: append every non-zero input value to odata in
+            // its original relative order. `kept` is the next free slot in
+            // odata, which is also the number of values written so far.
+            int kept = 0;
+            for (int src = 0; src < n; ++src) {
+                const int value = idata[src];
+                if (value == 0) continue;  // zeros are dropped
+                odata[kept++] = value;
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return kept;
         }
 
         /**
@@ -42,9 +59,45 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+
+            // Number of elements written to odata. Stays 0 for n <= 0 so
+            // that idata[0] is never read and no scratch buffer is allocated.
+            int currIdx = 0;
+
+            if (n > 0) {
+                // Scratch buffer holding, in turn, the keep flags and then
+                // their exclusive prefix sum (the scatter destinations).
+                int* scanned = new int[n];
+
+                // Map: 1 for every element to keep (non-zero, including
+                // negative values), 0 for every element to drop.
+                for (int i = 0; i < n; i++) {
+                    scanned[i] = (idata[i] != 0) ? 1 : 0;
+                }
+
+                // Exclusive scan, in place: scanned[i] becomes the number of
+                // kept elements before index i, i.e. the output slot for i.
+                int running = 0;
+                for (int i = 0; i < n; i++) {
+                    const int flag = scanned[i];
+                    scanned[i] = running;
+                    running += flag;
+                }
+
+                // Scatter: copy each kept element to its slot. Slots at or
+                // beyond the returned count are left untouched.
+                for (int i = 0; i < n; i++) {
+                    if (idata[i] != 0) {
+                        odata[scanned[i]] = idata[i];
+                    }
+                }
+                currIdx = running;
+
+                delete[] scanned;
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return currIdx;
         }
     }
 }