-
Notifications
You must be signed in to change notification settings - Fork 0
/
Memory_Transfer.cu
63 lines (49 loc) · 1.44 KB
/
Memory_Transfer.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include<cstring>
#include<time.h>
//S1L12: Memory transfer between host and device
__global__ void mem_trs_test(int * input)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
printf("tid : %d, gid : %d, value : %d \n", threadIdx.x, gid, input[gid] );
}
__global__ void mem_trs_test2(int* input, int size) //More optimize .
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
if(gid<size)
printf("tid : %d, gid : %d, value : %d \n", threadIdx.x, gid, input[gid]);
//if we don't give this condition still the other 10 threads will try to acess the array and it'll find the value 0
}
int main()
{
int size = 150;
int byte_size = size * sizeof(int);
int* h_input;
h_input = (int*) malloc(byte_size);
//time to randomly initialize the array
time_t t;
srand((unsigned)time(&t));
for (int i = -0; i < size; i++)
{
h_input[i] = (int)(rand() & 0xff);
}
int * d_input;
cudaMalloc((void**) &d_input, byte_size);
cudaMemcpy(d_input, h_input, byte_size, cudaMemcpyHostToDevice);
dim3 block(32);
dim3 grid(5);
//mem_trs_test << <grid, block >> > (d_input);
mem_trs_test2 << <grid, block >> > (d_input, size);
//thread block size affects the performance.
//usually we keep the thread block size to multiplication of 32
cudaDeviceSynchronize();
//Reclaim memory
cudaFree(d_input);
free(h_input);
cudaDeviceReset();
return 0;
}