float *x, *y;

/* ==========================================
   Allocate arrays to store vector x and y
   ========================================== */
x = calloc( N, sizeof(float) );
y = calloc( N, sizeof(float) );

/* ===============================================
   CPU version of the vector addition algorithm
   =============================================== */
for (i = 0; i < N; i++)          // Add N elements...
    y[i] = x[i] + y[i];          // Add one element at a time...
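The fragment above leaves out the surrounding program (argument parsing, initialization, and the timing code that produces the measurements shown below). A minimal self-contained sketch of such a CPU program, assuming gettimeofday() for the timing and constant initial values for x and y (neither of which is shown in the original file), could look like this:

    /* Hypothetical sketch -- not the course's actual cpu-add-vector.c */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/time.h>

    int main(int argc, char *argv[])
    {
        int    N = atoi(argv[1]);            // vector length from the command line
        float *x, *y;

        x = calloc(N, sizeof(float));        // allocate the two vectors
        y = calloc(N, sizeof(float));

        for (int i = 0; i < N; i++)          // fill in some sample input values
        {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }

        struct timeval start, end;
        gettimeofday(&start, NULL);

        for (int i = 0; i < N; i++)          // CPU vector addition: one element at a time
            y[i] = x[i] + y[i];

        gettimeofday(&end, NULL);
        long elapsed = (end.tv_sec - start.tv_sec) * 1000000L
                     + (end.tv_usec - start.tv_usec);
        printf("Elapsed time = %ld micro secs\n", elapsed);

        free(x);
        free(y);
        return 0;
    }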
How the program code progresses in time:

   (timing diagram omitted: the single CPU thread adds the N elements one after another)
File:    /home/cs355001/demo/CUDA/3-add-vector/cpu-add-vector.c
Compile: nvcc -g cpu-add-vector.c -o cpu-add-vector
Sample runs:
   cpu-add-vector 100000
   Elasped time = 611 micro secs
   cpu-add-vector 10000000
   Elasped time = 41079 micro secs
/* ==================================================
   "Serial" version of the vector addition algorithm
   ================================================== */
for (i = 0; i < N; i++)
    y[i] = x[i] + y[i];          // Add 1 element at a time
__global__
void add(int n, float *x, float *y)
{
    for (int i = 0; i < n; i++)  // Same code as CPU version
        y[i] = x[i] + y[i];
}
int main(int argc, char *argv[])
{
    ... initialization code omitted ....

    // =======================================================================
    // Run the "add" kernel on the GPU using 1 block, 1 thread per block
    add<<<1, 1>>>(N, x, y);          // Launch the add() kernel on the GPU

    // Wait for GPU to finish before accessing the result on the host
    cudaDeviceSynchronize();
    // =======================================================================

    ... print code omitted ...
}
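The initialization and print code are omitted above. A minimal compilable sketch of the whole program, assuming unified memory (cudaMallocManaged) so that the same x and y pointers can be used by both the CPU and the GPU (the actual add-vector1.cu may allocate and transfer data differently):

    // Hypothetical sketch -- not the course's actual add-vector1.cu
    #include <stdio.h>
    #include <stdlib.h>

    __global__
    void add(int n, float *x, float *y)
    {
        for (int i = 0; i < n; i++)          // one GPU thread adds all N elements
            y[i] = x[i] + y[i];
    }

    int main(int argc, char *argv[])
    {
        int    N = atoi(argv[1]);
        float *x, *y;

        cudaMallocManaged(&x, N * sizeof(float));   // unified memory: visible to CPU and GPU
        cudaMallocManaged(&y, N * sizeof(float));

        for (int i = 0; i < N; i++)          // initialize the vectors on the host
        {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }

        add<<<1, 1>>>(N, x, y);              // 1 block, 1 thread per block
        cudaDeviceSynchronize();             // wait for the GPU to finish

        printf("y[0] = %f  y[N-1] = %f\n", y[0], y[N-1]);

        cudaFree(x);
        cudaFree(y);
        return 0;
    }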
How the program code progresses in time:

   (timing diagram omitted: the single GPU thread adds the N elements one after another)
file:    /home/cs355001/demo/CUDA/3-add-vector/add-vector1.cu
Compile: nvcc -o add-vector1 add-vector1.cu
Sample runs:
   cs355@ghost01 (1457)> add-vector1 100000
   Elasped time = 18492 micro secs
   cs355@ghost01 (1458)> add-vector1 10000000
   Elasped time = 1604292 micro secs
Conclusion:

   Running the vector addition with a single GPU thread is much slower than the CPU
   version (1604292 vs. 41079 micro secs for N = 10000000): one GPU thread is far
   weaker than one CPU core. To get a speedup we must use many GPU threads.
int main(int argc, char *argv[])
{
    ... initialization code omitted ...

    // ==================================================================
    // Run the "add" kernel on the GPU using 1 block, K threads per block
    add<<<1, K>>>(N, x, y);          // Spawn K GPU threads

    // Wait for GPU to finish before accessing the result on the host
    cudaDeviceSynchronize();
    // ==================================================================

    ... print code omitted ...
}
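The kernel paired with this <<<1, K>>> launch is not shown in this excerpt. Since all K threads run in a single block, each thread presumably steps through the array with a stride of blockDim.x so that the K threads split the N additions among themselves. A sketch of such a kernel (an assumption based on the launch configuration, not necessarily the exact code in add-vector2.cu):

    __global__
    void add(int n, float *x, float *y)
    {
        int index  = threadIdx.x;            // this thread's position in the block: 0 .. K-1
        int stride = blockDim.x;             // K = number of threads in the block

        for (int i = index; i < n; i += stride)   // thread t handles elements t, t+K, t+2K, ...
            y[i] = x[i] + y[i];
    }

With K threads, each thread performs roughly N/K additions, which is consistent with the shrinking times in the sample runs below.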
Program file: /home/cs355001/demo/CUDA/3-add-vector/vector-add2.cu
Compile:      nvcc -o add-vector2 add-vector2.cu
Sample runs:
   cs355@ghost01 (1475)> add-vector2 10000000 1        (1 thread)
   Elasped time = 2106604 micro secs
   cs355@ghost01 (1479)> add-vector2 10000000 10       (10 threads !!)
   Elasped time = 328381 micro secs
   cs355@ghost01 (1480)> add-vector2 10000000 100
   Elasped time = 56360 micro secs
__global__
void add(int n, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;

    if ( i < n )                 // NO LOOP !!!
    {
        y[i] = x[i] + y[i];      // Work load for thread i !!!
    }
}
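Here blockIdx.x*blockDim.x + threadIdx.x gives every thread a unique global index, and the if ( i < n ) test discards the surplus threads in the last block when N is not a multiple of the block size. Because each thread now adds exactly one element, the launch must create at least N threads in total. A sketch of how the launch parameters could be computed (variable names are illustrative, not necessarily those used in add-vector3.cu):

    int K = atoi(argv[2]);                   // threads per block, chosen by the user
    int numBlocks = (N + K - 1) / K;         // round up so that numBlocks * K >= N

    add<<<numBlocks, K>>>(N, x, y);          // e.g. N = 10000000, K = 32 ---> 312500 blocks
    cudaDeviceSynchronize();

The sample runs below print exactly this block count for each choice of K.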
Program file: /home/cs355001/demo/CUDA/3-add-vector/vector-add3.cu
Compile:      nvcc -o add-vector3 add-vector3.cu
Sample runs:
   cs355@ghost01 (1475)> add-vector3 10000000 1        (1 thread/blk)
   User specified to use K=1 threads/block
   N = 10000000/1 = 10000000.000000 ---> we must use 10000000 blocks
   Elasped time = 92493 micro secs

   cs355@ghost01 (1751)> add-vector3 10000000 32       (32 threads/blk)
   User specified to use K=32 threads/block
   N = 10000000/32 = 312500.000000 ---> we must use 312500 blocks
   Elasped time = 21408 micro secs

   cs355@ghost01 (1752)> add-vector3 10000000 64
   User specified to use K=64 threads/block
   N = 10000000/64 = 156250.000000 ---> we must use 156250 blocks
   Elasped time = 20970 micro secs