|
|
Threads in the same block can be synchronized
(But we will not learn about synchronization in this short course on CUDA)
|
Threads in different thread blocks of a grid cannot be synchronized
|
<<< NBlocks, NThreads >>>
NBlocks = # blocks used
NThreads = # threads in each block |
Example:
<<< 1, 4 >>>: use 1 block, with 4 (parallel) threads in each block
<<< 3, 4 >>>: use 3 blocks, with 4 (parallel) threads in each block |
|
kernelFuncName <<< NBlocks, NThreads >>> ( params ) ;
Effect:
Launch the kernel function kernelFuncName using
NBlocks blocks of threads
NThreads threads in each block
|
Example:
hello <<< 2, 4 >>> ( ) ;
Launch (run) "hello( )" kernel using a <<< 2, 4 >>> CUDA grid of threads
|
|
|
Example program that shows that different threads are assigned different blockIdx.x and threadIdx.x values:
#include <stdio.h>
#include <unistd.h>
/*
 * hello: kernel in which every GPU thread prints one greeting line that
 * identifies the thread by its block index (blockIdx.x) and by its thread
 * index inside that block (threadIdx.x).
 *
 * Takes no parameters and returns nothing; its only effect is the printed line.
 * The text of the line matches the sample output listed for hello-thrIndex.
 */
__global__ void hello( )
{
    printf("I am in block #%d and thread #%d: Hello World !\n",
           blockIdx.x, threadIdx.x);
}
/*
 * Host entry point: launches the hello kernel on a grid of 2 blocks with
 * 4 threads each, prints a greeting from the CPU, then waits for the GPU.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution reported
 * a CUDA error (the error text is written to stderr).
 */
int main()
{
    hello<<< 2, 4 >>>( );

    // A kernel launch has no return value; a rejected launch is only
    // visible through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "hello kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // The launch is asynchronous, so this line can appear before the GPU output.
    printf("I am the CPU: Hello World ! \n");

    // Wait for the kernel to finish so its printf output is flushed
    // before the program exits.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "hello kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}
|
/home/cs355001/demo/CUDA/1-intro/hello-thrIndex
Output:
I am the CPU: Hello World !
I am in block #0 and thread #0: Hello World !
I am in block #0 and thread #1: Hello World !
I am in block #0 and thread #2: Hello World !
I am in block #0 and thread #3: Hello World !
I am in block #1 and thread #0: Hello World !
I am in block #1 and thread #1: Hello World !
I am in block #1 and thread #2: Hello World !
I am in block #1 and thread #3: Hello World !
|
|
Example program
/*
 * hello: kernel in which every GPU thread prints its position together with
 * the size of the dimension it is counted in:
 *   blockIdx.x  / gridDim.x   - index of this block / number of blocks in the grid
 *   threadIdx.x / blockDim.x  - index of this thread / number of threads per block
 *
 * Takes no parameters and returns nothing; its only effect is the printed line.
 */
__global__ void hello( )
{
    // "blocks" (plural) matches the sample output listed for hello-dim.
    printf("blockIdx.x=%d/%d blocks, threadIdx.x=%d/%d threads\n",
           blockIdx.x, gridDim.x,
           threadIdx.x, blockDim.x);
}
/*
 * Host entry point: launches the hello kernel with the <<< 2, 4 >>>
 * configuration (2 blocks, 4 threads per block), prints a greeting from the
 * CPU, then blocks until the GPU has finished.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution reported
 * a CUDA error (the error text is written to stderr).
 */
int main()
{
    cudaError_t status;

    hello<<< 2, 4 >>>( );

    // Launch errors (e.g. an invalid configuration) are not returned by the
    // launch itself; they must be fetched with cudaGetLastError().
    status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "hello kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Printed without waiting for the kernel: the launch above is asynchronous.
    printf("I am the CPU: Hello World ! \n");

    // Block until the kernel completes; this also flushes the kernel's
    // printf output and reports errors raised while it was running.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "hello kernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}
|
/home/cs355001/demo/CUDA/1-intro/hello-dim
Output:
blockIdx.x=0/2 blocks, threadIdx.x=0/4 threads -+
blockIdx.x=0/2 blocks, threadIdx.x=1/4 threads | block #0
blockIdx.x=0/2 blocks, threadIdx.x=2/4 threads |
blockIdx.x=0/2 blocks, threadIdx.x=3/4 threads -+
blockIdx.x=1/2 blocks, threadIdx.x=0/4 threads -+
blockIdx.x=1/2 blocks, threadIdx.x=1/4 threads | block #1
blockIdx.x=1/2 blocks, threadIdx.x=2/4 threads |
blockIdx.x=1/2 blocks, threadIdx.x=3/4 threads -+
|
|
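Since every block contains blockDim.x threads, a unique ID for each thread in the grid can be computed as: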
threadID = blockDim.x*blockIdx.x + threadIdx.x
|
/home/cs355001/demo/CUDA/1-intro/hello-thrIndex2
Output:
gridDim.x=2, blockIdx.x=#1, blockDim.x=4, threadIdx.x=#0 -> ID=4
gridDim.x=2, blockIdx.x=#1, blockDim.x=4, threadIdx.x=#1 -> ID=5
gridDim.x=2, blockIdx.x=#1, blockDim.x=4, threadIdx.x=#2 -> ID=6
gridDim.x=2, blockIdx.x=#1, blockDim.x=4, threadIdx.x=#3 -> ID=7
gridDim.x=2, blockIdx.x=#0, blockDim.x=4, threadIdx.x=#0 -> ID=0
gridDim.x=2, blockIdx.x=#0, blockDim.x=4, threadIdx.x=#1 -> ID=1
gridDim.x=2, blockIdx.x=#0, blockDim.x=4, threadIdx.x=#2 -> ID=2
gridDim.x=2, blockIdx.x=#0, blockDim.x=4, threadIdx.x=#3 -> ID=3
I am the CPU: Hello World !
|