|
|
|
In CUDA, we can assign each thread a 2-dimensional identifier (and even a 3-dimensional identifier!)
|
int threadRowID, threadColId; threadRowID = blockIdx.x * blockDim.x + threadIdx.x; threadColId = blockIdx.y * blockDim.y + threadIdx.y; |
Example:
__global__ void hello( )
{
    /* ----------------------------------------------------------
       Grid-wide 2 dim ID of this thread, computed per axis as
       (block index * block extent) + index inside the block.
       The x axis is reported as "Row", the y axis as "Col".
       ---------------------------------------------------------- */
    const int rowId = blockDim.x * blockIdx.x + threadIdx.x;
    const int colId = blockDim.y * blockIdx.y + threadIdx.y;

    /* ----------------------------------------------------------
       Show the block index, the thread index within the block,
       and the resulting grid-wide (Row, Col) pair
       ---------------------------------------------------------- */
    printf("Blk: (%d,%d) Thread: (%d,%d) -> Row/Col = (%d,%d)\n",
           blockIdx.x, blockIdx.y,
           threadIdx.x, threadIdx.y,
           rowId, colId);
}
/* ------------------------------------------------------------------
   Launches the hello kernel on a 3 x 2 grid of 2 x 3 thread blocks
   (36 threads in total), prints a greeting from the CPU, and waits
   for the GPU to finish.

   Returns 0 on success, 1 if the kernel launch or its execution
   reported a CUDA error.
   ------------------------------------------------------------------ */
int main()
{
    dim3 blockShape = dim3( 2, 3 );   // threads per block: 2 in x, 3 in y
    dim3 gridShape  = dim3( 3, 2 );   // blocks in the grid: 3 in x, 2 in y

    hello<<< gridShape, blockShape >>>( );  // Launch a 2 dim grid of threads

    // A kernel launch returns no status itself; a rejected launch
    // (bad configuration, no usable device, ...) is reported here.
    cudaError_t err = cudaGetLastError();
    if ( err != cudaSuccess )
    {
        fprintf(stderr, "hello kernel launch failed: %s\n",
                cudaGetErrorString(err));
        return 1;
    }

    // The launch does not block the CPU, so this line may be printed
    // before any of the kernel's output appears.
    printf("I am the CPU: Hello World ! \n");

    // Wait for the kernel to finish; errors raised while the kernel
    // was running are reported by this call.
    err = cudaDeviceSynchronize();
    if ( err != cudaSuccess )
    {
        fprintf(stderr, "hello kernel execution failed: %s\n",
                cudaGetErrorString(err));
        return 1;
    }

    return 0;
}
|
/home/cs355001/demo/CUDA/1-intro/hello-2dim-ID
Output:
I am the CPU: Hello World !
Blk: (2,0) Thread: (0,0) -> Row/Col = (4,0)
Blk: (2,0) Thread: (1,0) -> Row/Col = (5,0)
Blk: (2,0) Thread: (0,1) -> Row/Col = (4,1)
Blk: (2,0) Thread: (1,1) -> Row/Col = (5,1) // If you look carefully at
Blk: (2,0) Thread: (0,2) -> Row/Col = (4,2) // the last column, the
Blk: (2,0) Thread: (1,2) -> Row/Col = (5,2) // indexes span these ranges:
Blk: (0,0) Thread: (0,0) -> Row/Col = (0,0) //
Blk: (0,0) Thread: (1,0) -> Row/Col = (1,0) // (0,0) (0,1) ... (0,5)
Blk: (0,0) Thread: (0,1) -> Row/Col = (0,1) // (1,0) (1,1) ... (1,5)
Blk: (0,0) Thread: (1,1) -> Row/Col = (1,1) // (2,0) (2,1) ... (2,5)
Blk: (0,0) Thread: (0,2) -> Row/Col = (0,2) // ...
Blk: (0,0) Thread: (1,2) -> Row/Col = (1,2) // (5,0) (5,1) ... (5,5)
Blk: (2,1) Thread: (0,0) -> Row/Col = (4,3) //
Blk: (2,1) Thread: (1,0) -> Row/Col = (5,3) // So you can use these indexes
Blk: (2,1) Thread: (0,1) -> Row/Col = (4,4) // to access matrices !!!
Blk: (2,1) Thread: (1,1) -> Row/Col = (5,4)
Blk: (2,1) Thread: (0,2) -> Row/Col = (4,5)
Blk: (2,1) Thread: (1,2) -> Row/Col = (5,5)
Blk: (0,1) Thread: (0,0) -> Row/Col = (0,3)
Blk: (0,1) Thread: (1,0) -> Row/Col = (1,3)
Blk: (0,1) Thread: (0,1) -> Row/Col = (0,4)
Blk: (0,1) Thread: (1,1) -> Row/Col = (1,4)
Blk: (0,1) Thread: (0,2) -> Row/Col = (0,5)
Blk: (0,1) Thread: (1,2) -> Row/Col = (1,5)
Blk: (1,0) Thread: (0,0) -> Row/Col = (2,0)
Blk: (1,0) Thread: (1,0) -> Row/Col = (3,0)
Blk: (1,0) Thread: (0,1) -> Row/Col = (2,1)
Blk: (1,0) Thread: (1,1) -> Row/Col = (3,1)
Blk: (1,0) Thread: (0,2) -> Row/Col = (2,2)
Blk: (1,0) Thread: (1,2) -> Row/Col = (3,2)
Blk: (1,1) Thread: (0,0) -> Row/Col = (2,3)
Blk: (1,1) Thread: (1,0) -> Row/Col = (3,3)
Blk: (1,1) Thread: (0,1) -> Row/Col = (2,4)
Blk: (1,1) Thread: (1,1) -> Row/Col = (3,4)
Blk: (1,1) Thread: (0,2) -> Row/Col = (2,5)
Blk: (1,1) Thread: (1,2) -> Row/Col = (3,5)
|
|
|
I won't go into the details; it's similar to the 2-D example above.
/usr/local/cuda/samples/
|
/usr/local/cuda/samples/1_Utilities/deviceQuery/deviceQuery
Output on ghost01:
Detected 1 CUDA Capable device(s)
Device 0: "Quadro P1000"
CUDA Driver Version / Runtime Version 9.2 / 9.0
CUDA Capability Major/Minor version number: 6.1
Total amount of global memory: 4040 MBytes (4235919360 bytes)
( 5) Multiprocessors, (128) CUDA Cores/MP: 640 CUDA Cores
GPU Max Clock rate: 1481 MHz (1.48 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 1048576 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64) -- max block size
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) -- max grid size
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Supports Cooperative Kernel Launch: Yes
Supports MultiDevice Co-op Kernel Launch: Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice()
with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 9.2,
CUDA Runtime Version = 9.0, NumDevs = 1
Result = PASS
|