We will create N threads, one thread per array element.
The threads repeat the sorting steps until the variable isSorted == 1.
The even numbered threads run first: thread e compares A[e] with A[e+1], swaps them if they are out of order, and in that case sets isSorted = 0.
The odd numbered threads run next: thread o compares A[o] with A[o+1], swaps them if they are out of order, and in that case sets isSorted = 0.
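For reference, here is a minimal sequential sketch of the same algorithm in plain C (the function name and the swap details are our own illustration, not taken from the demo code):

void oddEvenSortSeq(int a[], int n)
{
   int isSorted = 0;

   while ( isSorted == 0 )
   {
      isSorted = 1;

      for (int i = 0; i < n-1; i += 2)     // Even phase
         if ( a[i] > a[i+1] )
         {  int t = a[i]; a[i] = a[i+1]; a[i+1] = t;  isSorted = 0;  }

      for (int i = 1; i < n-1; i += 2)     // Odd phase
         if ( a[i] > a[i+1] )
         {  int t = a[i]; a[i] = a[i+1]; a[i+1] = t;  isSorted = 0;  }
   }
}

The CUDA version below assigns one thread per array element and lets the "even" and "odd" threads perform the two for loops concurrently.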
The CPU code (1) allocates the array and (2) then launches the threads (using a <<< 1,N >>> grid):
int main(int argc, char *argv[])
{
   int N = atoi(argv[1]);   // Array size
}
|
N is specified by the user on the command line (hence the atoi(argv[1]) call).
We define a pointer variable to help us allocate the shared array:
int main(int argc, char *argv[])
{
   int N = atoi(argv[1]);   // Array size
   int *A;
}
|
(This is CUDA syntax, i.e., how CUDA provides a dynamic shared array to us)
Allocate the shared vector (as a 1-dim array):
int main(int argc, char *argv[])
{
   int N = atoi(argv[1]);   // Array size
   int *A;

   /* =======================================
      Allocate a shared array
      ======================================= */
   cudaMallocManaged(&A, N*sizeof(int));

   // initialize array A (code omitted)
}
|
(This is CUDA syntax, i.e., how CUDA provides a dynamic shared array to us)
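As an aside, cudaMallocManaged( ) returns a cudaError_t status code, so a more defensive version (our own addition, not in the demo code) would check it before using A:

cudaError_t rc = cudaMallocManaged(&A, N*sizeof(int));

if ( rc != cudaSuccess )
{
   fprintf(stderr, "cudaMallocManaged: %s\n", cudaGetErrorString(rc));
   exit(1);
}

(This assumes <stdio.h> and <stdlib.h> are included.)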
Launch N threads (one per array element) as a <<< 1,N >>> grid to perform the odd-even sort:
int main(int argc, char *argv[])
{
   int N = atoi(argv[1]);   // Array size
   int *A;

   /* =======================================
      Allocate a shared array
      ======================================= */
   cudaMallocManaged(&A, N*sizeof(int));

   // initialize array A (code omitted)

   oddEvenSort<<< 1, N >>>(A, N);   // Using > 1 block can result in an error !
                                    // The reason will be explained later...
}
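For completeness, the omitted pieces of main( ) might look like this (a sketch under our own assumptions; the actual demo may differ). In particular, the CPU must call cudaDeviceSynchronize( ) before it uses the sorted array:

// Initialize array A, e.g., in reverse order:
for (int i = 0; i < N; i++)
   A[i] = N - i;

oddEvenSort<<< 1, N >>>(A, N);

cudaDeviceSynchronize();   // Wait for the kernel to finish
// ... use the sorted array A ...
cudaFree(A);               // Release the managed memory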
|
We will write the kernel code for oddEvenSort( ) next...
First, make each thread compute its own unique ID i:
// SWAP is assumed to be a helper macro, e.g.:
#define SWAP(x,y) { int tmp = (x); (x) = (y); (y) = tmp; }

__global__ void oddEvenSort(int *a, int n)
{
   int isSorted;
   int i = blockIdx.x*blockDim.x + threadIdx.x;   // Unique ID

   isSorted = 0;

   while ( isSorted == 0 )
   {
      isSorted = 1;

      if ( i%2 == 0 && i < n-1 )   // Even phase
      {
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on

      if ( i%2 == 1 && i < n-1 )   // Odd phase
      {
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on
   }
}
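With a <<< 1, N >>> grid there is only one block, so blockIdx.x is always 0 and the formula reduces to i = threadIdx.x; the fuller formula is kept because it also works for multi-block grids. A small worked example (our own illustration):

// Grid <<< 1, 8 >>>:  blockIdx.x == 0, blockDim.x == 8
//    thread 0 computes i = 0*8 + 0 = 0
//    thread 5 computes i = 0*8 + 5 = 5    (i.e., i == threadIdx.x)
// Grid <<< 2, 4 >>>:  thread 1 of block 1 computes i = 1*4 + 1 = 5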
|
Repeat the sort step (compare and swap) until the array is sorted:
__global__ void oddEvenSort(int *a, int n)
{
   int isSorted;
   int i = blockIdx.x*blockDim.x + threadIdx.x;   // Unique ID

   isSorted = 0;

   while ( isSorted == 0 )   // Repeat the SORT step until sorted
   {
      isSorted = 1;

      if ( i%2 == 0 && i < n-1 )   // Even phase
      {
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on

      if ( i%2 == 1 && i < n-1 )   // Odd phase
      {
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on
   }
}
|
Each pass starts by assuming the array is sorted (isSorted is set back to 0 if we need to swap):
__global__ void oddEvenSort(int *a, int n)
{
   int isSorted;
   int i = blockIdx.x*blockDim.x + threadIdx.x;   // Unique ID

   isSorted = 0;

   while ( isSorted == 0 )
   {
      isSorted = 1;   // Assume the array is sorted

      if ( i%2 == 0 && i < n-1 )   // Even phase
      {
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;   // We had to swap: not sorted yet
         }
      }
      __syncthreads();   // All threads must finish before we move on

      if ( i%2 == 1 && i < n-1 )   // Odd phase
      {
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;   // We had to swap: not sorted yet
         }
      }
      __syncthreads();   // All threads must finish before we move on
   }
}
|
Even phase: allow only the "even" threads to do the compare and swap step:
__global__ void oddEvenSort(int *a, int n)
{
   int isSorted;
   int i = blockIdx.x*blockDim.x + threadIdx.x;   // Unique ID

   isSorted = 0;

   while ( isSorted == 0 )
   {
      isSorted = 1;

      if ( i%2 == 0 && i < n-1 )   // Even phase (test is exec'ed by ALL threads)
      {                            // ONLY an "even" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on

      if ( i%2 == 1 && i < n-1 )   // Odd phase
      {
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on
   }
}
|
Even phase: make sure that all threads are finished before moving on:
__global__ void oddEvenSort(int *a, int n)
{
   int isSorted;
   int i = blockIdx.x*blockDim.x + threadIdx.x;   // Unique ID

   isSorted = 0;

   while ( isSorted == 0 )
   {
      isSorted = 1;

      if ( i%2 == 0 && i < n-1 )   // Even phase (test is exec'ed by ALL threads)
      {                            // ONLY an "even" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on

      if ( i%2 == 1 && i < n-1 )   // Odd phase
      {
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on
   }
}
|
Odd phase: allow only the "odd" threads to do the compare and swap step:
__global__ void oddEvenSort(int *a, int n)
{
   int isSorted;
   int i = blockIdx.x*blockDim.x + threadIdx.x;   // Unique ID

   isSorted = 0;

   while ( isSorted == 0 )
   {
      isSorted = 1;

      if ( i%2 == 0 && i < n-1 )   // Even phase (test is exec'ed by ALL threads)
      {                            // ONLY an "even" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on

      if ( i%2 == 1 && i < n-1 )   // Odd phase (test is exec'ed by ALL threads)
      {                            // ONLY an "odd" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on
   }
}
|
Odd phase: make sure that all threads are finished before moving on:
__global__ void oddEvenSort(int *a, int n)
{
   int isSorted;
   int i = blockIdx.x*blockDim.x + threadIdx.x;   // Unique ID

   isSorted = 0;

   while ( isSorted == 0 )
   {
      isSorted = 1;

      if ( i%2 == 0 && i < n-1 )   // Even phase (test is exec'ed by ALL threads)
      {                            // ONLY an "even" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on

      if ( i%2 == 1 && i < n-1 )   // Odd phase (test is exec'ed by ALL threads)
      {                            // ONLY an "odd" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on
   }
}
|
Failed DEMO: /home/cs355001/demo/CUDA/7-sort/odd-even.cu
Local variables are "private" variables for a thread:
__global__ void oddEvenSort(int *a, int n)
{  // EACH thread has its own copy of the local variables
   int isSorted;
   int i = blockIdx.x*blockDim.x + threadIdx.x;   // Unique ID for each thread

   isSorted = 0;

   while ( isSorted == 0 )
   {
      isSorted = 1;

      if ( i%2 == 0 && i < n-1 )   // Even phase (test is exec'ed by ALL threads)
      {                            // ONLY an "even" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on

      if ( i%2 == 1 && i < n-1 )   // Odd phase (test is exec'ed by ALL threads)
      {                            // ONLY an "odd" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;
         }
      }
      __syncthreads();   // All threads must finish before we move on
   }
}
|
When thread i updates isSorted, the isSorted variables in the other threads are not changed:
__global__ void oddEvenSort(int *a, int n)
{  // EACH thread has its own copy of the local variables
   int isSorted;
   int i = blockIdx.x*blockDim.x + threadIdx.x;   // Unique ID

   isSorted = 0;

   while ( isSorted == 0 )
   {
      isSorted = 1;

      if ( i%2 == 0 && i < n-1 )   // Even phase (test is exec'ed by ALL threads)
      {                            // ONLY an "even" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;   // Thread i will ONLY update its own copy !!
         }
      }
      __syncthreads();   // All threads must finish before we move on

      if ( i%2 == 1 && i < n-1 )   // Odd phase (test is exec'ed by ALL threads)
      {                            // ONLY an "odd" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;   // Thread i will ONLY update its own copy !!
         }
      }
      __syncthreads();   // All threads must finish before we move on
   }
}
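Here is why the private copies break the algorithm (our own illustration): threads can disagree about when to stop, and a thread that exits the while loop stops participating in __syncthreads( ):

// Example: a[] = {2, 1, 3, 4} at the start of a pass
//    Thread 0 swaps a[0], a[1]  -> its isSorted = 0 -> loops again
//    Thread 2 swaps nothing     -> its isSorted = 1 -> exits the loop
// Thread 0 now waits at __syncthreads() for a thread that will never
// arrive: divergent __syncthreads() is undefined behavior (in practice
// the kernel often hangs or produces a wrongly sorted array)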
|
Recall that each streaming multiprocessor in the GPU has a shared memory:
Variables stored in the shared memory will be shared between all threads running on the same multi-processor (= in the same thread block)
The keyword __shared__ will allocate a variable in the shared memory:
__global__ void oddEvenSort(int *a, int n)
{  // ALL threads (in the same block) share the variable isSorted
   __shared__ int isSorted;
   int i = blockIdx.x*blockDim.x + threadIdx.x;   // Unique ID

   isSorted = 0;

   while ( isSorted == 0 )
   {
      __syncthreads();   // Every thread has tested isSorted before the reset
      isSorted = 1;
      __syncthreads();   // The reset completes before any thread can swap

      if ( i%2 == 0 && i < n-1 )   // Even phase (test is exec'ed by ALL threads)
      {                            // ONLY an "even" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;   // ALL threads will see this update !!
         }
      }
      __syncthreads();   // All threads must finish before we move on

      if ( i%2 == 1 && i < n-1 )   // Odd phase (test is exec'ed by ALL threads)
      {                            // ONLY an "odd" thread executes this if-statement
         if ( a[i] > a[i+1] )
         {
            SWAP(a[i], a[i+1]);
            isSorted = 0;   // ALL threads will see this update !!
         }
      }
      __syncthreads();   // All threads must finish before we move on
   }
}
|
DEMO: /home/cs355001/demo/CUDA/7-sort/odd-even.cu
|
DEMO: demo/CUDA/8-shared-vars/shared_var.cu
|
DEMO: demo/CUDA/8-shared-vars/shared_var2.cu
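We do not reproduce the demo files here, but a minimal sketch of the kind of experiment such a shared-variable demo might contain (our own reconstruction, not the actual demo code) is: every thread increments one __shared__ counter, and one thread prints it:

#include <stdio.h>

__global__ void sharedVarDemo(void)
{
   __shared__ int count;      // ONE copy per thread block (shared memory)

   if ( threadIdx.x == 0 )
      count = 0;              // One thread initializes the shared variable
   __syncthreads();

   atomicAdd(&count, 1);      // EVERY thread updates the SAME variable
   __syncthreads();

   if ( threadIdx.x == 0 )
      printf("count = %d\n", count);   // Prints 32 (the block size)
}

int main(void)
{
   sharedVarDemo<<< 1, 32 >>>();
   cudaDeviceSynchronize();
   return 0;
}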