Syntax: __managed__ C-global-var-definition;
When a variable is defined this way at global (file) scope, the same variable can be read and written by both CPU and GPU functions:
|
// Managed global variable: the same x is written and read by the host code in
// main() and by the device code in GPU_func() without any explicit copy.
__managed__ int x; // Accessible by ALL CPU and GPU functions !!!
// Kernel: prints the value of the managed global x as seen from the device,
// then overwrites it so the host can observe the change after it synchronizes.
__global__ void GPU_func( )
{
    const int seen = x;                   // value stored by the host before the launch
    printf("GPU sees x = %d\n", seen);
    x = 4444;                             // device-side update of the managed global
}
// Host side of the managed-global demo: stores a value in x, lets the kernel
// print and overwrite it, then prints the value the kernel left behind.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main()
{
    x = 1234;                             // host writes the managed global

    GPU_func<<< 1, 1 >>>( );              // Start GPU function (asynchronous)

    // A launch itself returns nothing: configuration errors are reported by
    // cudaGetLastError(), execution errors by the next synchronizing call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();    // Wait until GPU kernel function finishes !!
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;                         // do not print x as if it were a result
    }

    printf("CPU sees x = %d\n", x);       // Now obtain the result !!
    return 0;
}
|
Example: a managed global array shared by the CPU and the GPU:
|
// Managed global array of 10 ints: main() initializes it on the host and
// GPU_func() prints and updates the same elements on the device.
__managed__ int x[10]; // Defines global shared array variable !!!
// Kernel: prints the 10 elements of the managed global array x on one line
// and adds each element's index to it (x[i] becomes x[i] + i).
__global__ void GPU_func( )
{
    printf("++ GPU sees x: ");

    int idx = 0;
    while (idx < 10)
    {
        printf("%d ", x[idx]);            // show the value the host stored
        x[idx] += idx;                    // GPU updates x[idx]
        ++idx;
    }

    printf("\n");
}
// Host side of the managed-array demo: fills x[ ] with 1000..1009, lets the
// kernel print and update it, then prints the updated values.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main()
{
    for (int i = 0; i < 10; i++ )         // CPU initializes x[ ]
        x[i] = 1000+i;

    GPU_func<<< 1, 1 >>>( );              // GPU uses CPU's values and updates x[ ]

    // Launch errors surface through cudaGetLastError(); errors raised while
    // the kernel runs surface when the host synchronizes.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();    // CPU WAITS until GPU finishes !!!
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;                         // x[ ] was not reliably updated
    }

    printf("** CPU sees x: ");            // CPU then accesses GPU's updated values
    for (int i = 0; i < 10; i++ )
    {
        printf("%d ", x[i]);
    }
    printf("\n");
    return 0;
}
|
/home/cs355001/demo/CUDA/2-unified-mem/shared-global-array
Output:
++ GPU sees x: 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009
** CPU sees x: 1000 1002 1004 1006 1008 1010 1012 1014 1016 1018
|
|
The following construct is illegal:
// Deliberately invalid example - do not "fix" it.
// __managed__ is applied to a statement inside the body of main(); nvcc
// rejects that line with "error: expected a declaration".
int main()
{
int a;
__managed__ a = 1234; // Illegal: cannot define "managed" local variable
return 0;
}
|
/home/cs355001/demo/CUDA/2-unified-mem/shared-local-fail.cu
nvcc -g -o shared-local-fail.o -c shared-local-fail.cu
shared-local-fail.cu(13): error: expected a declaration
Line 13 is:
__managed__ a = 1234; // Illegal: cannot define "managed" local variable
|
// Alternative for use inside a function: the local variable is a pointer, and
// cudaMallocManaged( ) is given its address so it can point the variable at
// storage for one int.
int *a; // You must use a pointer variable
cudaMallocManaged( &a, sizeof( int ) ); // Create a shared local int variable
|
Example:
// Kernel: prints the int that x points to, then overwrites it with 4444.
//   x - address of the int to report and update; it is dereferenced in
//       device code, so it must be valid there.
__global__ void GPU_func( int *x )
{
    const int seen = *x;                  // value the caller stored before the launch
    printf("GPU sees *x = %d\n", seen);
    *x = 4444;                            // Update shared local variable
}
// Host side of the managed-allocation demo: allocates one managed int, stores
// 1234 in it, lets the kernel print and overwrite it, prints the new value,
// and releases the allocation.
// Returns 0 on success, 1 if the allocation, launch or kernel execution failed.
int main()
{
    int *a = nullptr;                     // Must use a pointer variable

    // Create a shared managed variable and make a point to it. The result is
    // checked because *a must not be touched if the allocation failed.
    cudaError_t err = cudaMallocManaged(&a, sizeof(int) );
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    *a = 1234;                            // Assign *a = 1234;
    GPU_func<<< 1, 1 >>>( a );            // Pass address of the variable to GPU_func( )

    // Launch errors come from cudaGetLastError(); execution errors from the
    // synchronizing call that waits for the kernel.
    err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();    // Wait until GPU is done
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        cudaFree(a);                      // best effort; the original error is what is reported
        return 1;
    }

    printf("CPU sees *a = %d\n", *a);

    cudaFree(a);                          // release the managed allocation
    return 0;
}
|
/home/cs355001/demo/CUDA/2-unified-mem/shared-local
Output:
GPU sees *x = 1234
CPU sees *a = 4444
|
Example:
__global__ void GPU_func( int *a ) ;      // Declare the GPU function (defined after main)

// Host side of the managed-array allocation demo: allocates 10 managed ints,
// sets each to 1000, lets the kernel print and update them, prints the
// updated values, and releases the allocation.
// Returns 0 on success, 1 if the allocation, launch or kernel execution failed.
int main()
{
    int *a = nullptr;                     // a is a local variable

    // a -> shared 10 int var (= array). The result is checked because a[ ]
    // must not be touched if the allocation failed.
    cudaError_t err = cudaMallocManaged(&a, 10*sizeof(int) );
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    for (int i = 0; i < 10; i++ )
        a[i] = 1000;

    GPU_func<<< 1, 1 >>>( a );            // Must pass a to GPU_function to "share"

    // Wait for the kernel to finish before the CPU accesses a[ ] again;
    // launch and execution errors are collected on the way.
    err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        cudaFree(a);                      // best effort; the original error is what is reported
        return 1;
    }

    printf("** CPU sees a: ");
    for (int i = 0; i < 10; i++ )
    {
        printf("%d ", a[i]);
    }
    printf("\n");

    cudaFree(a);                          // release the managed allocation
    return 0;
}
// Kernel: prints the first 10 ints that a points to on one line and adds each
// element's index to it (a[i] becomes a[i] + i).
//   a - address of at least 10 ints; it is dereferenced in device code, so it
//       must be valid there.
__global__ void GPU_func( int *a )
{
    printf("++ GPU sees a: ");

    int idx = 0;
    while (idx < 10)
    {
        printf("%d ", a[idx]);            // show the value the caller stored
        a[idx] += idx;                    // GPU_Func updates a[ ]
        ++idx;
    }

    printf("\n");
}
|
/home/cs355001/demo/CUDA/2-unified-mem/shared-local-array
Output:
++ GPU sees a: 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000
** CPU sees a: 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009
|