#include <omp.h>
#pragma omp parallel [Options...]
{ ...
... Parallel region
...
... Program statements between the braces
... are executed in parallel by all threads
...
}
Set the number of threads with the OMP_NUM_THREADS environment variable. Example:

export OMP_NUM_THREADS=8
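Besides the environment variable, the team size can also be set from inside the program with omp_set_num_threads() (listed in the OpenMP function table later in these notes). A minimal sketch, assuming the call is made before the parallel region:

#include <omp.h>
#include <iostream>
using namespace std;

int main(int argc, char *argv[])
{
   omp_set_num_threads(4);    // Request a team of 4 threads (overrides OMP_NUM_THREADS)

   #pragma omp parallel
   {
      cout << "Hello from one of the 4 threads" << endl;
   }
   return 0;
}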
#include <omp.h>
#include <iostream>
using namespace std;
int main(int argc, char *argv[])
{
#pragma omp parallel
{
cout << "Hello World !!!" << endl;
}
}
export OMP_NUM_THREADS=8
a.out
You will see "Hello World !!!" printed EIGHT times !!! (Remove the #pragma line and you get just ONE line.)
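These programs must be compiled with an OpenMP-capable compiler; the exact command depends on the compiler installed on your system. With g++, for example, the OpenMP flag is -fopenmp (hello.cc is just a placeholder name for the source file above):

g++ -fopenmp hello.cc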
#include <omp.h>
#include <iostream>
using namespace std;
int main(int argc, char *argv[])
{
int N; // Variable defined OUTSIDE parallel region....
// It is therefore SHARED
N = 1001;
cout << "Before parallel section: N = " << N << endl;
#pragma omp parallel
{
N = N + 1;
cout << "Inside parallel section: N = " << N << endl;
}
cout << "After parallel section: N = " << N << endl;
}
You should see that the value of N at the end is not always 1009; it can be less. You have seen this phenomenon before in threaded programs: when multiple threads update a shared variable concurrently, some updates can be lost. The same thing is happening here.
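As shown later in these notes, OpenMP provides the critical construct for exactly this situation. A minimal sketch of the fix (only the parallel region changes):

#pragma omp parallel
{
   #pragma omp critical
   {
      N = N + 1;          // Only one thread at a time executes this update
   }
}
// With 8 threads, N is now always 1009 after the parallel section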
#include <omp.h>
int main(int argc, char *argv[])
{
#pragma omp parallel
{
int N; // Variable defined INSIDE parallel region....
// It is therefore NON-SHARED
N = 1001;
N = N + 1;
}
// ERROR if you try to do this:
// cout << "N = " << N << endl;
// because N is not defined in the outer scope !!!
}
Each thread's (non-shared) copy of N always ends up with the value 1002: because every thread works on its own copy, the threads cannot interfere with one another.
#include <omp.h>
#include <iostream>
using namespace std;
int main(int argc, char *argv[])
{
int N; // Line XXX
N = 1001;
cout << "Before parallel section: N = " << N << endl;
#pragma omp parallel private(N)
{ // Define a local variable N !
N = N + 1; // This N is different from the N at line XXX !!!
cout << "Inside parallel section: N = " << N << endl;
}
cout << "After parallel section: N = " << N << endl;
}
You will typically see the value of N inside the parallel section printed as 1: the private copy of N is NOT initialized from the outer N (its initial value is formally undefined, and in practice it usually starts out as 0), so N = N + 1 yields 1.
The variable N outside the parallel section remains 1001.
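Note that private(N) does NOT copy the value 1001 into the threads' private copies. OpenMP has a separate clause, firstprivate, that does initialize each private copy from the value the variable had before the parallel region. It is not used elsewhere in these notes; a minimal sketch:

#pragma omp parallel firstprivate(N)
{                          // Each thread gets its own N, initialized to 1001
   N = N + 1;              // Every thread's private N becomes 1002
   cout << "Inside parallel section: N = " << N << endl;
}
// The N outside the parallel section still remains 1001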
| Function Name | Effect |
|---|---|
| omp_set_num_threads(int nthread) | Set size of thread team |
| INTEGER omp_get_num_threads() | return size of thread team |
| INTEGER omp_get_max_threads() | return max size of thread team (typically equal to the number of processors) |
| INTEGER omp_get_thread_num() | return thread ID of the thread that calls this function |
| INTEGER omp_get_num_procs() | return number of processors |
| LOGICAL omp_in_parallel() | return TRUE if currently in a PARALLEL segment |
| omp_init_lock(omp_lock_t *lock) | Initialize the mutex lock "lock" |
| omp_set_lock(omp_lock_t *lock) | Lock the mutex lock "lock" |
| omp_unset_lock(omp_lock_t *lock) | Unlock the mutex lock "lock" |
| omp_test_lock(omp_lock_t *lock) | Return true if the mutex lock "lock" is locked, returns false otherwise |
NOTE: We will study other synchronization primitives and will not discuss the omp_*_lock() functions further.
#include <iostream>
using namespace std;
#include <omp.h> // Read in OpenMP function prototypes
int main(int argc, char *argv[])
{
int nthreads, myid;
#pragma omp parallel private (nthreads, myid)
{
/* Every thread does this */
myid = omp_get_thread_num();
cout << "Hello I am thread " << myid << endl;
/* Only thread 0 does this */
if (myid == 0)
{
nthreads = omp_get_num_threads();
cout << "Number of threads = " << nthreads << endl;
}
}
return 0;
}
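The remaining query functions in the table can be exercised the same way. A minimal sketch (the values printed depend on your machine and on OMP_NUM_THREADS):

#include <omp.h>
#include <iostream>
using namespace std;

int main(int argc, char *argv[])
{
   cout << "Number of processors : " << omp_get_num_procs()   << endl;
   cout << "Max team size        : " << omp_get_max_threads() << endl;
   cout << "In parallel region ? : " << omp_in_parallel()     << endl;   // prints 0 (false)

   #pragma omp parallel
   {
      if (omp_get_thread_num() == 0)
         cout << "In parallel region ? : " << omp_in_parallel() << endl; // prints 1 (true)
   }
   return 0;
}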
/* Shared Variables */
double x[1000000]; // Must be SHARED (accessed by worker threads !!)
int start[100]; // Contain starting array index of each thread
double min[100]; // Contain the minimum found by each thread
int num_threads;
int main(...)
{
for (i = 0; i < MAX; i++)
x[i] = random()/(double)2147483648;      // random value in [0,1)
// ---------------------------- Start parallel -----
#pragma omp parallel
{
... Thread i finds its minimum and
... stores the result in min[i]
}
// ---------------------------- End parallel -----
// ----------------------------------------
// Post processing: Find actual minimum
// ----------------------------------------
my_min = min[0];
for (i = 1; i < num_threads; i++)
if ( min[i] < my_min )
my_min = min[i];
}
(For simplicity of discussion, I used 2 threads)

  start[0]                  start[1]
     |                         |
     V  values handled by      V  values handled by
        thread 0                  thread 1
     |<--------------------->|<--------------------->|
#define MAX 1000000
/* Shared Variables */
double x[MAX]; // Must be SHARED (accessed by worker threads !!)
int start[100]; // Contain starting array index of each thread
double min[100]; // Contain the minimum found by each thread
int num_threads;
int main(...)
{
int i;
double my_min;

for (i = 0; i < MAX; i++)
   x[i] = random()/(double)2147483648;   // random value in [0,1)
// ---------------------------- Start parallel -----
#pragma omp parallel
{
int id;
int i, n, start, stop;
double my_min;
num_threads = omp_get_num_threads(); // Record the team size in the shared variable
                                     // (every thread writes the same value)
n = MAX/num_threads;                 // Number of elements per thread
id = omp_get_thread_num();           // id is one of 0, 1, ..., (num_threads-1)
/* ----------------------------
Find the starting index
---------------------------- */
start = id * n;
/* ----------------------------
Find the stopping index
---------------------------- */
if ( id != (num_threads-1) )
{
stop = start + n;
}
else
{
stop = MAX;
}
/* ------------------------------------------
Find the min between x[start] and x[stop]
------------------------------------------ */
my_min = x[start];
for (i = start+1; i < stop; i++ )
{
if ( x[i] < my_min )
my_min = x[i];
}
/* ----------------------------
Save result in shared area
---------------------------- */
min[id] = my_min; // Store result in min[id]
}
// ---------------------------- End parallel -----
// ----------------------------------------
// Post processing: Find actual minimum
// ----------------------------------------
my_min = min[0];
for (i = 1; i < num_threads; i++)
if ( min[i] < my_min )
my_min = min[i];
}
Compile with:
Run with (on compute):
export OMP_NUM_THREADS=8
a.out
I will limit the discussion to the most commonly used one: mutual exclusion.
#pragma omp critical
{
...
... Mutual exclusive access to
... shared variables
...
}
int N; // Global - shared by all threads
int main(...)
{
....
/* -------------------
Parallel section
------------------- */
#pragma omp parallel
{
....
/* ---------------------------------------
Section with mutually exclusive access
--------------------------------------- */
#pragma omp critical
{
N = N + 1;
}
....
}
...
}
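For a single simple update such as N = N + 1, OpenMP also provides a lighter-weight alternative, the atomic construct. A minimal sketch:

#pragma omp parallel
{
   ....
   #pragma omp atomic
   N = N + 1;              // The update of N is performed atomically (no braces needed)
   ....
}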
#include <iostream>
#include <cmath>
using namespace std;

// f(x) = 2/sqrt(1 - x^2); its integral over [0,1] is 2*arcsin(1) = pi
double f(double a)
{
   return( 2.0 / sqrt(1 - a*a) );
}
int main(int argc, char *argv[])
{
int i;
int N;
double sum;
double x, w;
N = ...; // accuracy of the approximation
w = 1.0/N;
sum = 0.0;
for (i = 1; i <= N; i = i + 1)
{
x = w*(i - 0.5);
sum = sum + w*f(x);
}
cout << sum;
}
Compile with:
Run the program with:
values handled by thread 0
 |   |   |   |   |   |   |
 V   V   V   V   V   V   V
|-|-|-|-|-|-|-|-|-|-|-|-|-|
   ^   ^   ^   ^   ^   ^   ^
   |   |   |   |   |   |   |
      values handled by thread 1
#include <omp.h>
#include <iostream>
#include <cmath>
using namespace std;

double f(double a)
{
return( 2.0 / sqrt(1 - a*a) );
}
int main(int argc, char *argv[])
{
int N;
double w;
double sum; // Shared variable, updated !
N = ...; // accuracy of the approximation
w = 1.0/N;
sum = 0.0;
#pragma omp parallel
{
int i, num_threads; // Non-shared variables !!!
double x;
num_threads = omp_get_num_threads() ;
for (i = omp_get_thread_num(); i < N; i = i + num_threads)
{
x = w*(i + 0.5);
#pragma omp critical
{
sum = sum + w*f(x);
}
}
}
cout << sum;
}
Change OMP_NUM_THREADS and see the difference in performance. Because every iteration enters the critical section, the threads spend most of their time waiting for one another, so the speedup is poor.
#include <omp.h>
#include <iostream>
#include <cmath>
using namespace std;

double f(double a)
{
return( 2.0 / sqrt(1 - a*a) );
}
int main(int argc, char *argv[])
{
int N;
double sum; // Shared variable !
double w;
N = ...; // accuracy of the approximation
w = 1.0/N;
sum = 0.0;
#pragma omp parallel
{
int i, num_threads;
double x;
double mypi; // Private variable to reduce synchronization
num_threads = omp_get_num_threads() ;
mypi = 0.0;
for (i = omp_get_thread_num(); i < N; i = i + num_threads)
{
x = w*(i + 0.5);
mypi = mypi + w*f(x); // No synchronization needed !
}
#pragma omp critical
{
sum = sum + mypi; // Synchronize outside loop !
}
}
cout << sum;
}
Change OMP_NUM_THREADS and see the difference in performance. Each thread now accumulates into its own mypi and enters the critical section only once, so this version scales much better than the previous one.
The division of labor (splitting the iterations of a for-loop among the threads) can be done automatically in OpenMP through the PARALLEL LOOP construct.
#pragma omp for [parameters]
for-statement // Parallel Loop
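The #pragma omp for directive must appear inside an existing parallel region; it divides the iterations of the loop that follows it among the threads of the team. OpenMP also accepts the combined shorthand #pragma omp parallel for, which creates the team and splits the loop with one directive. A minimal sketch:

int i;

#pragma omp parallel for
for (i = 0; i < N; i = i + 1)
{
   ...                     // The iterations are divided among the threads
}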
#include <omp.h>
#include <iostream>
#include <cmath>
using namespace std;

double f(double a)
{
return( 2.0 / sqrt(1 - a*a) );
}
int main(int argc, char *argv[])
{
int N;
double sum; // Shared variable, updated !
double w;
N = ...; // accuracy of the approximation
w = 1.0/N;
sum = 0.0;
#pragma omp parallel
{
int i;
double x;
double mypi; // Non-shared within the parallel section
mypi = 0.0;
/* --------------------------
PARALLEL FOR construct
-------------------------- */
#pragma omp for
for (i = 0; i < N; i = i + 1)
{
x = w*(i + 0.5); // Save us the trouble of dividing
mypi = mypi + w*f(x); // the work up...
}
#pragma omp critical
{
sum = sum + mypi;
}
}
cout << sum;
}
export OMP_NUM_THREADS=8
a.out 50000000
Change OMP_NUM_THREADS and see the difference in performance
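The pattern "per-thread partial sum + one critical update" is so common that OpenMP provides a reduction clause that does it automatically. It is not used in the examples above; a minimal sketch of the pi program written with the combined parallel for directive and reduction(+:sum), with N hard-coded for brevity:

#include <omp.h>
#include <iostream>
#include <cmath>
using namespace std;

double f(double a)
{
   return( 2.0 / sqrt(1 - a*a) );
}

int main(int argc, char *argv[])
{
   int    i;
   int    N   = 50000000;          // accuracy of the approximation
   double w   = 1.0/N;
   double sum = 0.0;

   #pragma omp parallel for reduction(+:sum)
   for (i = 0; i < N; i = i + 1)
   {
      double x = w*(i + 0.5);
      sum = sum + w*f(x);          // Each thread accumulates its own private copy of sum;
   }                               // the copies are added together when the loop ends
   cout << sum << endl;
   return 0;
}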
To increase the per-thread stack size (useful when threads need large private data), set the stack size environment variable, e.g. (csh syntax): setenv STACKSIZE nBytes. The variable name is implementation-specific; newer OpenMP implementations use the standard variable OMP_STACKSIZE.