#pragma omp parallel [Options...]
{ ...
... Parallel region
...
... Program statements between the braces
... are executed in parallel by all threads
...
}
|
#include <omp.h>     // OpenMP pragmas and runtime (original misspelled this "opm.h")
#include <iostream>  // for std::cout / std::endl (missing in the original)
using namespace std;

int main(int argc, char *argv[])
{
    // Every thread in the team executes the parallel block once,
    // so the greeting is printed once per thread.
    #pragma omp parallel
    {
        cout << "Hello World !!!" << endl;
    }
    return 0;
}
|
export OMP_NUM_THREADS=8
a.out
You will see "Hello World !!!" printed EIGHT times !!! (Remove the #pragma line and you get ONE line)....
|
Then the master thread T0 continues with the single-threaded execution of the code following the PARALLEL section.
|
#include <omp.h>
#include <iostream>  // for std::cout / std::endl (missing in the original)
using namespace std;

int main(int argc, char *argv[])
{
    int N; // Variable defined OUTSIDE parallel region....
           // It is therefore SHARED by all threads
    N = 1001;
    cout << "Before parallel section: N = " << N << endl;

    // Every thread performs an UNSYNCHRONIZED read-modify-write on the
    // shared N, so updates can be lost. This is a deliberate demonstration
    // of a data race — the final value may be anything up to 1001 + nthreads.
    #pragma omp parallel
    {
        N = N + 1;
        cout << "Inside parallel section: N = " << N << endl;
    }
    cout << "After parallel section: N = " << N << endl;
    return 0;
}
|
You should see that the value of N at the end is not always 1009 — it can be less. You have seen this phenomenon before in threaded programs, when multiple threads update a shared variable concurrently... The same thing is happening here.
#include <omp.h>
int main(int argc, char *argv[])
{
    #pragma omp parallel
    {
        // Declared INSIDE the parallel region, so every thread gets its
        // own private copy of N — it is NOT shared between threads.
        int N = 1001;
        N = N + 1;
    }
    // ERROR if you try to do this:
    // cout << "N = " << N << endl;
    // because N is not defined in the outer scope !!!
}
|
Because N is declared inside the parallel region, each thread gets its own private copy: updates made by one thread are invisible to the others, and no race occurs. The variable also ceases to exist when the region ends, which is why the commented-out print statement would be a compile error.
| Function Name | Effect |
|---|---|
| omp_set_num_threads(int nthread) | Set size of thread team |
| INTEGER omp_get_num_threads() | return size of thread team |
| INTEGER omp_get_max_threads() | return max size of thread team (typically equal to the number of processors) |
| INTEGER omp_get_thread_num() | return thread ID of the thread that calls this function |
| INTEGER omp_get_num_procs() | return number of processors |
| LOGICAL omp_in_parallel() | return TRUE if currently in a PARALLEL segment |
| omp_init_lock(omp_lock_t *lock) | Initialize the mutex lock "lock" |
| omp_set_lock(omp_lock_t *lock) | Lock the mutex lock "lock" |
| omp_unset_lock(omp_lock_t *lock) | Unlock the mutex lock "lock" |
| omp_test_lock(omp_lock_t *lock) | Try to set the mutex lock "lock" without blocking: returns true if the lock was successfully acquired, false otherwise |
#include <iostream>  // standard header (the pre-standard <iostream.h> is obsolete)
#include <omp.h>     // Read in OpenMP function prototypes
using namespace std;

int main(int argc, char *argv[])
{
    int nthreads, myid;

    // nthreads and myid are declared OUTSIDE the region, so they must be
    // listed in private(...) — otherwise all threads would race on them.
    #pragma omp parallel private (nthreads, myid)
    {
        /* Every thread does this */
        myid = omp_get_thread_num();
        cout << "Hello I am thread " << myid << endl;

        /* Only thread 0 does this */
        if (myid == 0)
        {
            nthreads = omp_get_num_threads();
            cout << "Number of threads = " << nthreads << endl;
        }
    }
    return 0;
}
|
/* Shared Variables */
// Outline (pseudocode) of the parallel min-search program; the "..." body
// inside the parallel region is filled in by the complete version below.
double x[1000000]; // Must be SHARED (accessed by worker threads !!)
int start[100]; // Contain starting array index of each thread
double min[100]; // Contain the minimum found by each thread
int num_threads; // team size; NOTE(review): never assigned in this fragment
int main(...) // NOTE(review): pseudocode — i, my_min and MAX are not declared here
{
for (i = 0; i < MAX; i++)
x[i] = random()/(double)1147483648; // NOTE(review): likely a typo for 2147483648 (2^31, random()'s range)
// ---------------------------------------------------
// Tell each thread where to start searching for min
// ----------------------------------------------
for (i = 0; i < num_threads; i = i + 1)
start[i] = i;
#pragma omp parallel
{
... Thread i finds its minimum and
... store the result in min[i]
}
// ----------------------------------------
// Post processing: Find actual minimum
// ----------------------------------------
my_min = min[0];
for (i = 1; i < num_threads; i++)
if ( min[i] < my_min )
my_min = min[i];
}
|
// Complete parallel min-search: each thread scans its own contiguous slice
// of x[] and records its local minimum in min[id]; the master thread then
// reduces the per-thread minima after the parallel region.
#include <stdlib.h> // random()
#include <omp.h>    // omp_get_num_threads(), omp_get_thread_num()

#define MAX 1000000 // number of elements in x[] (matches the array size)

/* Shared Variables */
double x[1000000]; // Must be SHARED (accessed by worker threads !!)
int start[100];    // Contain starting array index of each thread (unused here)
double min[100];   // Contain the minimum found by each thread
int num_threads;   // team size, published by thread 0 for the post-processing loop

int main(int argc, char *argv[])
{
    int i;
    double my_min;

    // Fill x[] with random values in [0, 1).
    // random() returns a long in [0, 2^31 - 1], so divide by 2^31 = 2147483648
    // (the original divided by 1147483648, a typo).
    for (i = 0; i < MAX; i++)
        x[i] = random()/(double)2147483648;

    #pragma omp parallel
    {
        int id;            // this thread's ID: 0, 1, ..., nthreads-1
        int j, n, lo, hi;  // slice length and [lo, hi) bounds for this thread
        int nthreads;      // team size — only reliable INSIDE the region
        double local_min;

        nthreads = omp_get_num_threads();
        n = MAX/nthreads;  // slice length; last thread also takes the remainder
        id = omp_get_thread_num();
        lo = id * n;
        hi = (id == nthreads - 1) ? MAX : lo + n;

        local_min = x[lo];
        for (j = lo + 1; j < hi; j++)
        {
            if (x[j] < local_min)
                local_min = x[j];
        }
        min[id] = local_min; // Store result in min[id]

        // Publish the team size for the sequential post-processing loop.
        // (In the original, num_threads was read without ever being set.)
        if (id == 0)
            num_threads = nthreads;
    }

    // ----------------------------------------
    // Post processing: Find actual minimum
    // ----------------------------------------
    my_min = min[0];
    for (i = 1; i < num_threads; i++)
        if (min[i] < my_min)
            my_min = min[i];

    return 0;
}
|
Compile with:
Run with (on compute):
export OMP_NUM_THREADS=8
a.out
#pragma omp critical
{
...
... Update shared variables
...
}
|
double f(double a)
{
return( 2.0 / sqrt(1 - a*a) );
}
// Serial midpoint-rule approximation of pi: sums f at the midpoint of each
// of N subintervals of (0, 1). NOTE: "N = ...;" is tutorial pseudocode.
int main(int argc, char *argv[])
{
int i;
int N;
double sum;
double x, w;
N = ...; // accuracy of the approximation
w = 1.0/N; // width of each subinterval
sum = 0.0;
for (i = 1; i <= N; i = i + 1)
{
x = w*(i - 0.5); // midpoint of subinterval i
sum = sum + w*f(x); // rectangle area: width * height
}
cout << sum; // approaches pi as N grows
}
|
Compile with:
Run the program with:
(We have seen this program before, so I will not explain it again: click here )
When we parallelize, it is important to know which UPDATES must be SYNCHRONIZED:
double f(double a)
{
return( 2.0 / sqrt(1 - a*a) );
}
// Parallel pi: thread id handles iterations id, id+nthreads, id+2*nthreads, ...
// The shared accumulator "sum" is protected by a critical section on EVERY
// iteration, which serializes the updates (slow — improved in the next version).
// NOTE: "N = ...;" is tutorial pseudocode.
int main(int argc, char *argv[])
{
int N;
double sum; // Shared variable, updated !
double x, w; // NOTE(review): this outer x is shadowed inside the region and unused
N = ...; // accuracy of the approximation
w = 1.0/N;
sum = 0.0;
#pragma omp parallel
{
int i, num_threads; // Non-shared variables !!!
double x;
num_threads = omp_get_num_threads() ;
// cyclic (round-robin) distribution of the iterations over the threads
for (i = omp_get_thread_num(); i < N; i = i + num_threads)
{
x = w*(i + 0.5);
#pragma omp critical // only one thread at a time may update sum
{
sum = sum + w*f(x);
}
}
}
cout << sum;
}
|
export OMP_NUM_THREADS=8
a.out 50000000
Change OMP_NUM_THREADS and see the difference in performance
double f(double a)
{
return( 2.0 / sqrt(1 - a*a) );
}
// Improved parallel pi: each thread accumulates into its PRIVATE mypi and
// enters the critical section only ONCE at the end, instead of once per
// iteration — far less synchronization overhead than the previous version.
// NOTE: "N = ...;" is tutorial pseudocode.
int main(int argc, char *argv[])
{
int N;
double sum; // Shared variable, updated !
double x, w; // NOTE(review): this outer x is shadowed inside the region and unused
N = ...; // accuracy of the approximation
w = 1.0/N;
sum = 0.0;
#pragma omp parallel
{
int i, num_threads;
double x;
double mypi; // Private variable to reduce synchronization
num_threads = omp_get_num_threads() ;
mypi = 0.0;
// cyclic distribution: thread id takes iterations id, id+num_threads, ...
for (i = omp_get_thread_num(); i < N; i = i + num_threads)
{
x = w*(i + 0.5);
mypi = mypi + w*f(x); // no locking needed: mypi is private
}
#pragma omp critical // one combine per thread, not one per iteration
{
sum = sum + mypi;
}
}
cout << sum;
}
|
export OMP_NUM_THREADS=8
a.out 50000000
Change OMP_NUM_THREADS and see the difference in performance
It is a very simple and mechanical process to divide up the work over a number of threads simply by scheduling a different thread to work on the for-body using a different index value.
The division of labor — splitting the work of a for-loop across threads — can be done in OpenMP through a special Parallel LOOP construct.
#pragma omp parallel
{
....
#pragma omp for [parameters]
for-statement // Parallel Loop
....
}
|
Each iteration of the for-loop is executed exactly once, by one of the threads (the iterations are divided among the members of the thread team).
The loop variable used in the Parallel LOOP construct is by default PRIVATE (other variables are still by default SHARED)
double f(double a)
{
return( 2.0 / sqrt(1 - a*a) );
}
// Parallel pi using the "omp for" worksharing construct: the compiler/runtime
// divides the loop iterations among the threads automatically, so the code
// no longer hand-computes a cyclic index distribution.
// NOTE: "N = ...;" is tutorial pseudocode.
int main(int argc, char *argv[])
{
int N;
double sum; // Shared variable, updated !
double x, w; // NOTE(review): this outer x is shadowed inside the region and unused
N = ...; // accuracy of the approximation
w = 1.0/N;
sum = 0.0;
#pragma omp parallel
{
int i;
double mypi, x; // private per-thread accumulator and sample point
mypi = 0.0;
#pragma omp for
for (i = 0; i < N; i = i + 1)
{
x = w*(i + 0.5); // Save us the trouble of dividing
mypi = mypi + w*f(x); // the work up...
}
#pragma omp critical // combine each thread's partial sum exactly once
{
sum = sum + mypi;
}
}
cout << sum;
}
|
The C/C++ compiler will insert instructions that distribute the execution of each iteration of the for-loop to some thread — it is no longer your problem to "skip" index counts to accomplish load distribution!
export OMP_NUM_THREADS=8
a.out 50000000
Change OMP_NUM_THREADS and see the difference in performance
setenv STACKSIZE nBytes |