Actual source code: mpishm.c

  1: #include <petscsys.h>
  2: #include <petsc/private/petscimpl.h>

  4: struct _n_PetscShmComm {
  5:   PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
  6:   PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  7:   MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
  8: };

 10: /*
 11:    Private routine to delete internal shared memory communicator when a communicator is freed.

 13:    This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this  data as an attribute is freed.

 15:    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

 17: */
 18: PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
 19: {
 20:   PetscShmComm p = (PetscShmComm)val;

 22:   PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm);
 23:   MPI_Comm_free(&p->shmcomm);
 24:   PetscFree(p->globranks);
 25:   PetscFree(val);
 26:   return MPI_SUCCESS;
 27: }

 29: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 30:   /* Data structures to support freeing comms created in PetscShmCommGet().
 31:   Since we predict communicators passed to PetscShmCommGet() are very likely
 32:   either a petsc inner communicator or an MPI communicator with a linked petsc
 33:   inner communicator, we use a simple static array to store dupped communicators
 34:   on rare cases otherwise.
 35:  */
 36:   #define MAX_SHMCOMM_DUPPED_COMMS 16
 37: static PetscInt       num_dupped_comms = 0;
 38: static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
 39: static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
 40: {
 41:   PetscInt i;
 42:   for (i = 0; i < num_dupped_comms; i++) PetscCommDestroy(&shmcomm_dupped_comms[i]);
 43:   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
 44:   return 0;
 45: }
 46: #endif

 48: /*@C
 49:     PetscShmCommGet - Returns a sub-communicator of all ranks that share a common memory

 51:     Collective.

 53:     Input Parameter:
 54: .   globcomm - `MPI_Comm`, which can be a user MPI_Comm or a PETSc inner MPI_Comm

 56:     Output Parameter:
 57: .   pshmcomm - the PETSc shared memory communicator object

 59:     Level: developer

 61:     Note:
 62:        When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

 64: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
 65: @*/
 66: PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
 67: {
 68: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 69:   MPI_Group         globgroup, shmgroup;
 70:   PetscMPIInt      *shmranks, i, flg;
 71:   PetscCommCounter *counter;

 74:   /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
 75:   MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg);
 76:   if (!flg) { /* globcomm is not a petsc comm */
 77:     union
 78:     {
 79:       MPI_Comm comm;
 80:       void    *ptr;
 81:     } ucomm;
 82:     /* check if globcomm already has a linked petsc inner comm */
 83:     MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg);
 84:     if (!flg) {
 85:       /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
 87:       PetscCommDuplicate(globcomm, &globcomm, NULL);
 88:       /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
 89:       if (num_dupped_comms == 0) PetscRegisterFinalize(PetscShmCommDestroyDuppedComms);
 90:       shmcomm_dupped_comms[num_dupped_comms] = globcomm;
 91:       num_dupped_comms++;
 92:     } else {
 93:       /* otherwise, we pull out the inner comm and use it as globcomm */
 94:       globcomm = ucomm.comm;
 95:     }
 96:   }

 98:   /* Check if globcomm already has an attached pshmcomm. If no, create one */
 99:   MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg);
100:   if (flg) return 0;

102:   PetscNew(pshmcomm);
103:   (*pshmcomm)->globcomm = globcomm;

105:   MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm);

107:   MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize);
108:   MPI_Comm_group(globcomm, &globgroup);
109:   MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);
110:   PetscMalloc1((*pshmcomm)->shmsize, &shmranks);
111:   PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks);
112:   for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
113:   MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);
114:   PetscFree(shmranks);
115:   MPI_Group_free(&globgroup);
116:   MPI_Group_free(&shmgroup);

118:   for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]);
119:   MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm);
120:   return 0;
121: #else
122:   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
123: #endif
124: }

126: /*@C
127:     PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator

129:     Input Parameters:
130: +   pshmcomm - the shared memory communicator object
131: -   grank    - the global rank

133:     Output Parameter:
134: .   lrank - the local rank, or `MPI_PROC_NULL` if it does not exist

136:     Level: developer

138:     Developer Notes:
139:     Assumes the pshmcomm->globranks[] is sorted

141:     It may be better to rewrite this to map multiple global ranks to local in the same function call

143: .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
144: @*/
145: PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
146: {
147:   PetscMPIInt low, high, t, i;
148:   PetscBool   flg = PETSC_FALSE;

152:   *lrank = MPI_PROC_NULL;
153:   if (grank < pshmcomm->globranks[0]) return 0;
154:   if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) return 0;
155:   PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL);
156:   if (flg) return 0;
157:   low  = 0;
158:   high = pshmcomm->shmsize;
159:   while (high - low > 5) {
160:     t = (low + high) / 2;
161:     if (pshmcomm->globranks[t] > grank) high = t;
162:     else low = t;
163:   }
164:   for (i = low; i < high; i++) {
165:     if (pshmcomm->globranks[i] > grank) return 0;
166:     if (pshmcomm->globranks[i] == grank) {
167:       *lrank = i;
168:       return 0;
169:     }
170:   }
171:   return 0;
172: }

174: /*@C
175:     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

177:     Input Parameters:
178: +   pshmcomm - the shared memory communicator object
179: -   lrank    - the local rank in the shared memory communicator

181:     Output Parameter:
182: .   grank - the global rank in the global communicator where the shared memory communicator is built

184:     Level: developer

186: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
187: @*/
188: PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
189: {
193:   *grank = pshmcomm->globranks[lrank];
194:   return 0;
195: }

197: /*@C
198:     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

200:     Input Parameter:
201: .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

203:     Output Parameter:
204: .   comm     - the MPI communicator

206:     Level: developer

208: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
209: @*/
210: PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
211: {
214:   *comm = pshmcomm->shmcomm;
215:   return 0;
216: }

218: #if defined(PETSC_HAVE_OPENMP_SUPPORT)
219:   #include <pthread.h>
220:   #include <hwloc.h>
221:   #include <omp.h>

223:   /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available,
224:    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
225:    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
226:    by 50%. Until the reason is found out, we use mmap() instead.
227: */
228:   #define USE_MMAP_ALLOCATE_SHARED_MEMORY

230:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
231:     #include <sys/mman.h>
232:     #include <sys/types.h>
233:     #include <sys/stat.h>
234:     #include <fcntl.h>
235:   #endif

237: struct _n_PetscOmpCtrl {
238:   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
239:   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
240:   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
241:   PetscBool          is_omp_master;   /* rank 0's in omp_comm */
242:   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
243:   pthread_barrier_t *barrier;         /* pointer to the barrier */
244:   hwloc_topology_t   topology;
245:   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
246:   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
247: };

249: /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
250:    contained by the controller.

252:    PETSc OpenMP controller users do not call this function directly. This function exists
253:    only because we want to separate shared memory allocation methods from other code.
254:  */
255: static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
256: {
257:   MPI_Aint              size;
258:   void                 *baseptr;
259:   pthread_barrierattr_t attr;

261:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
262:   PetscInt  fd;
263:   PetscChar pathname[PETSC_MAX_PATH_LEN];
264:   #else
265:   PetscMPIInt disp_unit;
266:   #endif

268:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
269:   size = sizeof(pthread_barrier_t);
270:   if (ctrl->is_omp_master) {
271:     /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
272:     PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN);
273:     PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN);
274:     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
275:     fd = mkstemp(pathname);
277:     ftruncate(fd, size);
278:     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
280:     close(fd);
281:     MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm);
282:     /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
283:     MPI_Barrier(ctrl->omp_comm);
284:     unlink(pathname);
285:   } else {
286:     MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm);
287:     fd = open(pathname, O_RDWR);
289:     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
291:     close(fd);
292:     MPI_Barrier(ctrl->omp_comm);
293:   }
294:   #else
295:   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
296:   MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win);
297:   MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr);
298:   #endif
299:   ctrl->barrier = (pthread_barrier_t *)baseptr;

301:   /* omp master initializes the barrier */
302:   if (ctrl->is_omp_master) {
303:     MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size);
304:     pthread_barrierattr_init(&attr);
305:     pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
306:     pthread_barrier_init(ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
307:     pthread_barrierattr_destroy(&attr);
308:   }

310:   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
311:   MPI_Barrier(ctrl->omp_comm);
312:   return 0;
313: }

315: /* Destroy the pthread barrier in the PETSc OpenMP controller */
316: static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
317: {
318:   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
319:   MPI_Barrier(ctrl->omp_comm);
320:   if (ctrl->is_omp_master) pthread_barrier_destroy(ctrl->barrier);

322:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
323:   munmap(ctrl->barrier, sizeof(pthread_barrier_t));
324:   #else
325:   MPI_Win_free(&ctrl->omp_win);
326:   #endif
327:   return 0;
328: }

330: /*@C
331:     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

333:     Input Parameters:
334: +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
335: -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

337:     Output Parameter:
338: .   pctrl      - a PETSc OpenMP controller

340:     Level: developer

342:     Developer Note:
343:     Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use

345: .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
346: @*/
347: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
348: {
349:   PetscOmpCtrl   ctrl;
350:   unsigned long *cpu_ulongs = NULL;
351:   PetscInt       i, nr_cpu_ulongs;
352:   PetscShmComm   pshmcomm;
353:   MPI_Comm       shm_comm;
354:   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
355:   PetscInt       num_packages, num_cores;

357:   PetscNew(&ctrl);

359:   /*=================================================================================
360:     Init hwloc
361:    ==================================================================================*/
362:   hwloc_topology_init(&ctrl->topology);
363:   #if HWLOC_API_VERSION >= 0x00020000
364:   /* to filter out unneeded info and have faster hwloc_topology_load */
365:   hwloc_topology_set_all_types_filter(ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
366:   hwloc_topology_set_type_filter(ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
367:   #endif
368:   hwloc_topology_load(ctrl->topology);

370:   /*=================================================================================
371:     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
372:     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
373:     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
374:     which is usually passed to third party libraries.
375:    ==================================================================================*/

377:   /* fetch the stored shared memory communicator */
378:   PetscShmCommGet(petsc_comm, &pshmcomm);
379:   PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm);

381:   MPI_Comm_rank(shm_comm, &shm_rank);
382:   MPI_Comm_size(shm_comm, &shm_comm_size);

384:   /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
385:   if (nthreads == -1) {
386:     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
387:     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
388:     nthreads     = num_cores / num_packages;
389:     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
390:   }

393:   if (shm_comm_size % nthreads) PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size);

395:   /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
396:      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
397:      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
398:      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
399:      Use 0 as key so that rank ordering wont change in new comm.
400:    */
401:   color = shm_rank / nthreads;
402:   MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm);

404:   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
405:   MPI_Comm_rank(ctrl->omp_comm, &omp_rank);
406:   if (!omp_rank) {
407:     ctrl->is_omp_master = PETSC_TRUE; /* master */
408:     color               = 0;
409:   } else {
410:     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
411:     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
412:   }
413:   MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm);

415:   /*=================================================================================
416:     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
417:     slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
418:     and run them on the idle CPUs.
419:    ==================================================================================*/
420:   PetscOmpCtrlCreateBarrier(ctrl);

422:   /*=================================================================================
423:     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
424:     is the union of the bindings of all ranks in the omp_comm
425:     =================================================================================*/

427:   ctrl->cpuset = hwloc_bitmap_alloc();
429:   hwloc_get_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

431:   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
432:   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
433:   PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs);
434:   if (nr_cpu_ulongs == 1) {
435:     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
436:   } else {
437:     for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
438:   }

440:   MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm);

442:   if (ctrl->is_omp_master) {
443:     ctrl->omp_cpuset = hwloc_bitmap_alloc();
445:     if (nr_cpu_ulongs == 1) {
446:   #if HWLOC_API_VERSION >= 0x00020000
447:       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
448:   #else
449:       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
450:   #endif
451:     } else {
452:       for (i = 0; i < nr_cpu_ulongs; i++) {
453:   #if HWLOC_API_VERSION >= 0x00020000
454:         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
455:   #else
456:         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
457:   #endif
458:       }
459:     }
460:   }
461:   PetscFree(cpu_ulongs);
462:   *pctrl = ctrl;
463:   return 0;
464: }

466: /*@C
467:     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

469:     Input Parameter:
470: .   pctrl  - a PETSc OpenMP controller

472:     Level: developer

474: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
475: @*/
476: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
477: {
478:   PetscOmpCtrl ctrl = *pctrl;

480:   hwloc_bitmap_free(ctrl->cpuset);
481:   hwloc_topology_destroy(ctrl->topology);
482:   PetscOmpCtrlDestroyBarrier(ctrl);
483:   MPI_Comm_free(&ctrl->omp_comm);
484:   if (ctrl->is_omp_master) {
485:     hwloc_bitmap_free(ctrl->omp_cpuset);
486:     MPI_Comm_free(&ctrl->omp_master_comm);
487:   }
488:   PetscFree(ctrl);
489:   return 0;
490: }

492: /*@C
493:     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

495:     Input Parameter:
496: .   ctrl - a PETSc OMP controller

498:     Output Parameters:
499: +   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
500: .   omp_master_comm  - on master ranks, return a communicator that include master ranks of each omp_comm;
501:                        on slave ranks, `MPI_COMM_NULL` will be return in reality.
502: -   is_omp_master    - true if the calling process is an OMP master rank.

504:     Note:
505:     Any output parameter can be NULL. The parameter is just ignored.

507:     Level: developer

509: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
510: @*/
511: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
512: {
513:   if (omp_comm) *omp_comm = ctrl->omp_comm;
514:   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
515:   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
516:   return 0;
517: }

519: /*@C
520:     PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)

522:     Input Parameter:
523: .   ctrl - a PETSc OMP controller

525:     Notes:
526:     this is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not
527:     require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency,
528:     MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.

530:     A code using `PetscOmpCtrlBarrier()` would be like this,
531: .vb
532:     if (is_omp_master) {
533:       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
534:       Call the library using OpenMP
535:       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
536:     }
537:     PetscOmpCtrlBarrier(ctrl);
538: .ve

540:     Level: developer

542: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
543: @*/
544: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
545: {
546:   int err;

548:   err = pthread_barrier_wait(ctrl->barrier);
550:   return 0;
551: }

553: /*@C
554:     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

556:     Input Parameter:
557: .   ctrl - a PETSc OMP controller

559:     Note:
560:     Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
561:     This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime

563:     Level: developer

565: .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
566: @*/
567: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
568: {
569:   hwloc_set_cpubind(ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
570:   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
571:   return 0;
572: }

574: /*@C
575:    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

577:    Input Parameter:
578: .  ctrl - a PETSc OMP controller

580:    Note:
581:    Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
582:    This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.

584:    Level: developer

586: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
587: @*/
588: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
589: {
590:   hwloc_set_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
591:   omp_set_num_threads(1);
592:   return 0;
593: }

595:   #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
596: #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */