Actual source code: mpishm.c
1: #include <petscsys.h>
2: #include <petsc/private/petscimpl.h>
4: struct _n_PetscShmComm {
5: PetscMPIInt *globranks; /* global ranks of each rank in the shared memory communicator */
6: PetscMPIInt shmsize; /* size of the shared memory communicator */
7: MPI_Comm globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
8: };
10: /*
11: Private routine to delete the internal shared memory communicator when a communicator is freed.
13: This is called by MPI, not by users; MPI_Comm_free() invokes it when the communicator that carries this data as an attribute is freed.
15: Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
17: */
18: PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
19: {
20: PetscShmComm p = (PetscShmComm)val;
22: PetscInfo(NULL, "Deleting shared memory subcommunicator in an MPI_Comm %ld\n", (long)comm);
23: MPI_Comm_free(&p->shmcomm);
24: PetscFree(p->globranks);
25: PetscFree(val);
26: return MPI_SUCCESS;
27: }
29: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
30: /* Data structures to support freeing comms created in PetscShmCommGet().
31: Since we expect communicators passed to PetscShmCommGet() to be either a petsc
32: inner communicator or an MPI communicator with a linked petsc inner communicator,
33: a simple static array suffices to store the dupped communicators created in the
34: rare cases where neither holds.
35: */
36: #define MAX_SHMCOMM_DUPPED_COMMS 16
37: static PetscInt num_dupped_comms = 0;
38: static MPI_Comm shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
39: static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
40: {
41: PetscInt i;
42: for (i = 0; i < num_dupped_comms; i++) PetscCommDestroy(&shmcomm_dupped_comms[i]);
43: num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
44: return 0;
45: }
46: #endif
48: /*@C
49: PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory
51: Collective.
53: Input Parameter:
54: . globcomm - `MPI_Comm`, which can be a user MPI_Comm or a PETSc inner MPI_Comm
56: Output Parameter:
57: . pshmcomm - the PETSc shared memory communicator object
59: Level: developer
61: Note:
62: When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
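Example Usage:
A minimal sketch (error checking omitted); the returned object is stashed on the PETSc inner communicator, so the caller does not destroy it:
.vb
  PetscShmComm shmcomm;
  PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm);
.ve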
64: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
65: @*/
66: PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
67: {
68: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
69: MPI_Group globgroup, shmgroup;
70: PetscMPIInt *shmranks, i, flg;
71: PetscCommCounter *counter;
74: /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
75: MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg);
76: if (!flg) { /* globcomm is not a petsc comm */
77: union
78: {
79: MPI_Comm comm;
80: void *ptr;
81: } ucomm;
82: /* check if globcomm already has a linked petsc inner comm */
83: MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg);
84: if (!flg) {
85: /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
86: if (num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS) SETERRQ(globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
87: PetscCommDuplicate(globcomm, &globcomm, NULL);
88: /* The first time through, register a function to free the dupped petsc comms at PetscFinalize() */
89: if (num_dupped_comms == 0) PetscRegisterFinalize(PetscShmCommDestroyDuppedComms);
90: shmcomm_dupped_comms[num_dupped_comms] = globcomm;
91: num_dupped_comms++;
92: } else {
93: /* otherwise, we pull out the inner comm and use it as globcomm */
94: globcomm = ucomm.comm;
95: }
96: }
98: /* Check if globcomm already has an attached pshmcomm. If not, create one */
99: MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg);
100: if (flg) return 0;
102: PetscNew(pshmcomm);
103: (*pshmcomm)->globcomm = globcomm;
105: MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm);
107: MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize);
108: MPI_Comm_group(globcomm, &globgroup);
109: MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);
110: PetscMalloc1((*pshmcomm)->shmsize, &shmranks);
111: PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks);
112: for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
113: MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);
114: PetscFree(shmranks);
115: MPI_Group_free(&globgroup);
116: MPI_Group_free(&shmgroup);
118: for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]);
119: MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm);
120: return 0;
121: #else
122: SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
123: #endif
124: }
126: /*@C
127: PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
129: Input Parameters:
130: + pshmcomm - the shared memory communicator object
131: - grank - the global rank
133: Output Parameter:
134: . lrank - the local rank, or `MPI_PROC_NULL` if it does not exist
136: Level: developer
138: Developer Notes:
139: Assumes the pshmcomm->globranks[] array is sorted
141: It may be better to rewrite this to map multiple global ranks to local in the same function call
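Example Usage:
A small sketch (error checking omitted) that maps the caller's global rank to its node-local rank:
.vb
  PetscShmComm shmcomm;
  PetscMPIInt  grank, lrank;
  PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm);
  MPI_Comm_rank(PETSC_COMM_WORLD, &grank);
  PetscShmCommGlobalToLocal(shmcomm, grank, &lrank); /* MPI_PROC_NULL is returned for global ranks not on this node */
.ve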
143: .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
144: @*/
145: PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
146: {
147: PetscMPIInt low, high, t, i;
148: PetscBool flg = PETSC_FALSE;
152: *lrank = MPI_PROC_NULL;
153: if (grank < pshmcomm->globranks[0]) return 0;
154: if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) return 0;
155: PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL);
156: if (flg) return 0;
157: low = 0;
158: high = pshmcomm->shmsize;
159: while (high - low > 5) {
160: t = (low + high) / 2;
161: if (pshmcomm->globranks[t] > grank) high = t;
162: else low = t;
163: }
164: for (i = low; i < high; i++) {
165: if (pshmcomm->globranks[i] > grank) return 0;
166: if (pshmcomm->globranks[i] == grank) {
167: *lrank = i;
168: return 0;
169: }
170: }
171: return 0;
172: }
174: /*@C
175: PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
177: Input Parameters:
178: + pshmcomm - the shared memory communicator object
179: - lrank - the local rank in the shared memory communicator
181: Output Parameter:
182: . grank - the global rank in the global communicator where the shared memory communicator is built
184: Level: developer
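Example Usage:
A small sketch (error checking omitted) that finds the global rank of the process with local rank 0 on the caller's node:
.vb
  PetscShmComm shmcomm;
  PetscMPIInt  node_leader;
  PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm);
  PetscShmCommLocalToGlobal(shmcomm, 0, &node_leader);
.ve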
186: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
187: @*/
188: PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
189: {
193: *grank = pshmcomm->globranks[lrank];
194: return 0;
195: }
197: /*@C
198: PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
200: Input Parameter:
201: . pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
203: Output Parameter:
204: . comm - the MPI communicator
206: Level: developer
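Example Usage:
A sketch that retrieves the underlying MPI shared memory communicator; the communicator is owned by the PetscShmComm, so the caller must not free it:
.vb
  PetscShmComm shmcomm;
  MPI_Comm     nodecomm;
  PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm);
  PetscShmCommGetMpiShmComm(shmcomm, &nodecomm);
.ve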
208: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
209: @*/
210: PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
211: {
214: *comm = pshmcomm->shmcomm;
215: return 0;
216: }
218: #if defined(PETSC_HAVE_OPENMP_SUPPORT)
219: #include <pthread.h>
220: #include <hwloc.h>
221: #include <omp.h>
223: /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
224: otherwise use MPI_Win_allocate_shared. They should have the same effect, except that MPI-3 is much
225: simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
226: by 50%. Until the reason is understood, we use mmap() instead.
227: */
228: #define USE_MMAP_ALLOCATE_SHARED_MEMORY
230: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
231: #include <sys/mman.h>
232: #include <sys/types.h>
233: #include <sys/stat.h>
234: #include <fcntl.h>
235: #endif
237: struct _n_PetscOmpCtrl {
238: MPI_Comm omp_comm; /* a shared memory communicator to spawn omp threads */
239: MPI_Comm omp_master_comm; /* a communicator to give to third party libraries */
240: PetscMPIInt omp_comm_size; /* size of omp_comm, a kind of OMP_NUM_THREADS */
241: PetscBool is_omp_master; /* true on rank 0 of omp_comm */
242: MPI_Win omp_win; /* a shared memory window containing a barrier */
243: pthread_barrier_t *barrier; /* pointer to the barrier */
244: hwloc_topology_t topology;
245: hwloc_cpuset_t cpuset; /* cpu bindings of omp master */
246: hwloc_cpuset_t omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
247: };
249: /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
250: contained by the controller.
252: PETSc OpenMP controller users do not call this function directly. This function exists
253: only because we want to separate shared memory allocation methods from other code.
254: */
255: static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
256: {
257: MPI_Aint size;
258: void *baseptr;
259: pthread_barrierattr_t attr;
261: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
262: PetscInt fd;
263: PetscChar pathname[PETSC_MAX_PATH_LEN];
264: #else
265: PetscMPIInt disp_unit;
266: #endif
268: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
269: size = sizeof(pthread_barrier_t);
270: if (ctrl->is_omp_master) {
271: /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
272: PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN);
273: PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN);
274: /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
275: fd = mkstemp(pathname);
277: ftruncate(fd, size);
278: baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
280: close(fd);
281: MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm);
282: /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
283: MPI_Barrier(ctrl->omp_comm);
284: unlink(pathname);
285: } else {
286: MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm);
287: fd = open(pathname, O_RDWR);
289: baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
291: close(fd);
292: MPI_Barrier(ctrl->omp_comm);
293: }
294: #else
295: size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
296: MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win);
297: MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr);
298: #endif
299: ctrl->barrier = (pthread_barrier_t *)baseptr;
301: /* omp master initializes the barrier */
302: if (ctrl->is_omp_master) {
303: MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size);
304: pthread_barrierattr_init(&attr);
305: pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
306: pthread_barrier_init(ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
307: pthread_barrierattr_destroy(&attr);
308: }
310: /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
311: MPI_Barrier(ctrl->omp_comm);
312: return 0;
313: }
315: /* Destroy the pthread barrier in the PETSc OpenMP controller */
316: static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
317: {
318: /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
319: MPI_Barrier(ctrl->omp_comm);
320: if (ctrl->is_omp_master) pthread_barrier_destroy(ctrl->barrier);
322: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
323: munmap(ctrl->barrier, sizeof(pthread_barrier_t));
324: #else
325: MPI_Win_free(&ctrl->omp_win);
326: #endif
327: return 0;
328: }
330: /*@C
331: PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
333: Input Parameters:
334: + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
335: - nthreads - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
337: Output Parameter:
338: . pctrl - a PETSc OpenMP controller
340: Level: developer
342: Developer Note:
343: Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use
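Example Usage:
A sketch of creating and destroying a controller (error checking omitted; `comm` stands for the communicator of some PETSc object and is illustrative):
.vb
  PetscOmpCtrl ctrl;
  PetscOmpCtrlCreate(comm, -1, &ctrl); /* -1: let PETSc choose the number of threads */
  /* ... use PetscOmpCtrlGetOmpComms(), PetscOmpCtrlOmpRegionOnMasterBegin()/End(), PetscOmpCtrlBarrier() ... */
  PetscOmpCtrlDestroy(&ctrl);
.ve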
345: .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
346: @*/
347: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
348: {
349: PetscOmpCtrl ctrl;
350: unsigned long *cpu_ulongs = NULL;
351: PetscInt i, nr_cpu_ulongs;
352: PetscShmComm pshmcomm;
353: MPI_Comm shm_comm;
354: PetscMPIInt shm_rank, shm_comm_size, omp_rank, color;
355: PetscInt num_packages, num_cores;
357: PetscNew(&ctrl);
359: /*=================================================================================
360: Init hwloc
361: ==================================================================================*/
362: hwloc_topology_init(&ctrl->topology);
363: #if HWLOC_API_VERSION >= 0x00020000
364: /* to filter out unneeded info and have faster hwloc_topology_load */
365: hwloc_topology_set_all_types_filter(ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
366: hwloc_topology_set_type_filter(ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
367: #endif
368: hwloc_topology_load(ctrl->topology);
370: /*=================================================================================
371: Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
372: physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
373: others are called slaves. OMP Masters make up a new comm called omp_master_comm,
374: which is usually passed to third party libraries.
375: ==================================================================================*/
377: /* fetch the stored shared memory communicator */
378: PetscShmCommGet(petsc_comm, &pshmcomm);
379: PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm);
381: MPI_Comm_rank(shm_comm, &shm_rank);
382: MPI_Comm_size(shm_comm, &shm_comm_size);
384: /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
385: if (nthreads == -1) {
386: num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
387: num_cores = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
388: nthreads = num_cores / num_packages;
389: if (nthreads > shm_comm_size) nthreads = shm_comm_size;
390: }
393: if (shm_comm_size % nthreads) PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size);
395: /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
396: shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
397: color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
398: be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
399: Use 0 as the key so that the rank ordering won't change in the new comm.
400: */
401: color = shm_rank / nthreads;
402: MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm);
404: /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm, omp_master_comm */
405: MPI_Comm_rank(ctrl->omp_comm, &omp_rank);
406: if (!omp_rank) {
407: ctrl->is_omp_master = PETSC_TRUE; /* master */
408: color = 0;
409: } else {
410: ctrl->is_omp_master = PETSC_FALSE; /* slave */
411: color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
412: }
413: MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm);
415: /*=================================================================================
416: Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
417: slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
418: and run them on the idle CPUs.
419: ==================================================================================*/
420: PetscOmpCtrlCreateBarrier(ctrl);
422: /*=================================================================================
423: omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
424: is the union of the bindings of all ranks in the omp_comm
425: =================================================================================*/
427: ctrl->cpuset = hwloc_bitmap_alloc();
429: hwloc_get_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
431: /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
432: nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
433: PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs);
434: if (nr_cpu_ulongs == 1) {
435: cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
436: } else {
437: for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
438: }
440: MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm);
442: if (ctrl->is_omp_master) {
443: ctrl->omp_cpuset = hwloc_bitmap_alloc();
445: if (nr_cpu_ulongs == 1) {
446: hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
447: } else {
448: for (i = 0; i < nr_cpu_ulongs; i++) hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
449: }
460: }
461: PetscFree(cpu_ulongs);
462: *pctrl = ctrl;
463: return 0;
464: }
466: /*@C
467: PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
469: Input Parameter:
470: . pctrl - a PETSc OpenMP controller
472: Level: developer
474: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
475: @*/
476: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
477: {
478: PetscOmpCtrl ctrl = *pctrl;
480: hwloc_bitmap_free(ctrl->cpuset);
481: hwloc_topology_destroy(ctrl->topology);
482: PetscOmpCtrlDestroyBarrier(ctrl);
483: MPI_Comm_free(&ctrl->omp_comm);
484: if (ctrl->is_omp_master) {
485: hwloc_bitmap_free(ctrl->omp_cpuset);
486: MPI_Comm_free(&ctrl->omp_master_comm);
487: }
488: PetscFree(ctrl);
489: return 0;
490: }
492: /*@C
493: PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
495: Input Parameter:
496: . ctrl - a PETSc OMP controller
498: Output Parameters:
499: + omp_comm - a communicator including one master rank and its slave ranks, where the master spawns threads
500: . omp_master_comm - on master ranks, a communicator that includes the master ranks of each omp_comm;
501: on slave ranks, `MPI_COMM_NULL` is returned.
502: - is_omp_master - true if the calling process is an OMP master rank.
504: Note:
505: Any output parameter can be NULL; such a parameter is simply ignored.
507: Level: developer
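Example Usage:
A sketch of the typical query pattern (error checking omitted; `ctrl` is a controller obtained from `PetscOmpCtrlCreate()`):
.vb
  MPI_Comm  omp_comm, omp_master_comm;
  PetscBool is_master;
  PetscOmpCtrlGetOmpComms(ctrl, &omp_comm, &omp_master_comm, &is_master);
  if (is_master) { /* pass omp_master_comm to the third party library */ }
.ve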
509: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
510: @*/
511: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
512: {
513: if (omp_comm) *omp_comm = ctrl->omp_comm;
514: if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
515: if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
516: return 0;
517: }
519: /*@C
520: PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)
522: Input Parameter:
523: . ctrl - a PETSc OMP controller
525: Notes:
526: this is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
527: require processes blocked by `MPI_Barrier()` to free their CPUs to let other processes progress. In practice, to minimize latency,
528: MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. In contrast, a pthread barrier does have this requirement.
530: Code using `PetscOmpCtrlBarrier()` would look like this:
531: .vb
532: if (is_omp_master) {
533: PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
534: Call the library using OpenMP
535: PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
536: }
537: PetscOmpCtrlBarrier(ctrl);
538: .ve
540: Level: developer
542: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
543: @*/
544: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
545: {
546: int err;
548: err = pthread_barrier_wait(ctrl->barrier);
549: if (err && err != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait() failed with error code %d", err);
550: return 0;
551: }
553: /*@C
554: PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
556: Input Parameter:
557: . ctrl - a PETSc OMP controller
559: Note:
560: Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to find out whether the calling rank is a master rank.
561: This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.
563: Level: developer
565: .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
566: @*/
567: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
568: {
569: hwloc_set_cpubind(ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
570: omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
571: return 0;
572: }
574: /*@C
575: PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
577: Input Parameter:
578: . ctrl - a PETSc OMP controller
580: Note:
581: Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to find out whether the calling rank is a master rank.
582: This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.
584: Level: developer
586: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
587: @*/
588: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
589: {
590: hwloc_set_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
591: omp_set_num_threads(1);
592: return 0;
593: }
595: #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
596: #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */