Actual source code: xmllogevent.c

  1: /*************************************************************************************
  2:  *    M A R I T I M E  R E S E A R C H  I N S T I T U T E  N E T H E R L A N D S     *
  3:  *************************************************************************************
  4:  *    authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij                    *
  5:  *************************************************************************************
  6:  *    content: Support for nested PetscTimers                                        *
  7:  *************************************************************************************/
  8: #include <petsclog.h>
  9: #include <petsc/private/logimpl.h>
 10: #include <petsctime.h>
 11: #include <petscviewer.h>
 12: #include "../src/sys/logging/xmlviewer.h"

 14: #if defined(PETSC_USE_LOG)

 16: /*
 17:  * Support for nested PetscTimers
 18:  *
 19:  * PetscTimers keep track of a lot of useful information: Wall clock times,
 20:  * message passing statistics, flop counts.  Information about the nested structure
 21:  * of the timers is lost. Example:
 22:  *
 23:  * 7:30   Start: awake
 24:  * 7:30      Start: morning routine
 25:  * 7:40         Start: eat
 26:  * 7:49         Done:  eat
 27:  * 7:43      Done:  morning routine
 28:  * 8:15      Start: work
 29:  * 12:15        Start: eat
 30:  * 12:45        Done:  eat
 31:  * 16:00     Done:  work
 32:  * 16:30     Start: evening routine
 33:  * 18:30        Start: eat
 34:  * 19:15        Done:  eat
 35:  * 22:00     Done:  evening routine
 36:  * 22:00  Done:  awake
 37:  *
 38:  * Petsc timers provide the following timer results:
 39:  *
 40:  *    awake:              1 call    14:30 hours
 41:  *    morning routine:    1 call     0:13 hours
 42:  *    eat:                3 calls    1:24 hours
 43:  *    work:               1 call     7:45 hours
 44:  *    evening routine     1 call     5:30 hours
 45:  *
 46:  * Nested timers can be used to get the following table:
 47:  *
 48:  *   [1 call]: awake                14:30 hours
 49:  *   [1 call]:    morning routine         0:13 hours         ( 2 % of awake)
 50:  *   [1 call]:       eat                       0:09 hours         (69 % of morning routine)
 51:  *                   rest (morning routine)    0:04 hours         (31 % of morning routine)
 52:  *   [1 call]:    work                    7:45 hours         (53 % of awake)
 53:  *   [1 call]:       eat                       0:30 hours         ( 6 % of work)
 54:  *                   rest (work)               7:15 hours         (94 % of work)
 55:  *   [1 call]:    evening routine         5:30 hours         (38 % of awake)
 56:  *   [1 call]:       eat                       0:45 hours         (14 % of evening routine)
 57:  *                   rest (evening routine)    4:45 hours         (86 % of morning routine)
 58:  *
 59:  * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
 60:  * the nested timers make the stages unnecessary.
 61:  *
 62:  */

 64: /*
 65:  * Data structures for keeping track of nested timers:
 66:  *
 67:  *   nestedEvents: information about the timers that have actually been activated
 68:  *   dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
 69:  *
 70:  * The Default-timers are used to time the nested timers. Every nested timer corresponds to
 71:  * (one or more) default timers, where one of the default timers has the same event-id as the
 72:  * nested one.
 73:  *
 74:  * Because of the risk of confusion between nested timer ids and default timer ids, we
 75:  * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
 76:  * only for default events. Also, all nested event variables are prepended with 'nst', and
 77:  * default timers with 'dft'.
 78:  */

 80:   #define DFT_ID_AWAKE -1

 82: typedef PetscLogEvent NestedEventId;
 83: typedef struct {
 84:   NestedEventId  nstEvent;         /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
 85:   int            nParents;         /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
 86:   PetscLogEvent *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
 87:   PetscLogEvent *dftEvents;        /* The default timers which represent the different 'instances' of this nested event */

 89:   PetscLogEvent *dftParents;      /* The default timers which were the dftParentActive when this nested event was started */
 90:   PetscLogEvent *dftEventsSorted; /* The default timers which represent the different 'instances' of this nested event */
 91: } PetscNestedEvent;

 93: static PetscLogEvent     dftParentActive        = DFT_ID_AWAKE;
 94: static int               nNestedEvents          = 0;
 95: static int               nNestedEventsAllocated = 0;
 96: static PetscNestedEvent *nestedEvents           = NULL;
 97: static PetscLogDouble    thresholdTime          = 0.01; /* initial value was 0.1 */

 99:   #define THRESHOLD (thresholdTime / 100.0 + 1e-12)

101: static PetscErrorCode       PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
102: static PetscErrorCode       PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);
104: PETSC_INTERN PetscErrorCode PetscLogView_Flamegraph(PetscViewer);

106: /*@C
107:   PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
108:   rates and object creation and should not slow programs down too much.

110:   Logically Collective over `PETSC_COMM_WORLD`

112:   Options Database Keys:
113: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

115:   Usage:
116: .vb
117:       PetscInitialize(...);
118:       PetscLogNestedBegin();
119:        ... code ...
120:       PetscLogView(viewer);
121:       PetscFinalize();
122: .ve

124:   Level: advanced

126: .seealso: `PetscLogDump()`, `PetscLogAllBegin()`, `PetscLogView()`, `PetscLogTraceBegin()`, `PetscLogDefaultBegin()`
127: @*/
128: PetscErrorCode PetscLogNestedBegin(void)
129: {

132:   nNestedEventsAllocated = 10;
133:   PetscMalloc1(nNestedEventsAllocated, &nestedEvents);
134:   dftParentActive = DFT_ID_AWAKE;
135:   nNestedEvents   = 1;

137:   /* 'Awake' is nested event 0. It has no parents */
138:   nestedEvents[0].nstEvent         = 0;
139:   nestedEvents[0].nParents         = 0;
140:   nestedEvents[0].dftParentsSorted = NULL;
141:   nestedEvents[0].dftEvents        = NULL;
142:   nestedEvents[0].dftParents       = NULL;
143:   nestedEvents[0].dftEventsSorted  = NULL;

145:   PetscLogSet(PetscLogEventBeginNested, PetscLogEventEndNested);
146:   return 0;
147: }

149: /* Delete the data structures for the nested timers */
150: PetscErrorCode PetscLogNestedEnd(void)
151: {
152:   int i;

154:   if (!nestedEvents) return 0;
155:   for (i = 0; i < nNestedEvents; i++) PetscFree4(nestedEvents[i].dftParentsSorted, nestedEvents[i].dftEventsSorted, nestedEvents[i].dftParents, nestedEvents[i].dftEvents);
156:   PetscFree(nestedEvents);
157:   nestedEvents           = NULL;
158:   nNestedEvents          = 0;
159:   nNestedEventsAllocated = 0;
160:   return 0;
161: }

163: /*
164:  UTILITIES: FIND STUFF IN SORTED ARRAYS

166:     dftIndex - index to be found
167:     dftArray - sorted array of PetscLogEvent-ids
168:     narray - dimension of dftArray
169:     entry - entry in the array where dftIndex may be found;

171:      if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
172:      In that case, the dftIndex can be inserted at this entry.
173: */
174: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex, const PetscLogEvent *dftArray, int narray, int *entry)
175: {
176:   if (narray == 0 || dftIndex <= dftArray[0]) {
177:     *entry = 0;
178:   } else if (dftIndex > dftArray[narray - 1]) {
179:     *entry = narray;
180:   } else {
181:     int ihigh = narray - 1, ilow = 0;
182:     while (ihigh > ilow) {
183:       const int imiddle = (ihigh + ilow) / 2;
184:       if (dftArray[imiddle] > dftIndex) {
185:         ihigh = imiddle;
186:       } else if (dftArray[imiddle] < dftIndex) {
187:         ilow = imiddle + 1;
188:       } else {
189:         ihigh = imiddle;
190:         ilow  = imiddle;
191:       }
192:     }
193:     *entry = ihigh;
194:   }
195:   return 0;
196: }

198: /*
199:     Utility: find the nested event with given identification

201:     nstEvent - Nested event to be found
202:     entry - entry in the nestedEvents where nstEvent may be found;

204:     if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray
205: */
206: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent, int *entry)
207: {
208:   if (nNestedEvents == 0 || nstEvent <= nestedEvents[0].nstEvent) {
209:     *entry = 0;
210:   } else if (nstEvent > nestedEvents[nNestedEvents - 1].nstEvent) {
211:     *entry = nNestedEvents;
212:   } else {
213:     int ihigh = nNestedEvents - 1, ilow = 0;
214:     while (ihigh > ilow) {
215:       const int imiddle = (ihigh + ilow) / 2;
216:       if (nestedEvents[imiddle].nstEvent > nstEvent) {
217:         ihigh = imiddle;
218:       } else if (nestedEvents[imiddle].nstEvent < nstEvent) {
219:         ilow = imiddle + 1;
220:       } else {
221:         ihigh = imiddle;
222:         ilow  = imiddle;
223:       }
224:     }
225:     *entry = ihigh;
226:   }
227:   return 0;
228: }

230: /*
231:  Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
232:  Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
233: */

235:   #define MAINSTAGE 0

237: static PetscLogStage savedStage = 0;

239: static inline PetscErrorCode PetscLogStageOverride(void)
240: {
241:   PetscStageLog stageLog = petsc_stageLog;

243:   if (stageLog->curStage == MAINSTAGE) return 0;
244:   savedStage         = stageLog->curStage;
245:   stageLog->curStage = MAINSTAGE;
246:   PetscIntStackPush(stageLog->stack, MAINSTAGE);
247:   return 0;
248: }

250: static inline PetscErrorCode PetscLogStageRestore(void)
251: {
252:   PetscStageLog stageLog = petsc_stageLog;

254:   if (savedStage == MAINSTAGE) return 0;
255:   stageLog->curStage = savedStage;
256:   PetscIntStackPop(stageLog->stack, &savedStage);
257:   return 0;
258: }

260: /******************************************************************************************/
261: /* Start a nested event */
262: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
263: {
264:   int           entry, pentry, tentry, i;
265:   PetscLogEvent dftEvent;

267:   PetscLogEventFindNestedTimer(nstEvent, &entry);
268:   if (entry >= nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
269:     /* Nested event doesn't exist yet: create it */

271:     if (nNestedEvents == nNestedEventsAllocated) {
272:       /* Enlarge and re-allocate nestedEvents if needed */
273:       PetscNestedEvent *tmp = nestedEvents;
274:       PetscMalloc1(2 * nNestedEvents, &nestedEvents);
275:       nNestedEventsAllocated *= 2;
276:       PetscArraycpy(nestedEvents, tmp, nNestedEvents);
277:       PetscFree(tmp);
278:     }

280:     /* Clear space in nestedEvents for new nested event */
281:     nNestedEvents++;
282:     for (i = nNestedEvents - 1; i > entry; i--) nestedEvents[i] = nestedEvents[i - 1];

284:     /* Create event in nestedEvents */
285:     nestedEvents[entry].nstEvent = nstEvent;
286:     nestedEvents[entry].nParents = 1;
287:     PetscMalloc4(1, &nestedEvents[entry].dftParentsSorted, 1, &nestedEvents[entry].dftEventsSorted, 1, &nestedEvents[entry].dftParents, 1, &nestedEvents[entry].dftEvents);

289:     /* Fill in new event */
290:     pentry   = 0;
291:     dftEvent = (PetscLogEvent)nstEvent;

293:     nestedEvents[entry].nstEvent                 = nstEvent;
294:     nestedEvents[entry].dftParents[pentry]       = dftParentActive;
295:     nestedEvents[entry].dftEvents[pentry]        = dftEvent;
296:     nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
297:     nestedEvents[entry].dftEventsSorted[pentry]  = dftEvent;

299:   } else {
300:     /* Nested event exists: find current dftParentActive among parents */
301:     PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
302:     PetscLogEvent *dftEvents        = nestedEvents[entry].dftEvents;
303:     int            nParents         = nestedEvents[entry].nParents;

305:     PetscLogEventFindDefaultTimer(dftParentActive, dftParentsSorted, nParents, &pentry);

307:     if (pentry >= nParents || dftParentActive != dftParentsSorted[pentry]) {
308:       /* dftParentActive not in the list: add it to the list */
309:       int            i;
310:       PetscLogEvent *dftParents      = nestedEvents[entry].dftParents;
311:       PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
312:       char           name[100];

314:       /* Register a new default timer */
315:       sprintf(name, "%d -> %d", (int)dftParentActive, (int)nstEvent);
316:       PetscLogEventRegister(name, 0, &dftEvent);
317:       PetscLogEventFindDefaultTimer(dftEvent, dftEventsSorted, nParents, &tentry);

319:       /* Reallocate parents and dftEvents to make space for new parent */
320:       PetscMalloc4(1 + nParents, &nestedEvents[entry].dftParentsSorted, 1 + nParents, &nestedEvents[entry].dftEventsSorted, 1 + nParents, &nestedEvents[entry].dftParents, 1 + nParents, &nestedEvents[entry].dftEvents);
321:       PetscArraycpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents);
322:       PetscArraycpy(nestedEvents[entry].dftEventsSorted, dftEventsSorted, nParents);
323:       PetscArraycpy(nestedEvents[entry].dftParents, dftParents, nParents);
324:       PetscArraycpy(nestedEvents[entry].dftEvents, dftEvents, nParents);
325:       PetscFree4(dftParentsSorted, dftEventsSorted, dftParents, dftEvents);

327:       dftParents       = nestedEvents[entry].dftParents;
328:       dftEvents        = nestedEvents[entry].dftEvents;
329:       dftParentsSorted = nestedEvents[entry].dftParentsSorted;
330:       dftEventsSorted  = nestedEvents[entry].dftEventsSorted;

332:       nestedEvents[entry].nParents++;
333:       nParents++;

335:       for (i = nParents - 1; i > pentry; i--) {
336:         dftParentsSorted[i] = dftParentsSorted[i - 1];
337:         dftEvents[i]        = dftEvents[i - 1];
338:       }
339:       for (i = nParents - 1; i > tentry; i--) {
340:         dftParents[i]      = dftParents[i - 1];
341:         dftEventsSorted[i] = dftEventsSorted[i - 1];
342:       }

344:       /* Fill in the new default timer */
345:       dftParentsSorted[pentry] = dftParentActive;
346:       dftEvents[pentry]        = dftEvent;
347:       dftParents[tentry]       = dftParentActive;
348:       dftEventsSorted[tentry]  = dftEvent;

350:     } else {
351:       /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
352:       dftEvent = nestedEvents[entry].dftEvents[pentry];
353:     }
354:   }

356:   /* Start the default 'dftEvent'-timer and update the dftParentActive */
357:   PetscLogStageOverride();
358:   PetscLogEventBeginDefault(dftEvent, t, o1, o2, o3, o4);
359:   PetscLogStageRestore();
360:   dftParentActive = dftEvent;
361:   return 0;
362: }

364: /* End a nested event */
365: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
366: {
367:   int            entry, pentry, nParents;
368:   PetscLogEvent *dftEventsSorted;

370:   /* Find the nested event */
371:   PetscLogEventFindNestedTimer(nstEvent, &entry);
374:   dftEventsSorted = nestedEvents[entry].dftEventsSorted;
375:   nParents        = nestedEvents[entry].nParents;

377:   /* Find the current default timer among the 'dftEvents' of this event */
378:   PetscLogEventFindDefaultTimer(dftParentActive, dftEventsSorted, nParents, &pentry);


383:   /* Stop the default timer and update the dftParentActive */
384:   PetscLogStageOverride();
385:   PetscLogEventEndDefault(dftParentActive, t, o1, o2, o3, o4);
386:   PetscLogStageRestore();
387:   dftParentActive = nestedEvents[entry].dftParents[pentry];
388:   return 0;
389: }

391: /*@
392:    PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
393:           that takes 1 or more percent of the time.

395:   Logically Collective over `PETSC_COMM_WORLD`

397:   Input Parameter:
398: .   newThresh - the threshold to use

400:   Output Parameter:
401: .   oldThresh - the previously set threshold value

403:   Options Database Keys:
404: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

406:   Usage:
407: .vb
408:       PetscInitialize(...);
409:       PetscLogNestedBegin();
410:       PetscLogSetThreshold(0.1,&oldthresh);
411:        ... code ...
412:       PetscLogView(viewer);
413:       PetscFinalize();
414: .ve

416:   Level: advanced

418: .seealso: `PetscLogDump()`, `PetscLogAllBegin()`, `PetscLogView()`, `PetscLogTraceBegin()`, `PetscLogDefaultBegin()`,
419:           `PetscLogNestedBegin()`
420: @*/
421: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
422: {
423:   if (oldThresh) *oldThresh = thresholdTime;
424:   if (newThresh == PETSC_DECIDE) newThresh = 0.01;
425:   if (newThresh == PETSC_DEFAULT) newThresh = 0.01;
426:   thresholdTime = PetscMax(newThresh, 0.0);
427:   return 0;
428: }

430: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
431: {
432:   char        arch[128], hostname[128], username[128], pname[PETSC_MAX_PATH_LEN], date[128];
433:   char        version[256], buildoptions[128] = "";
434:   PetscMPIInt size;
435:   size_t      len;

437:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer), &size);
438:   PetscGetArchType(arch, sizeof(arch));
439:   PetscGetHostName(hostname, sizeof(hostname));
440:   PetscGetUserName(username, sizeof(username));
441:   PetscGetProgramName(pname, sizeof(pname));
442:   PetscGetDate(date, sizeof(date));
443:   PetscGetVersion(version, sizeof(version));

445:   PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
446:   PetscViewerXMLPutString(viewer, "executable", "Executable", pname);
447:   PetscViewerXMLPutString(viewer, "architecture", "Architecture", arch);
448:   PetscViewerXMLPutString(viewer, "hostname", "Host", hostname);
449:   PetscViewerXMLPutInt(viewer, "nprocesses", "Number of processes", size);
450:   PetscViewerXMLPutString(viewer, "user", "Run by user", username);
451:   PetscViewerXMLPutString(viewer, "date", "Started at", date);
452:   PetscViewerXMLPutString(viewer, "petscrelease", "Petsc Release", version);

454:   if (PetscDefined(USE_DEBUG)) PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions));
455:   if (PetscDefined(USE_COMPLEX)) PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions));
456:   if (PetscDefined(USE_REAL_SINGLE)) {
457:     PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions));
458:   } else if (PetscDefined(USE_REAL___FLOAT128)) {
459:     PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions));
460:   } else if (PetscDefined(USE_REAL___FP16)) {
461:     PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions));
462:   }
463:   if (PetscDefined(USE_64BIT_INDICES)) PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions));
464:   #if defined(__cplusplus)
465:   PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions));
466:   #endif
467:   PetscStrlen(buildoptions, &len);
468:   if (len) PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
469:   PetscViewerXMLEndSection(viewer, "runspecification");
470:   return 0;
471: }

473: /* Print the global performance: max, max/min, average and total of
474:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
475:  */
476: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble local_val, const PetscBool print_average, const PetscBool print_total)
477: {
478:   PetscLogDouble min, tot, ratio, avg;
479:   MPI_Comm       comm;
480:   PetscMPIInt    rank, size;
481:   PetscLogDouble valrank[2], max[2];

483:   PetscObjectGetComm((PetscObject)viewer, &comm);
484:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer), &size);
485:   MPI_Comm_rank(comm, &rank);

487:   valrank[0] = local_val;
488:   valrank[1] = (PetscLogDouble)rank;
489:   MPIU_Allreduce(&local_val, &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
490:   MPIU_Allreduce(valrank, &max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
491:   MPIU_Allreduce(&local_val, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
492:   avg = tot / ((PetscLogDouble)size);
493:   if (min != 0.0) ratio = max[0] / min;
494:   else ratio = 0.0;

496:   PetscViewerXMLStartSection(viewer, name, desc);
497:   PetscViewerXMLPutDouble(viewer, "max", NULL, max[0], "%e");
498:   PetscViewerXMLPutInt(viewer, "maxrank", "rank at which max was found", (PetscMPIInt)max[1]);
499:   PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
500:   if (print_average) PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
501:   if (print_total) PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
502:   PetscViewerXMLEndSection(viewer, name);
503:   return 0;
504: }

506: /* Print the global performance: max, max/min, average and total of
507:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
508:  */
509: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
510: {
511:   PetscLogDouble  flops, mem, red, mess;
512:   const PetscBool print_total_yes = PETSC_TRUE, print_total_no = PETSC_FALSE, print_average_no = PETSC_FALSE, print_average_yes = PETSC_TRUE;

514:   /* Must preserve reduction count before we go on */
515:   red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;

517:   /* Calculate summary information */
518:   PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");

520:   /*   Time */
521:   PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", locTotalTime, print_average_yes, print_total_no);

523:   /*   Objects */
524:   PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", (PetscLogDouble)petsc_numObjects, print_average_yes, print_total_no);

526:   /*   Flop */
527:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", petsc_TotalFlops / 1.0E6, print_average_yes, print_total_yes);

529:   /*   Flop/sec -- Must talk to Barry here */
530:   if (locTotalTime != 0.0) flops = petsc_TotalFlops / locTotalTime;
531:   else flops = 0.0;
532:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", flops / 1.0E6, print_average_yes, print_total_yes);

534:   /*   Memory */
535:   PetscMallocGetMaximumUsage(&mem);
536:   if (mem > 0.0) PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", mem / 1024.0 / 1024.0, print_average_yes, print_total_yes);
537:   /*   Messages */
538:   mess = 0.5 * (petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
539:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", mess, print_average_yes, print_total_yes);

541:   /*   Message Volume */
542:   mess = 0.5 * (petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
543:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", mess / 1024.0 / 1024.0, print_average_yes, print_total_yes);

545:   /*   Reductions */
546:   PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", red, print_average_no, print_total_no);
547:   PetscViewerXMLEndSection(viewer, "globalperformance");
548:   return 0;
549: }

551: typedef struct {
552:   PetscLogEvent  dftEvent;
553:   NestedEventId  nstEvent;
554:   PetscLogEvent  dftParent;
555:   NestedEventId  nstParent;
556:   PetscBool      own;
557:   int            depth;
558:   NestedEventId *nstPath;
559: } PetscNestedEventTree;

561: /* Compare timers to sort them in the tree */
562: static int compareTreeItems(const void *item1_, const void *item2_)
563: {
564:   int                   i;
565:   PetscNestedEventTree *item1 = (PetscNestedEventTree *)item1_;
566:   PetscNestedEventTree *item2 = (PetscNestedEventTree *)item2_;

568:   for (i = 0; i < PetscMin(item1->depth, item2->depth); i++) {
569:     if (item1->nstPath[i] < item2->nstPath[i]) return -1;
570:     if (item1->nstPath[i] > item2->nstPath[i]) return +1;
571:   }
572:   if (item1->depth < item2->depth) return -1;
573:   if (item1->depth > item2->depth) return 1;
574:   return 0;
575: }
576: /*
577:  * Do MPI communication to get the complete, nested calling tree for all processes: there may be
578:  * calls that happen in some processes, but not in others.
579:  *
580:  * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
581:  * The tree is sorted so that the timers can be printed in the order of appearance.
582:  *
583:  * For tree-items which appear in the trees of multiple processes (which will be most items), the
584:  * following rule is followed:
585:  * + if information from my own process is available, then that is the information stored in tree.
586:  *   otherwise it is some other process's information.
587:  */
588: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
589: {
590:   PetscNestedEventTree *tree = NULL, *newTree;
591:   int                  *treeIndices;
592:   int                   nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
593:   int                   yesno;
594:   PetscBool             done;
595:   int                   maxdepth;
596:   int                   depth;
597:   int                   illegalEvent;
598:   int                   iextra;
599:   NestedEventId        *nstPath, *nstMyPath;
600:   MPI_Comm              comm;

602:   PetscObjectGetComm((PetscObject)viewer, &comm);

604:   /* Calculate memory needed to store everybody's information and allocate tree */
605:   nTimers = 0;
606:   for (i = 0; i < nNestedEvents; i++) nTimers += nestedEvents[i].nParents;

608:   PetscMalloc1(nTimers, &tree);

610:   /* Fill tree with readily available information */
611:   iTimer0         = 0;
612:   maxDefaultTimer = 0;
613:   for (i = 0; i < nNestedEvents; i++) {
614:     int            nParents         = nestedEvents[i].nParents;
615:     NestedEventId  nstEvent         = nestedEvents[i].nstEvent;
616:     PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
617:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
618:     for (j = 0; j < nParents; j++) {
619:       maxDefaultTimer = PetscMax(dftEvents[j], maxDefaultTimer);

621:       tree[iTimer0 + j].dftEvent  = dftEvents[j];
622:       tree[iTimer0 + j].nstEvent  = nstEvent;
623:       tree[iTimer0 + j].dftParent = dftParentsSorted[j];
624:       tree[iTimer0 + j].own       = PETSC_TRUE;

626:       tree[iTimer0 + j].nstParent = 0;
627:       tree[iTimer0 + j].depth     = 0;
628:       tree[iTimer0 + j].nstPath   = NULL;
629:     }
630:     iTimer0 += nParents;
631:   }

633:   /* Calculate the global maximum for the default timer index, so array treeIndices can
634:    * be allocated only once */
635:   MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
636:   maxDefaultTimer = j;

638:   /* Find default timer's place in the tree */
639:   PetscCalloc1(maxDefaultTimer + 1, &treeIndices);
640:   treeIndices[0] = 0;
641:   for (i = 0; i < nTimers; i++) {
642:     PetscLogEvent dftEvent = tree[i].dftEvent;
643:     treeIndices[dftEvent]  = i;
644:   }

646:   /* Find each dftParent's nested identification */
647:   for (i = 0; i < nTimers; i++) {
648:     PetscLogEvent dftParent = tree[i].dftParent;
649:     if (dftParent != DFT_ID_AWAKE) {
650:       int j             = treeIndices[dftParent];
651:       tree[i].nstParent = tree[j].nstEvent;
652:     }
653:   }

655:   /* Find depths for each timer path */
656:   done     = PETSC_FALSE;
657:   maxdepth = 0;
658:   while (!done) {
659:     done = PETSC_TRUE;
660:     for (i = 0; i < nTimers; i++) {
661:       if (tree[i].dftParent == DFT_ID_AWAKE) {
662:         tree[i].depth = 1;
663:         maxdepth      = PetscMax(1, maxdepth);
664:       } else {
665:         int j = treeIndices[tree[i].dftParent];
666:         depth = 1 + tree[j].depth;
667:         if (depth > tree[i].depth) {
668:           done          = PETSC_FALSE;
669:           tree[i].depth = depth;
670:           maxdepth      = PetscMax(depth, maxdepth);
671:         }
672:       }
673:     }
674:   }

676:   /* Allocate the paths in the entire tree */
677:   for (i = 0; i < nTimers; i++) {
678:     depth = tree[i].depth;
679:     PetscCalloc1(depth, &tree[i].nstPath);
680:   }

682:   /* Calculate the paths for all timers */
683:   for (depth = 1; depth <= maxdepth; depth++) {
684:     for (i = 0; i < nTimers; i++) {
685:       if (tree[i].depth == depth) {
686:         if (depth > 1) {
687:           int j = treeIndices[tree[i].dftParent];
688:           PetscArraycpy(tree[i].nstPath, tree[j].nstPath, depth - 1);
689:         }
690:         tree[i].nstPath[depth - 1] = tree[i].nstEvent;
691:       }
692:     }
693:   }
694:   PetscFree(treeIndices);

696:   /* Sort the tree on basis of the paths */
697:   qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);

699:   /* Allocate an array to store paths */
700:   depth = maxdepth;
701:   MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
702:   PetscMalloc1(maxdepth + 1, &nstPath);
703:   PetscMalloc1(maxdepth + 1, &nstMyPath);

705:   /* Find an illegal nested event index (1+largest nested event index) */
706:   illegalEvent = 1 + nestedEvents[nNestedEvents - 1].nstEvent;
707:   i            = illegalEvent;
708:   MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);

710:   /* First, detect timers which are not available in this process, but are available in others
711:    *        Allocate a new tree, that can contain all timers
712:    * Then,  fill the new tree with all (own and not-own) timers */
713:   newTree = NULL;
714:   for (yesno = 0; yesno <= 1; yesno++) {
715:     depth  = 1;
716:     i      = 0;
717:     iextra = 0;
718:     while (depth > 0) {
719:       int       j;
720:       PetscBool same;

722:       /* Construct the next path in this process's tree:
723:        * if necessary, supplement with invalid path entries */
724:       depth++;
726:       if (i < nTimers) {
727:         for (j = 0; j < tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
728:         for (j = tree[i].depth; j < depth; j++) nstMyPath[j] = illegalEvent;
729:       } else {
730:         for (j = 0; j < depth; j++) nstMyPath[j] = illegalEvent;
731:       }

733:       /* Communicate with other processes to obtain the next path and its depth */
734:       MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
735:       for (j = depth - 1; (int)j >= 0; j--) {
736:         if (nstPath[j] == illegalEvent) depth = j;
737:       }

739:       if (depth > 0) {
740:         /* If the path exists */

742:         /* check whether the next path is the same as this process's next path */
743:         same = PETSC_TRUE;
744:         for (j = 0; same && j < depth; j++) same = (same && nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;

746:         if (same) {
747:           /* Register 'own path' */
748:           if (newTree) newTree[i + iextra] = tree[i];
749:           i++;
750:         } else {
751:           /* Register 'not an own path' */
752:           if (newTree) {
753:             newTree[i + iextra].nstEvent = nstPath[depth - 1];
754:             newTree[i + iextra].own      = PETSC_FALSE;
755:             newTree[i + iextra].depth    = depth;
756:             PetscMalloc1(depth, &newTree[i + iextra].nstPath);
757:             for (j = 0; j < depth; j++) newTree[i + iextra].nstPath[j] = nstPath[j];

759:             newTree[i + iextra].dftEvent  = 0;
760:             newTree[i + iextra].dftParent = 0;
761:             newTree[i + iextra].nstParent = 0;
762:           }
763:           iextra++;
764:         }
765:       }
766:     }

768:     /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
769:     totalNTimers = nTimers + iextra;
770:     if (!newTree) PetscMalloc1(totalNTimers, &newTree);
771:   }
772:   PetscFree(nstPath);
773:   PetscFree(nstMyPath);
774:   PetscFree(tree);
775:   tree    = newTree;
776:   newTree = NULL;

778:   /* Set return value and return */
779:   *p_tree    = tree;
780:   *p_nTimers = totalNTimers;
781:   return 0;
782: }

784: /*
785:  * Delete the nested timer tree
786:  */
787: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
788: {
789:   int i;

791:   for (i = 0; i < nTimers; i++) PetscFree(tree[i].nstPath);
792:   PetscFree(tree);
793:   return 0;
794: }

796: /* Print the global performance: max, max/min, average and total of
797:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
798:  */
799: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer, const char *name, PetscLogDouble value, PetscLogDouble minthreshold, PetscLogDouble maxthreshold, PetscLogDouble minmaxtreshold)
800: {
801:   MPI_Comm       comm; /* MPI communicator in reduction */
802:   PetscMPIInt    rank; /* rank of this process */
803:   PetscLogDouble val_in[2], max[2], min[2];
804:   PetscLogDouble minvalue, maxvalue, tot;
805:   PetscMPIInt    size;
806:   PetscMPIInt    minLoc, maxLoc;

808:   PetscObjectGetComm((PetscObject)viewer, &comm);
809:   MPI_Comm_size(comm, &size);
810:   MPI_Comm_rank(comm, &rank);
811:   val_in[0] = value;
812:   val_in[1] = (PetscLogDouble)rank;
813:   MPIU_Allreduce(val_in, max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
814:   MPIU_Allreduce(val_in, min, 1, MPIU_2PETSCLOGDOUBLE, MPI_MINLOC, comm);
815:   maxvalue = max[0];
816:   maxLoc   = (PetscMPIInt)max[1];
817:   minvalue = min[0];
818:   minLoc   = (PetscMPIInt)min[1];
819:   MPIU_Allreduce(&value, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);

821:   if (maxvalue < maxthreshold && minvalue >= minthreshold) {
822:     /* One call per parent or NO value: don't print */
823:   } else {
824:     PetscViewerXMLStartSection(viewer, name, NULL);
825:     if (maxvalue > minvalue * minmaxtreshold) {
826:       PetscViewerXMLPutDouble(viewer, "avgvalue", NULL, tot / size, "%g");
827:       PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%g");
828:       PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%g");
829:       PetscViewerXMLPutInt(viewer, "minloc", NULL, minLoc);
830:       PetscViewerXMLPutInt(viewer, "maxloc", NULL, maxLoc);
831:     } else {
832:       PetscViewerXMLPutDouble(viewer, "value", NULL, tot / size, "%g");
833:     }
834:     PetscViewerXMLEndSection(viewer, name);
835:   }
836:   return 0;
837: }

839:   #define N_COMM 8
840: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer, PetscEventPerfInfo perfInfo, PetscLogDouble countsPerCall, int parentCount, int depth, const char *name, PetscLogDouble totalTime, PetscBool *isPrinted)
841: {
842:   PetscLogDouble time = perfInfo.time;
843:   PetscLogDouble timeMx;
844:   MPI_Comm       comm;

846:   PetscObjectGetComm((PetscObject)viewer, &comm);
847:   MPIU_Allreduce(&time, &timeMx, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
848:   *isPrinted = ((timeMx / totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
849:   if (*isPrinted) {
850:     PetscViewerXMLStartSection(viewer, "event", NULL);
851:     PetscViewerXMLPutString(viewer, "name", NULL, name);
852:     PetscPrintXMLNestedLinePerfResults(viewer, "time", time / totalTime * 100.0, 0, 0, 1.02);
853:     PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", parentCount > 0 ? countsPerCall : 0, 0.99, 1.01, 1.02);
854:     PetscPrintXMLNestedLinePerfResults(viewer, "mflops", time >= timeMx * 0.001 ? 1e-6 * perfInfo.flops / time : 0, 0, 0.01, 1.05);
855:     PetscPrintXMLNestedLinePerfResults(viewer, "mbps", time >= timeMx * 0.001 ? perfInfo.messageLength / (1024 * 1024 * time) : 0, 0, 0.01, 1.05);
856:     PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", time >= timeMx * 0.001 ? perfInfo.numReductions / time : 0, 0, 0.01, 1.05);
857:   }
858:   return 0;
859: }

861: /* Count the number of times the parent event was called */

863: static int countParents(const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
864: {
865:   if (tree[i].depth <= 1) {
866:     return 1; /* Main event: only once */
867:   } else if (!tree[i].own) {
868:     return 1; /* This event didn't happen in this process, but did in another */
869:   } else {
870:     int iParent;
871:     for (iParent = i - 1; iParent >= 0; iParent--) {
872:       if (tree[iParent].depth == tree[i].depth - 1) break;
873:     }
874:     if (tree[iParent].depth != tree[i].depth - 1) {
875:       /* *****  Internal error: cannot find parent */
876:       return -2;
877:     } else {
878:       PetscLogEvent dftEvent = tree[iParent].dftEvent;
879:       return eventPerfInfo[dftEvent].count;
880:     }
881:   }
882: }

884: typedef struct {
885:   int            id;
886:   PetscLogDouble val;
887: } PetscSortItem;

889: static int compareSortItems(const void *item1_, const void *item2_)
890: {
891:   PetscSortItem *item1 = (PetscSortItem *)item1_;
892:   PetscSortItem *item2 = (PetscSortItem *)item2_;
893:   if (item1->val > item2->val) return -1;
894:   if (item1->val < item2->val) return +1;
895:   return 0;
896: }

898: /*
899:  * Find the number of child events.
900:  */
901: static PetscErrorCode PetscLogNestedTreeGetChildrenCount(const PetscNestedEventTree *tree, int nTimers, int iStart, int depth, int *nChildren)
902: {
903:   int n = 0;

905:   for (int i = iStart + 1; i < nTimers; i++) {
906:     if (tree[i].depth <= depth) break;
907:     if (tree[i].depth == depth + 1) n++;
908:   }
909:   *nChildren = n;
910:   return 0;
911: }

913: /*
914:  * Initialize child event sort items with ID and times.
915:  */
916: static PetscErrorCode PetscLogNestedTreeSetChildrenSortItems(const PetscViewer viewer, const PetscNestedEventTree *tree, int nTimers, int iStart, int depth, int nChildren, PetscSortItem **children)
917: {
918:   MPI_Comm            comm;
919:   PetscLogDouble     *times, *maxTimes;
920:   PetscStageLog       stageLog;
921:   PetscEventPerfInfo *eventPerfInfo;
922:   const int           stage = MAINSTAGE;

924:   PetscObjectGetComm((PetscObject)viewer, &comm);
925:   PetscLogGetStageLog(&stageLog);
926:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

928:   if (nChildren > 0) {
929:     /* Create an array for the id-s and maxTimes of the children,
930:      *  leaving 2 spaces for self-time and other-time */

932:     PetscMalloc1(nChildren + 2, children);
933:     nChildren = 0;
934:     for (int i = iStart + 1; i < nTimers; i++) {
935:       if (tree[i].depth <= depth) break;
936:       if (tree[i].depth == depth + 1) {
937:         (*children)[nChildren].id  = i;
938:         (*children)[nChildren].val = eventPerfInfo[tree[i].dftEvent].time;
939:         nChildren++;
940:       }
941:     }

943:     /* Calculate the children's maximum times, to see whether children will be ignored or printed */
944:     PetscMalloc1(nChildren, &times);
945:     for (int i = 0; i < nChildren; i++) times[i] = (*children)[i].val;

947:     PetscMalloc1(nChildren, &maxTimes);
948:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
949:     PetscFree(times);

951:     for (int i = 0; i < nChildren; i++) (*children)[i].val = maxTimes[i];
952:     PetscFree(maxTimes);
953:   }
954:   return 0;
955: }

957: /*
958:  * Set 'self' and 'other' performance info.
959:  */
960: static PetscErrorCode PetscLogNestedTreeSetSelfOtherPerfInfo(const PetscNestedEventTree *tree, int iStart, PetscLogDouble totalTime, const PetscSortItem *children, int nChildren, PetscEventPerfInfo *myPerfInfo, PetscEventPerfInfo *selfPerfInfo, PetscEventPerfInfo *otherPerfInfo, int *parentCount, PetscLogDouble *countsPerCall)
961: {
962:   const int           stage = MAINSTAGE;
963:   PetscStageLog       stageLog;
964:   PetscEventPerfInfo *eventPerfInfo;

966:   PetscLogGetStageLog(&stageLog);
967:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
968:   if (!tree[iStart].own) {
969:     /* Set values for a timer that was not activated in this process
970:    * (but was, in other processes of this run) */
971:     PetscMemzero(myPerfInfo, sizeof(*myPerfInfo));

973:     *selfPerfInfo  = *myPerfInfo;
974:     *otherPerfInfo = *myPerfInfo;

976:     *parentCount   = 1;
977:     *countsPerCall = 0;
978:   } else {
979:     /* Set the values for a timer that was activated in this process */
980:     PetscLogEvent dftEvent = tree[iStart].dftEvent;

982:     *parentCount   = countParents(tree, eventPerfInfo, iStart);
983:     *myPerfInfo    = eventPerfInfo[dftEvent];
984:     *countsPerCall = (PetscLogDouble)myPerfInfo->count / (PetscLogDouble)*parentCount;

986:     *selfPerfInfo                = *myPerfInfo;
987:     otherPerfInfo->time          = 0;
988:     otherPerfInfo->flops         = 0;
989:     otherPerfInfo->numMessages   = 0;
990:     otherPerfInfo->messageLength = 0;
991:     otherPerfInfo->numReductions = 0;

993:     for (int i = 0; i < nChildren; i++) {
994:       /* For all child counters: subtract the child values from self-timers */

996:       PetscLogEvent      dftChild      = tree[children[i].id].dftEvent;
997:       PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];

999:       selfPerfInfo->time -= childPerfInfo.time;
1000:       selfPerfInfo->flops -= childPerfInfo.flops;
1001:       selfPerfInfo->numMessages -= childPerfInfo.numMessages;
1002:       selfPerfInfo->messageLength -= childPerfInfo.messageLength;
1003:       selfPerfInfo->numReductions -= childPerfInfo.numReductions;

1005:       if ((children[i].val / totalTime) < THRESHOLD) {
1006:         /* Add them to 'other' if the time is ignored in the output */
1007:         otherPerfInfo->time += childPerfInfo.time;
1008:         otherPerfInfo->flops += childPerfInfo.flops;
1009:         otherPerfInfo->numMessages += childPerfInfo.numMessages;
1010:         otherPerfInfo->messageLength += childPerfInfo.messageLength;
1011:         otherPerfInfo->numReductions += childPerfInfo.numReductions;
1012:       }
1013:     }
1014:   }
1015:   return 0;
1016: }

1018: /*
1019:  * Set max times across ranks for 'self' and 'other'.
1020:  */
1021: static PetscErrorCode PetscLogNestedTreeSetMaxTimes(MPI_Comm comm, int nChildren, const PetscEventPerfInfo selfPerfInfo, const PetscEventPerfInfo otherPerfInfo, PetscSortItem *children)
1022: {
1023:   PetscLogDouble times[2], maxTimes[2];

1025:   times[0] = selfPerfInfo.time;
1026:   times[1] = otherPerfInfo.time;

1028:   MPIU_Allreduce(times, maxTimes, 2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1029:   children[nChildren + 0].id  = -1;
1030:   children[nChildren + 0].val = maxTimes[0];
1031:   children[nChildren + 1].id  = -2;
1032:   children[nChildren + 1].val = maxTimes[1];
1033:   return 0;
1034: }

1036: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
1037: {
1038:   int                depth = tree[iStart].depth;
1039:   const char        *name;
1040:   int                parentCount = 1, nChildren;
1041:   PetscSortItem     *children;
1042:   PetscStageLog      stageLog;
1043:   PetscEventRegInfo *eventRegInfo;
1044:   PetscEventPerfInfo myPerfInfo = {0}, selfPerfInfo = {0}, otherPerfInfo = {0};
1045:   PetscLogDouble     countsPerCall = 0;
1046:   PetscBool          wasPrinted;
1047:   PetscBool          childWasPrinted;
1048:   MPI_Comm           comm;

1050:   /* Look up the name of the event and its PerfInfo */
1051:   PetscLogGetStageLog(&stageLog);
1052:   eventRegInfo = stageLog->eventLog->eventInfo;
1053:   name         = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;
1054:   PetscObjectGetComm((PetscObject)viewer, &comm);

1056:   PetscLogNestedTreeGetChildrenCount(tree, nTimers, iStart, depth, &nChildren);
1057:   PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, iStart, depth, nChildren, &children);
1058:   PetscLogNestedTreeSetSelfOtherPerfInfo(tree, iStart, totalTime, children, nChildren, &myPerfInfo, &selfPerfInfo, &otherPerfInfo, &parentCount, &countsPerCall);

1060:   /* Main output for this timer */
1061:   PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);

1063:   /* Now print the lines for the children */
1064:   if (nChildren > 0) {
1065:     int i;

1067:     /* Calculate max-times for 'self' and 'other' */
1068:     PetscLogNestedTreeSetMaxTimes(comm, nChildren, selfPerfInfo, otherPerfInfo, children);

1070:     /* Now sort the children (including 'self' and 'other') on total time */
1071:     qsort(children, nChildren + 2, sizeof(PetscSortItem), compareSortItems);

1073:     /* Print (or ignore) the children in ascending order of total time */
1074:     PetscViewerXMLStartSection(viewer, "events", NULL);
1075:     for (i = 0; i < nChildren + 2; i++) {
1076:       if ((children[i].val / totalTime) < THRESHOLD) {
1077:         /* ignored: no output */
1078:       } else if (children[i].id == -1) {
1079:         PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth + 1, "self", totalTime, &childWasPrinted);
1080:         if (childWasPrinted) PetscViewerXMLEndSection(viewer, "event");
1081:       } else if (children[i].id == -2) {
1082:         size_t len;
1083:         char  *otherName;

1085:         PetscStrlen(name, &len);
1086:         PetscMalloc1(len + 16, &otherName);
1087:         PetscSNPrintf(otherName, len + 16, "%s: other-timed", name);
1088:         PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth + 1, otherName, totalTime, &childWasPrinted);
1089:         PetscFree(otherName);
1090:         if (childWasPrinted) PetscViewerXMLEndSection(viewer, "event");
1091:       } else {
1092:         /* Print the child with a recursive call to this function */
1093:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1094:       }
1095:     }
1096:     PetscViewerXMLEndSection(viewer, "events");
1097:     PetscFree(children);
1098:   }

1100:   if (wasPrinted) PetscViewerXMLEndSection(viewer, "event");
1101:   return 0;
1102: }

1104: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1105: {
1106:   int            i, nChildren;
1107:   PetscSortItem *children;
1108:   MPI_Comm       comm;

1110:   PetscObjectGetComm((PetscObject)viewer, &comm);

1112:   PetscLogNestedTreeGetChildrenCount(tree, nTimers, -1, 0, &nChildren);
1113:   PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, -1, 0, nChildren, &children);

1115:   if (nChildren > 0) {
1116:     /* Now sort the children on total time */
1117:     qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1118:     /* Print (or ignore) the children in ascending order of total time */
1119:     PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1120:     PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1121:     PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f");

1123:     for (i = 0; i < nChildren; i++) {
1124:       if ((children[i].val / totalTime) < THRESHOLD) {
1125:         /* ignored: no output */
1126:       } else {
1127:         /* Print the child with a recursive call to this function */
1128:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1129:       }
1130:     }
1131:     PetscViewerXMLEndSection(viewer, "timertree");
1132:     PetscFree(children);
1133:   }
1134:   return 0;
1135: }

1137: typedef struct {
1138:   char          *name;
1139:   PetscLogDouble time;
1140:   PetscLogDouble flops;
1141:   PetscLogDouble numMessages;
1142:   PetscLogDouble messageLength;
1143:   PetscLogDouble numReductions;
1144: } PetscSelfTimer;

1146: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1147: {
1148:   const int           stage = MAINSTAGE;
1149:   PetscStageLog       stageLog;
1150:   PetscEventRegInfo  *eventRegInfo;
1151:   PetscEventPerfInfo *eventPerfInfo;
1152:   PetscSelfTimer     *selftimes;
1153:   PetscSelfTimer     *totaltimes;
1154:   NestedEventId      *nstEvents;
1155:   int                 i, j, maxDefaultTimer;
1156:   NestedEventId       nst;
1157:   PetscLogEvent       dft;
1158:   int                 nstMax, nstMax_local;
1159:   MPI_Comm            comm;

1161:   PetscObjectGetComm((PetscObject)viewer, &comm);
1162:   PetscLogGetStageLog(&stageLog);
1163:   eventRegInfo  = stageLog->eventLog->eventInfo;
1164:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1166:   /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1167:   maxDefaultTimer = 0;
1168:   for (i = 0; i < nNestedEvents; i++) {
1169:     int            nParents  = nestedEvents[i].nParents;
1170:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1171:     for (j = 0; j < nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j], maxDefaultTimer);
1172:   }
1173:   PetscMalloc1(maxDefaultTimer + 1, &nstEvents);
1174:   for (dft = 0; dft < maxDefaultTimer; dft++) nstEvents[dft] = 0;
1175:   for (i = 0; i < nNestedEvents; i++) {
1176:     int            nParents  = nestedEvents[i].nParents;
1177:     NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1178:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1179:     for (j = 0; j < nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1180:   }

1182:   /* Calculate largest nested event-ID */
1183:   nstMax_local = 0;
1184:   for (i = 0; i < nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent, nstMax_local);
1185:   MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);

1187:   /* Initialize all total-times with zero */
1188:   PetscMalloc1(nstMax + 1, &selftimes);
1189:   PetscMalloc1(nstMax + 1, &totaltimes);
1190:   for (nst = 0; nst <= nstMax; nst++) {
1191:     totaltimes[nst].time          = 0;
1192:     totaltimes[nst].flops         = 0;
1193:     totaltimes[nst].numMessages   = 0;
1194:     totaltimes[nst].messageLength = 0;
1195:     totaltimes[nst].numReductions = 0;
1196:     totaltimes[nst].name          = NULL;
1197:   }

1199:   /* Calculate total-times */
1200:   for (i = 0; i < nNestedEvents; i++) {
1201:     const int            nParents  = nestedEvents[i].nParents;
1202:     const NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1203:     const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1204:     for (j = 0; j < nParents; j++) {
1205:       const PetscLogEvent dftEvent = dftEvents[j];
1206:       totaltimes[nstEvent].time += eventPerfInfo[dftEvent].time;
1207:       totaltimes[nstEvent].flops += eventPerfInfo[dftEvent].flops;
1208:       totaltimes[nstEvent].numMessages += eventPerfInfo[dftEvent].numMessages;
1209:       totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1210:       totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1211:     }
1212:     totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1213:   }

1215:   /* Initialize: self-times := totaltimes */
1216:   for (nst = 0; nst <= nstMax; nst++) selftimes[nst] = totaltimes[nst];

1218:   /* Subtract timed subprocesses from self-times */
1219:   for (i = 0; i < nNestedEvents; i++) {
1220:     const int            nParents         = nestedEvents[i].nParents;
1221:     const PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1222:     const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1223:     for (j = 0; j < nParents; j++) {
1224:       if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1225:         const PetscLogEvent dftEvent  = dftEvents[j];
1226:         const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1227:         selftimes[nstParent].time -= eventPerfInfo[dftEvent].time;
1228:         selftimes[nstParent].flops -= eventPerfInfo[dftEvent].flops;
1229:         selftimes[nstParent].numMessages -= eventPerfInfo[dftEvent].numMessages;
1230:         selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1231:         selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1232:       }
1233:     }
1234:   }

1236:   PetscFree(nstEvents);
1237:   PetscFree(totaltimes);

1239:   /* Set outputs */
1240:   *p_self   = selftimes;
1241:   *p_nstMax = nstMax;
1242:   return 0;
1243: }

1245: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1246: {
1247:   int                i;
1248:   NestedEventId      nst;
1249:   PetscSortItem     *sortSelfTimes;
1250:   PetscLogDouble    *times, *maxTimes;
1251:   PetscStageLog      stageLog;
1252:   PetscEventRegInfo *eventRegInfo;
1253:   const int          dum_depth = 1, dum_count = 1, dum_parentcount = 1;
1254:   PetscBool          wasPrinted;
1255:   MPI_Comm           comm;

1257:   PetscObjectGetComm((PetscObject)viewer, &comm);
1258:   PetscLogGetStageLog(&stageLog);
1259:   eventRegInfo = stageLog->eventLog->eventInfo;

1261:   PetscMalloc1(nstMax + 1, &times);
1262:   PetscMalloc1(nstMax + 1, &maxTimes);
1263:   for (nst = 0; nst <= nstMax; nst++) times[nst] = selftimes[nst].time;
1264:   MPIU_Allreduce(times, maxTimes, nstMax + 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1265:   PetscFree(times);

1267:   PetscMalloc1(nstMax + 1, &sortSelfTimes);

1269:   /* Sort the self-timers on basis of the largest time needed */
1270:   for (nst = 0; nst <= nstMax; nst++) {
1271:     sortSelfTimes[nst].id  = nst;
1272:     sortSelfTimes[nst].val = maxTimes[nst];
1273:   }
1274:   PetscFree(maxTimes);
1275:   qsort(sortSelfTimes, nstMax + 1, sizeof(PetscSortItem), compareSortItems);

1277:   PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1278:   PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");

1280:   for (i = 0; i <= nstMax; i++) {
1281:     if ((sortSelfTimes[i].val / totalTime) >= THRESHOLD) {
1282:       NestedEventId      nstEvent = sortSelfTimes[i].id;
1283:       const char        *name     = eventRegInfo[(PetscLogEvent)nstEvent].name;
1284:       PetscEventPerfInfo selfPerfInfo;

1286:       selfPerfInfo.time          = selftimes[nstEvent].time;
1287:       selfPerfInfo.flops         = selftimes[nstEvent].flops;
1288:       selfPerfInfo.numMessages   = selftimes[nstEvent].numMessages;
1289:       selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1290:       selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;

1292:       PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1293:       if (wasPrinted) PetscViewerXMLEndSection(viewer, "event");
1294:     }
1295:   }
1296:   PetscViewerXMLEndSection(viewer, "selftimertable");
1297:   PetscFree(sortSelfTimes);
1298:   return 0;
1299: }

1301: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1302: {
1303:   PetscLogDouble        locTotalTime, globTotalTime;
1304:   PetscNestedEventTree *tree       = NULL;
1305:   PetscSelfTimer       *selftimers = NULL;
1306:   int                   nTimers = 0, nstMax = 0;
1307:   MPI_Comm              comm;

1309:   PetscObjectGetComm((PetscObject)viewer, &comm);
1310:   PetscViewerInitASCII_XML(viewer);
1311:   PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1312:   PetscViewerXMLStartSection(viewer, "petscroot", NULL);

1314:   /* Get the total elapsed time, local and global maximum */
1315:   PetscTime(&locTotalTime);
1316:   locTotalTime -= petsc_BaseTime;
1317:   MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

1319:   /* Print global information about this run */
1320:   PetscPrintExeSpecs(viewer);
1321:   PetscPrintGlobalPerformance(viewer, locTotalTime);
1322:   /* Collect nested timer tree info from all processes */
1323:   PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1324:   PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime);
1325:   PetscLogNestedTreeDestroy(tree, nTimers);

1327:   /* Calculate self-time for all (not-nested) events */
1328:   PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1329:   PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1330:   PetscFree(selftimers);

1332:   PetscViewerXMLEndSection(viewer, "petscroot");
1333:   PetscViewerFinalASCII_XML(viewer);
1334:   return 0;
1335: }

1337: /*
1338:  * Get the name of a nested event.
1339:  */
1340: static PetscErrorCode PetscGetNestedEventName(const PetscNestedEventTree *tree, int id, char **name)
1341: {
1342:   PetscStageLog stageLog;

1344:   PetscLogGetStageLog(&stageLog);
1345:   *name = stageLog->eventLog->eventInfo[(PetscLogEvent)tree[id].nstEvent].name;
1346:   return 0;
1347: }

1349: /*
1350:  * Get the total time elapsed.
1351:  */
1352: static PetscErrorCode PetscGetTotalTime(const PetscViewer viewer, PetscLogDouble *totalTime)
1353: {
1354:   PetscLogDouble locTotalTime;
1355:   MPI_Comm       comm;

1357:   PetscObjectGetComm((PetscObject)viewer, &comm);
1358:   PetscTime(&locTotalTime);
1359:   locTotalTime -= petsc_BaseTime;
1360:   MPIU_Allreduce(&locTotalTime, totalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1361:   return 0;
1362: }

1364: /*
1365:  * Write a line to the flame graph output and then recurse into child events.
1366:  */
1367: static PetscErrorCode PetscLogNestedTreePrintFlamegraph(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime, PetscIntStack eventStack)
1368: {
1369:   int                depth = tree[iStart].depth, parentCount = 1, i, nChildren;
1370:   char              *name       = NULL;
1371:   PetscEventPerfInfo myPerfInfo = {0}, selfPerfInfo = {0}, otherPerfInfo = {0};
1372:   PetscLogDouble     countsPerCall = 0, locTime, globTime;
1373:   PetscSortItem     *children;
1374:   PetscStageLog      stageLog;
1375:   MPI_Comm           comm;

1377:   PetscLogGetStageLog(&stageLog);
1378:   PetscObjectGetComm((PetscObject)viewer, &comm);

1380:   /* Determine information about the child events as well as 'self' and 'other' */
1381:   PetscLogNestedTreeGetChildrenCount(tree, nTimers, iStart, depth, &nChildren);
1382:   PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, iStart, depth, nChildren, &children);
1383:   PetscLogNestedTreeSetSelfOtherPerfInfo(tree, iStart, totalTime, children, nChildren, &myPerfInfo, &selfPerfInfo, &otherPerfInfo, &parentCount, &countsPerCall);

1385:   /* Write line to the file. The time shown is 'self' + 'other' because each entry in the output
1386:    * is the total time spent in the event minus the amount spent in child events. */
1387:   locTime = selfPerfInfo.time + otherPerfInfo.time;
1388:   MPIU_Allreduce(&locTime, &globTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1389:   if (globTime / totalTime > THRESHOLD && tree[iStart].own) {
1390:     /* Iterate over parent events in the stack and write them */
1391:     for (i = 0; i <= eventStack->top; i++) {
1392:       PetscGetNestedEventName(tree, eventStack->stack[i], &name);
1393:       PetscViewerASCIIPrintf(viewer, "%s;", name);
1394:     }
1395:     PetscGetNestedEventName(tree, iStart, &name);
1396:     /* The output is given as an integer in microseconds because otherwise the file cannot be read
1397:      * by apps such as speedscope (https://speedscope.app/). */
1398:     PetscViewerASCIIPrintf(viewer, "%s %" PetscInt64_FMT "\n", name, (PetscInt64)(globTime * 1e6));
1399:   }

1401:   /* Add the current event to the parent stack and write the child events */
1402:   PetscIntStackPush(eventStack, iStart);
1403:   for (i = 0; i < nChildren; i++) PetscLogNestedTreePrintFlamegraph(viewer, tree, nTimers, children[i].id, totalTime, eventStack);
1404:   /* Pop the top item from the stack and immediately discard it */
1405:   {
1406:     int tmp;
1407:     PetscIntStackPop(eventStack, &tmp);
1408:   }
1409:   return 0;
1410: }

1412: /*
1413:  * Print nested logging information to a file suitable for reading into a Flame Graph.
1414:  *
1415:  * The format consists of a semicolon-separated list of events and the event duration in microseconds (which must be an integer).
1416:  * An example output would look like:
1417:  *   MatAssemblyBegin 1
1418:  *   MatAssemblyEnd 10
1419:  *   MatView 302
1420:  *   KSPSetUp 98
1421:  *   KSPSetUp;VecSet 5
1422:  *   KSPSolve 150
1423:  *
1424:  * This option may be requested from the command line by passing in the flag `-log_view :<somefile>.txt:ascii_flamegraph`.
1425:  */
1426: PetscErrorCode PetscLogView_Flamegraph(PetscViewer viewer)
1427: {
1428:   int                   nTimers = 0, i, nChildren;
1429:   PetscIntStack         eventStack;
1430:   PetscLogDouble        totalTime;
1431:   PetscNestedEventTree *tree = NULL;
1432:   PetscSortItem        *children;

1434:   PetscGetTotalTime(viewer, &totalTime);
1435:   PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1436:   /* We use an integer stack to keep track of parent event IDs */
1437:   PetscIntStackCreate(&eventStack);

1439:   /* Initialize the child events and write them recursively */
1440:   PetscLogNestedTreeGetChildrenCount(tree, nTimers, -1, 0, &nChildren);
1441:   PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, -1, 0, nChildren, &children);
1442:   for (i = 0; i < nChildren; i++) PetscLogNestedTreePrintFlamegraph(viewer, tree, nTimers, children[i].id, totalTime, eventStack);

1444:   PetscLogNestedTreeDestroy(tree, nTimers);
1445:   PetscIntStackDestroy(eventStack);
1446:   return 0;
1447: }

1449: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1450: {
1451:   #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1452:   #endif

1454:   petsc_send_ct++;
1455:   #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1456:   PetscMPITypeSize(count, MPI_Type_f2c((MPI_Fint)datatype), &petsc_send_len);
1457:   #endif
1458:   return 0;
1459: }

1461: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1462: {
1463:   #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1464:   #endif

1466:   petsc_recv_ct++;
1467:   #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1468:   PetscMPITypeSize(count, MPI_Type_f2c((MPI_Fint)datatype), &petsc_recv_len);
1469:   #endif
1470:   return 0;
1471: }

1473: PETSC_EXTERN PetscErrorCode PetscAReduce(void)
1474: {
1475:   petsc_allreduce_ct++;
1476:   return 0;
1477: }

1479: #endif