Actual source code: xmllogevent.c
1: /*************************************************************************************
2: * M A R I T I M E R E S E A R C H I N S T I T U T E N E T H E R L A N D S *
3: *************************************************************************************
4: * authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij *
5: *************************************************************************************
6: * content: Support for nested PetscTimers *
7: *************************************************************************************/
8: #include <petsclog.h>
9: #include <petsc/private/logimpl.h>
10: #include <petsctime.h>
11: #include <petscviewer.h>
12: #include "../src/sys/logging/xmlviewer.h"
14: #if defined(PETSC_USE_LOG)
16: /*
17: * Support for nested PetscTimers
18: *
19: * PetscTimers keep track of a lot of useful information: Wall clock times,
20: * message passing statistics, flop counts. Information about the nested structure
21: * of the timers is lost. Example:
22: *
23: * 7:30 Start: awake
24: * 7:30 Start: morning routine
25: * 7:40 Start: eat
26: * 7:49 Done: eat
27: * 7:43 Done: morning routine
28: * 8:15 Start: work
29: * 12:15 Start: eat
30: * 12:45 Done: eat
31: * 16:00 Done: work
32: * 16:30 Start: evening routine
33: * 18:30 Start: eat
34: * 19:15 Done: eat
35: * 22:00 Done: evening routine
36: * 22:00 Done: awake
37: *
38: * Petsc timers provide the following timer results:
39: *
40: * awake: 1 call 14:30 hours
41: * morning routine: 1 call 0:13 hours
42: * eat: 3 calls 1:24 hours
43: * work: 1 call 7:45 hours
44: * evening routine 1 call 5:30 hours
45: *
46: * Nested timers can be used to get the following table:
47: *
48: * [1 call]: awake 14:30 hours
49: * [1 call]: morning routine 0:13 hours ( 2 % of awake)
50: * [1 call]: eat 0:09 hours (69 % of morning routine)
51: * rest (morning routine) 0:04 hours (31 % of morning routine)
52: * [1 call]: work 7:45 hours (53 % of awake)
53: * [1 call]: eat 0:30 hours ( 6 % of work)
54: * rest (work) 7:15 hours (94 % of work)
55: * [1 call]: evening routine 5:30 hours (38 % of awake)
56: * [1 call]: eat 0:45 hours (14 % of evening routine)
57: * rest (evening routine) 4:45 hours (86 % of morning routine)
58: *
59: * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
60: * the nested timers make the stages unnecessary.
61: *
62: */
64: /*
65: * Data structures for keeping track of nested timers:
66: *
67: * nestedEvents: information about the timers that have actually been activated
68: * dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
69: *
70: * The Default-timers are used to time the nested timers. Every nested timer corresponds to
71: * (one or more) default timers, where one of the default timers has the same event-id as the
72: * nested one.
73: *
74: * Because of the risk of confusion between nested timer ids and default timer ids, we
75: * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
76: * only for default events. Also, all nested event variables are prepended with 'nst', and
77: * default timers with 'dft'.
78: */
80: #define DFT_ID_AWAKE -1
82: typedef PetscLogEvent NestedEventId;
83: typedef struct {
84: NestedEventId nstEvent; /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
85: int nParents; /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
86: PetscLogEvent *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
87: PetscLogEvent *dftEvents; /* The default timers which represent the different 'instances' of this nested event */
89: PetscLogEvent *dftParents; /* The default timers which were the dftParentActive when this nested event was started */
90: PetscLogEvent *dftEventsSorted; /* The default timers which represent the different 'instances' of this nested event */
91: } PetscNestedEvent;
93: static PetscLogEvent dftParentActive = DFT_ID_AWAKE;
94: static int nNestedEvents = 0;
95: static int nNestedEventsAllocated = 0;
96: static PetscNestedEvent *nestedEvents = NULL;
97: static PetscLogDouble thresholdTime = 0.01; /* initial value was 0.1 */
99: #define THRESHOLD (thresholdTime / 100.0 + 1e-12)
101: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
102: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);
104: PETSC_INTERN PetscErrorCode PetscLogView_Flamegraph(PetscViewer);
106: /*@C
107: PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
108: rates and object creation and should not slow programs down too much.
110: Logically Collective over `PETSC_COMM_WORLD`
112: Options Database Keys:
113: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file
115: Usage:
116: .vb
117: PetscInitialize(...);
118: PetscLogNestedBegin();
119: ... code ...
120: PetscLogView(viewer);
121: PetscFinalize();
122: .ve
124: Level: advanced
126: .seealso: `PetscLogDump()`, `PetscLogAllBegin()`, `PetscLogView()`, `PetscLogTraceBegin()`, `PetscLogDefaultBegin()`
127: @*/
128: PetscErrorCode PetscLogNestedBegin(void)
129: {
132: nNestedEventsAllocated = 10;
133: PetscMalloc1(nNestedEventsAllocated, &nestedEvents);
134: dftParentActive = DFT_ID_AWAKE;
135: nNestedEvents = 1;
137: /* 'Awake' is nested event 0. It has no parents */
138: nestedEvents[0].nstEvent = 0;
139: nestedEvents[0].nParents = 0;
140: nestedEvents[0].dftParentsSorted = NULL;
141: nestedEvents[0].dftEvents = NULL;
142: nestedEvents[0].dftParents = NULL;
143: nestedEvents[0].dftEventsSorted = NULL;
145: PetscLogSet(PetscLogEventBeginNested, PetscLogEventEndNested);
146: return 0;
147: }
149: /* Delete the data structures for the nested timers */
150: PetscErrorCode PetscLogNestedEnd(void)
151: {
152: int i;
154: if (!nestedEvents) return 0;
155: for (i = 0; i < nNestedEvents; i++) PetscFree4(nestedEvents[i].dftParentsSorted, nestedEvents[i].dftEventsSorted, nestedEvents[i].dftParents, nestedEvents[i].dftEvents);
156: PetscFree(nestedEvents);
157: nestedEvents = NULL;
158: nNestedEvents = 0;
159: nNestedEventsAllocated = 0;
160: return 0;
161: }
163: /*
164: UTILITIES: FIND STUFF IN SORTED ARRAYS
166: dftIndex - index to be found
167: dftArray - sorted array of PetscLogEvent-ids
168: narray - dimension of dftArray
169: entry - entry in the array where dftIndex may be found;
171: if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
172: In that case, the dftIndex can be inserted at this entry.
173: */
174: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex, const PetscLogEvent *dftArray, int narray, int *entry)
175: {
176: if (narray == 0 || dftIndex <= dftArray[0]) {
177: *entry = 0;
178: } else if (dftIndex > dftArray[narray - 1]) {
179: *entry = narray;
180: } else {
181: int ihigh = narray - 1, ilow = 0;
182: while (ihigh > ilow) {
183: const int imiddle = (ihigh + ilow) / 2;
184: if (dftArray[imiddle] > dftIndex) {
185: ihigh = imiddle;
186: } else if (dftArray[imiddle] < dftIndex) {
187: ilow = imiddle + 1;
188: } else {
189: ihigh = imiddle;
190: ilow = imiddle;
191: }
192: }
193: *entry = ihigh;
194: }
195: return 0;
196: }
198: /*
199: Utility: find the nested event with given identification
201: nstEvent - Nested event to be found
202: entry - entry in the nestedEvents where nstEvent may be found;
204: if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray
205: */
206: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent, int *entry)
207: {
208: if (nNestedEvents == 0 || nstEvent <= nestedEvents[0].nstEvent) {
209: *entry = 0;
210: } else if (nstEvent > nestedEvents[nNestedEvents - 1].nstEvent) {
211: *entry = nNestedEvents;
212: } else {
213: int ihigh = nNestedEvents - 1, ilow = 0;
214: while (ihigh > ilow) {
215: const int imiddle = (ihigh + ilow) / 2;
216: if (nestedEvents[imiddle].nstEvent > nstEvent) {
217: ihigh = imiddle;
218: } else if (nestedEvents[imiddle].nstEvent < nstEvent) {
219: ilow = imiddle + 1;
220: } else {
221: ihigh = imiddle;
222: ilow = imiddle;
223: }
224: }
225: *entry = ihigh;
226: }
227: return 0;
228: }
230: /*
231: Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
232: Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
233: */
235: #define MAINSTAGE 0
237: static PetscLogStage savedStage = 0;
239: static inline PetscErrorCode PetscLogStageOverride(void)
240: {
241: PetscStageLog stageLog = petsc_stageLog;
243: if (stageLog->curStage == MAINSTAGE) return 0;
244: savedStage = stageLog->curStage;
245: stageLog->curStage = MAINSTAGE;
246: PetscIntStackPush(stageLog->stack, MAINSTAGE);
247: return 0;
248: }
250: static inline PetscErrorCode PetscLogStageRestore(void)
251: {
252: PetscStageLog stageLog = petsc_stageLog;
254: if (savedStage == MAINSTAGE) return 0;
255: stageLog->curStage = savedStage;
256: PetscIntStackPop(stageLog->stack, &savedStage);
257: return 0;
258: }
260: /******************************************************************************************/
261: /* Start a nested event */
262: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
263: {
264: int entry, pentry, tentry, i;
265: PetscLogEvent dftEvent;
267: PetscLogEventFindNestedTimer(nstEvent, &entry);
268: if (entry >= nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
269: /* Nested event doesn't exist yet: create it */
271: if (nNestedEvents == nNestedEventsAllocated) {
272: /* Enlarge and re-allocate nestedEvents if needed */
273: PetscNestedEvent *tmp = nestedEvents;
274: PetscMalloc1(2 * nNestedEvents, &nestedEvents);
275: nNestedEventsAllocated *= 2;
276: PetscArraycpy(nestedEvents, tmp, nNestedEvents);
277: PetscFree(tmp);
278: }
280: /* Clear space in nestedEvents for new nested event */
281: nNestedEvents++;
282: for (i = nNestedEvents - 1; i > entry; i--) nestedEvents[i] = nestedEvents[i - 1];
284: /* Create event in nestedEvents */
285: nestedEvents[entry].nstEvent = nstEvent;
286: nestedEvents[entry].nParents = 1;
287: PetscMalloc4(1, &nestedEvents[entry].dftParentsSorted, 1, &nestedEvents[entry].dftEventsSorted, 1, &nestedEvents[entry].dftParents, 1, &nestedEvents[entry].dftEvents);
289: /* Fill in new event */
290: pentry = 0;
291: dftEvent = (PetscLogEvent)nstEvent;
293: nestedEvents[entry].nstEvent = nstEvent;
294: nestedEvents[entry].dftParents[pentry] = dftParentActive;
295: nestedEvents[entry].dftEvents[pentry] = dftEvent;
296: nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
297: nestedEvents[entry].dftEventsSorted[pentry] = dftEvent;
299: } else {
300: /* Nested event exists: find current dftParentActive among parents */
301: PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
302: PetscLogEvent *dftEvents = nestedEvents[entry].dftEvents;
303: int nParents = nestedEvents[entry].nParents;
305: PetscLogEventFindDefaultTimer(dftParentActive, dftParentsSorted, nParents, &pentry);
307: if (pentry >= nParents || dftParentActive != dftParentsSorted[pentry]) {
308: /* dftParentActive not in the list: add it to the list */
309: int i;
310: PetscLogEvent *dftParents = nestedEvents[entry].dftParents;
311: PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
312: char name[100];
314: /* Register a new default timer */
315: sprintf(name, "%d -> %d", (int)dftParentActive, (int)nstEvent);
316: PetscLogEventRegister(name, 0, &dftEvent);
317: PetscLogEventFindDefaultTimer(dftEvent, dftEventsSorted, nParents, &tentry);
319: /* Reallocate parents and dftEvents to make space for new parent */
320: PetscMalloc4(1 + nParents, &nestedEvents[entry].dftParentsSorted, 1 + nParents, &nestedEvents[entry].dftEventsSorted, 1 + nParents, &nestedEvents[entry].dftParents, 1 + nParents, &nestedEvents[entry].dftEvents);
321: PetscArraycpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents);
322: PetscArraycpy(nestedEvents[entry].dftEventsSorted, dftEventsSorted, nParents);
323: PetscArraycpy(nestedEvents[entry].dftParents, dftParents, nParents);
324: PetscArraycpy(nestedEvents[entry].dftEvents, dftEvents, nParents);
325: PetscFree4(dftParentsSorted, dftEventsSorted, dftParents, dftEvents);
327: dftParents = nestedEvents[entry].dftParents;
328: dftEvents = nestedEvents[entry].dftEvents;
329: dftParentsSorted = nestedEvents[entry].dftParentsSorted;
330: dftEventsSorted = nestedEvents[entry].dftEventsSorted;
332: nestedEvents[entry].nParents++;
333: nParents++;
335: for (i = nParents - 1; i > pentry; i--) {
336: dftParentsSorted[i] = dftParentsSorted[i - 1];
337: dftEvents[i] = dftEvents[i - 1];
338: }
339: for (i = nParents - 1; i > tentry; i--) {
340: dftParents[i] = dftParents[i - 1];
341: dftEventsSorted[i] = dftEventsSorted[i - 1];
342: }
344: /* Fill in the new default timer */
345: dftParentsSorted[pentry] = dftParentActive;
346: dftEvents[pentry] = dftEvent;
347: dftParents[tentry] = dftParentActive;
348: dftEventsSorted[tentry] = dftEvent;
350: } else {
351: /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
352: dftEvent = nestedEvents[entry].dftEvents[pentry];
353: }
354: }
356: /* Start the default 'dftEvent'-timer and update the dftParentActive */
357: PetscLogStageOverride();
358: PetscLogEventBeginDefault(dftEvent, t, o1, o2, o3, o4);
359: PetscLogStageRestore();
360: dftParentActive = dftEvent;
361: return 0;
362: }
364: /* End a nested event */
365: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
366: {
367: int entry, pentry, nParents;
368: PetscLogEvent *dftEventsSorted;
370: /* Find the nested event */
371: PetscLogEventFindNestedTimer(nstEvent, &entry);
374: dftEventsSorted = nestedEvents[entry].dftEventsSorted;
375: nParents = nestedEvents[entry].nParents;
377: /* Find the current default timer among the 'dftEvents' of this event */
378: PetscLogEventFindDefaultTimer(dftParentActive, dftEventsSorted, nParents, &pentry);
383: /* Stop the default timer and update the dftParentActive */
384: PetscLogStageOverride();
385: PetscLogEventEndDefault(dftParentActive, t, o1, o2, o3, o4);
386: PetscLogStageRestore();
387: dftParentActive = nestedEvents[entry].dftParents[pentry];
388: return 0;
389: }
391: /*@
392: PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
393: that takes 1 or more percent of the time.
395: Logically Collective over `PETSC_COMM_WORLD`
397: Input Parameter:
398: . newThresh - the threshold to use
400: Output Parameter:
401: . oldThresh - the previously set threshold value
403: Options Database Keys:
404: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file
406: Usage:
407: .vb
408: PetscInitialize(...);
409: PetscLogNestedBegin();
410: PetscLogSetThreshold(0.1,&oldthresh);
411: ... code ...
412: PetscLogView(viewer);
413: PetscFinalize();
414: .ve
416: Level: advanced
418: .seealso: `PetscLogDump()`, `PetscLogAllBegin()`, `PetscLogView()`, `PetscLogTraceBegin()`, `PetscLogDefaultBegin()`,
419: `PetscLogNestedBegin()`
420: @*/
421: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
422: {
423: if (oldThresh) *oldThresh = thresholdTime;
424: if (newThresh == PETSC_DECIDE) newThresh = 0.01;
425: if (newThresh == PETSC_DEFAULT) newThresh = 0.01;
426: thresholdTime = PetscMax(newThresh, 0.0);
427: return 0;
428: }
430: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
431: {
432: char arch[128], hostname[128], username[128], pname[PETSC_MAX_PATH_LEN], date[128];
433: char version[256], buildoptions[128] = "";
434: PetscMPIInt size;
435: size_t len;
437: MPI_Comm_size(PetscObjectComm((PetscObject)viewer), &size);
438: PetscGetArchType(arch, sizeof(arch));
439: PetscGetHostName(hostname, sizeof(hostname));
440: PetscGetUserName(username, sizeof(username));
441: PetscGetProgramName(pname, sizeof(pname));
442: PetscGetDate(date, sizeof(date));
443: PetscGetVersion(version, sizeof(version));
445: PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
446: PetscViewerXMLPutString(viewer, "executable", "Executable", pname);
447: PetscViewerXMLPutString(viewer, "architecture", "Architecture", arch);
448: PetscViewerXMLPutString(viewer, "hostname", "Host", hostname);
449: PetscViewerXMLPutInt(viewer, "nprocesses", "Number of processes", size);
450: PetscViewerXMLPutString(viewer, "user", "Run by user", username);
451: PetscViewerXMLPutString(viewer, "date", "Started at", date);
452: PetscViewerXMLPutString(viewer, "petscrelease", "Petsc Release", version);
454: if (PetscDefined(USE_DEBUG)) PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions));
455: if (PetscDefined(USE_COMPLEX)) PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions));
456: if (PetscDefined(USE_REAL_SINGLE)) {
457: PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions));
458: } else if (PetscDefined(USE_REAL___FLOAT128)) {
459: PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions));
460: } else if (PetscDefined(USE_REAL___FP16)) {
461: PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions));
462: }
463: if (PetscDefined(USE_64BIT_INDICES)) PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions));
464: #if defined(__cplusplus)
465: PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions));
466: #endif
467: PetscStrlen(buildoptions, &len);
468: if (len) PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
469: PetscViewerXMLEndSection(viewer, "runspecification");
470: return 0;
471: }
473: /* Print the global performance: max, max/min, average and total of
474: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
475: */
476: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble local_val, const PetscBool print_average, const PetscBool print_total)
477: {
478: PetscLogDouble min, tot, ratio, avg;
479: MPI_Comm comm;
480: PetscMPIInt rank, size;
481: PetscLogDouble valrank[2], max[2];
483: PetscObjectGetComm((PetscObject)viewer, &comm);
484: MPI_Comm_size(PetscObjectComm((PetscObject)viewer), &size);
485: MPI_Comm_rank(comm, &rank);
487: valrank[0] = local_val;
488: valrank[1] = (PetscLogDouble)rank;
489: MPIU_Allreduce(&local_val, &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
490: MPIU_Allreduce(valrank, &max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
491: MPIU_Allreduce(&local_val, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
492: avg = tot / ((PetscLogDouble)size);
493: if (min != 0.0) ratio = max[0] / min;
494: else ratio = 0.0;
496: PetscViewerXMLStartSection(viewer, name, desc);
497: PetscViewerXMLPutDouble(viewer, "max", NULL, max[0], "%e");
498: PetscViewerXMLPutInt(viewer, "maxrank", "rank at which max was found", (PetscMPIInt)max[1]);
499: PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
500: if (print_average) PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
501: if (print_total) PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
502: PetscViewerXMLEndSection(viewer, name);
503: return 0;
504: }
506: /* Print the global performance: max, max/min, average and total of
507: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
508: */
509: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
510: {
511: PetscLogDouble flops, mem, red, mess;
512: const PetscBool print_total_yes = PETSC_TRUE, print_total_no = PETSC_FALSE, print_average_no = PETSC_FALSE, print_average_yes = PETSC_TRUE;
514: /* Must preserve reduction count before we go on */
515: red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
517: /* Calculate summary information */
518: PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");
520: /* Time */
521: PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", locTotalTime, print_average_yes, print_total_no);
523: /* Objects */
524: PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", (PetscLogDouble)petsc_numObjects, print_average_yes, print_total_no);
526: /* Flop */
527: PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", petsc_TotalFlops / 1.0E6, print_average_yes, print_total_yes);
529: /* Flop/sec -- Must talk to Barry here */
530: if (locTotalTime != 0.0) flops = petsc_TotalFlops / locTotalTime;
531: else flops = 0.0;
532: PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", flops / 1.0E6, print_average_yes, print_total_yes);
534: /* Memory */
535: PetscMallocGetMaximumUsage(&mem);
536: if (mem > 0.0) PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", mem / 1024.0 / 1024.0, print_average_yes, print_total_yes);
537: /* Messages */
538: mess = 0.5 * (petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
539: PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", mess, print_average_yes, print_total_yes);
541: /* Message Volume */
542: mess = 0.5 * (petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
543: PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", mess / 1024.0 / 1024.0, print_average_yes, print_total_yes);
545: /* Reductions */
546: PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", red, print_average_no, print_total_no);
547: PetscViewerXMLEndSection(viewer, "globalperformance");
548: return 0;
549: }
551: typedef struct {
552: PetscLogEvent dftEvent;
553: NestedEventId nstEvent;
554: PetscLogEvent dftParent;
555: NestedEventId nstParent;
556: PetscBool own;
557: int depth;
558: NestedEventId *nstPath;
559: } PetscNestedEventTree;
561: /* Compare timers to sort them in the tree */
562: static int compareTreeItems(const void *item1_, const void *item2_)
563: {
564: int i;
565: PetscNestedEventTree *item1 = (PetscNestedEventTree *)item1_;
566: PetscNestedEventTree *item2 = (PetscNestedEventTree *)item2_;
568: for (i = 0; i < PetscMin(item1->depth, item2->depth); i++) {
569: if (item1->nstPath[i] < item2->nstPath[i]) return -1;
570: if (item1->nstPath[i] > item2->nstPath[i]) return +1;
571: }
572: if (item1->depth < item2->depth) return -1;
573: if (item1->depth > item2->depth) return 1;
574: return 0;
575: }
576: /*
577: * Do MPI communication to get the complete, nested calling tree for all processes: there may be
578: * calls that happen in some processes, but not in others.
579: *
580: * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
581: * The tree is sorted so that the timers can be printed in the order of appearance.
582: *
583: * For tree-items which appear in the trees of multiple processes (which will be most items), the
584: * following rule is followed:
585: * + if information from my own process is available, then that is the information stored in tree.
586: * otherwise it is some other process's information.
587: */
588: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
589: {
590: PetscNestedEventTree *tree = NULL, *newTree;
591: int *treeIndices;
592: int nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
593: int yesno;
594: PetscBool done;
595: int maxdepth;
596: int depth;
597: int illegalEvent;
598: int iextra;
599: NestedEventId *nstPath, *nstMyPath;
600: MPI_Comm comm;
602: PetscObjectGetComm((PetscObject)viewer, &comm);
604: /* Calculate memory needed to store everybody's information and allocate tree */
605: nTimers = 0;
606: for (i = 0; i < nNestedEvents; i++) nTimers += nestedEvents[i].nParents;
608: PetscMalloc1(nTimers, &tree);
610: /* Fill tree with readily available information */
611: iTimer0 = 0;
612: maxDefaultTimer = 0;
613: for (i = 0; i < nNestedEvents; i++) {
614: int nParents = nestedEvents[i].nParents;
615: NestedEventId nstEvent = nestedEvents[i].nstEvent;
616: PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
617: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
618: for (j = 0; j < nParents; j++) {
619: maxDefaultTimer = PetscMax(dftEvents[j], maxDefaultTimer);
621: tree[iTimer0 + j].dftEvent = dftEvents[j];
622: tree[iTimer0 + j].nstEvent = nstEvent;
623: tree[iTimer0 + j].dftParent = dftParentsSorted[j];
624: tree[iTimer0 + j].own = PETSC_TRUE;
626: tree[iTimer0 + j].nstParent = 0;
627: tree[iTimer0 + j].depth = 0;
628: tree[iTimer0 + j].nstPath = NULL;
629: }
630: iTimer0 += nParents;
631: }
633: /* Calculate the global maximum for the default timer index, so array treeIndices can
634: * be allocated only once */
635: MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
636: maxDefaultTimer = j;
638: /* Find default timer's place in the tree */
639: PetscCalloc1(maxDefaultTimer + 1, &treeIndices);
640: treeIndices[0] = 0;
641: for (i = 0; i < nTimers; i++) {
642: PetscLogEvent dftEvent = tree[i].dftEvent;
643: treeIndices[dftEvent] = i;
644: }
646: /* Find each dftParent's nested identification */
647: for (i = 0; i < nTimers; i++) {
648: PetscLogEvent dftParent = tree[i].dftParent;
649: if (dftParent != DFT_ID_AWAKE) {
650: int j = treeIndices[dftParent];
651: tree[i].nstParent = tree[j].nstEvent;
652: }
653: }
655: /* Find depths for each timer path */
656: done = PETSC_FALSE;
657: maxdepth = 0;
658: while (!done) {
659: done = PETSC_TRUE;
660: for (i = 0; i < nTimers; i++) {
661: if (tree[i].dftParent == DFT_ID_AWAKE) {
662: tree[i].depth = 1;
663: maxdepth = PetscMax(1, maxdepth);
664: } else {
665: int j = treeIndices[tree[i].dftParent];
666: depth = 1 + tree[j].depth;
667: if (depth > tree[i].depth) {
668: done = PETSC_FALSE;
669: tree[i].depth = depth;
670: maxdepth = PetscMax(depth, maxdepth);
671: }
672: }
673: }
674: }
676: /* Allocate the paths in the entire tree */
677: for (i = 0; i < nTimers; i++) {
678: depth = tree[i].depth;
679: PetscCalloc1(depth, &tree[i].nstPath);
680: }
682: /* Calculate the paths for all timers */
683: for (depth = 1; depth <= maxdepth; depth++) {
684: for (i = 0; i < nTimers; i++) {
685: if (tree[i].depth == depth) {
686: if (depth > 1) {
687: int j = treeIndices[tree[i].dftParent];
688: PetscArraycpy(tree[i].nstPath, tree[j].nstPath, depth - 1);
689: }
690: tree[i].nstPath[depth - 1] = tree[i].nstEvent;
691: }
692: }
693: }
694: PetscFree(treeIndices);
696: /* Sort the tree on basis of the paths */
697: qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);
699: /* Allocate an array to store paths */
700: depth = maxdepth;
701: MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
702: PetscMalloc1(maxdepth + 1, &nstPath);
703: PetscMalloc1(maxdepth + 1, &nstMyPath);
705: /* Find an illegal nested event index (1+largest nested event index) */
706: illegalEvent = 1 + nestedEvents[nNestedEvents - 1].nstEvent;
707: i = illegalEvent;
708: MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);
710: /* First, detect timers which are not available in this process, but are available in others
711: * Allocate a new tree, that can contain all timers
712: * Then, fill the new tree with all (own and not-own) timers */
713: newTree = NULL;
714: for (yesno = 0; yesno <= 1; yesno++) {
715: depth = 1;
716: i = 0;
717: iextra = 0;
718: while (depth > 0) {
719: int j;
720: PetscBool same;
722: /* Construct the next path in this process's tree:
723: * if necessary, supplement with invalid path entries */
724: depth++;
726: if (i < nTimers) {
727: for (j = 0; j < tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
728: for (j = tree[i].depth; j < depth; j++) nstMyPath[j] = illegalEvent;
729: } else {
730: for (j = 0; j < depth; j++) nstMyPath[j] = illegalEvent;
731: }
733: /* Communicate with other processes to obtain the next path and its depth */
734: MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
735: for (j = depth - 1; (int)j >= 0; j--) {
736: if (nstPath[j] == illegalEvent) depth = j;
737: }
739: if (depth > 0) {
740: /* If the path exists */
742: /* check whether the next path is the same as this process's next path */
743: same = PETSC_TRUE;
744: for (j = 0; same && j < depth; j++) same = (same && nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;
746: if (same) {
747: /* Register 'own path' */
748: if (newTree) newTree[i + iextra] = tree[i];
749: i++;
750: } else {
751: /* Register 'not an own path' */
752: if (newTree) {
753: newTree[i + iextra].nstEvent = nstPath[depth - 1];
754: newTree[i + iextra].own = PETSC_FALSE;
755: newTree[i + iextra].depth = depth;
756: PetscMalloc1(depth, &newTree[i + iextra].nstPath);
757: for (j = 0; j < depth; j++) newTree[i + iextra].nstPath[j] = nstPath[j];
759: newTree[i + iextra].dftEvent = 0;
760: newTree[i + iextra].dftParent = 0;
761: newTree[i + iextra].nstParent = 0;
762: }
763: iextra++;
764: }
765: }
766: }
768: /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
769: totalNTimers = nTimers + iextra;
770: if (!newTree) PetscMalloc1(totalNTimers, &newTree);
771: }
772: PetscFree(nstPath);
773: PetscFree(nstMyPath);
774: PetscFree(tree);
775: tree = newTree;
776: newTree = NULL;
778: /* Set return value and return */
779: *p_tree = tree;
780: *p_nTimers = totalNTimers;
781: return 0;
782: }
784: /*
785: * Delete the nested timer tree
786: */
787: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
788: {
789: int i;
791: for (i = 0; i < nTimers; i++) PetscFree(tree[i].nstPath);
792: PetscFree(tree);
793: return 0;
794: }
796: /* Print the global performance: max, max/min, average and total of
797: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
798: */
799: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer, const char *name, PetscLogDouble value, PetscLogDouble minthreshold, PetscLogDouble maxthreshold, PetscLogDouble minmaxtreshold)
800: {
801: MPI_Comm comm; /* MPI communicator in reduction */
802: PetscMPIInt rank; /* rank of this process */
803: PetscLogDouble val_in[2], max[2], min[2];
804: PetscLogDouble minvalue, maxvalue, tot;
805: PetscMPIInt size;
806: PetscMPIInt minLoc, maxLoc;
808: PetscObjectGetComm((PetscObject)viewer, &comm);
809: MPI_Comm_size(comm, &size);
810: MPI_Comm_rank(comm, &rank);
811: val_in[0] = value;
812: val_in[1] = (PetscLogDouble)rank;
813: MPIU_Allreduce(val_in, max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
814: MPIU_Allreduce(val_in, min, 1, MPIU_2PETSCLOGDOUBLE, MPI_MINLOC, comm);
815: maxvalue = max[0];
816: maxLoc = (PetscMPIInt)max[1];
817: minvalue = min[0];
818: minLoc = (PetscMPIInt)min[1];
819: MPIU_Allreduce(&value, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
821: if (maxvalue < maxthreshold && minvalue >= minthreshold) {
822: /* One call per parent or NO value: don't print */
823: } else {
824: PetscViewerXMLStartSection(viewer, name, NULL);
825: if (maxvalue > minvalue * minmaxtreshold) {
826: PetscViewerXMLPutDouble(viewer, "avgvalue", NULL, tot / size, "%g");
827: PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%g");
828: PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%g");
829: PetscViewerXMLPutInt(viewer, "minloc", NULL, minLoc);
830: PetscViewerXMLPutInt(viewer, "maxloc", NULL, maxLoc);
831: } else {
832: PetscViewerXMLPutDouble(viewer, "value", NULL, tot / size, "%g");
833: }
834: PetscViewerXMLEndSection(viewer, name);
835: }
836: return 0;
837: }
839: #define N_COMM 8
840: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer, PetscEventPerfInfo perfInfo, PetscLogDouble countsPerCall, int parentCount, int depth, const char *name, PetscLogDouble totalTime, PetscBool *isPrinted)
841: {
842: PetscLogDouble time = perfInfo.time;
843: PetscLogDouble timeMx;
844: MPI_Comm comm;
846: PetscObjectGetComm((PetscObject)viewer, &comm);
847: MPIU_Allreduce(&time, &timeMx, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
848: *isPrinted = ((timeMx / totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
849: if (*isPrinted) {
850: PetscViewerXMLStartSection(viewer, "event", NULL);
851: PetscViewerXMLPutString(viewer, "name", NULL, name);
852: PetscPrintXMLNestedLinePerfResults(viewer, "time", time / totalTime * 100.0, 0, 0, 1.02);
853: PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", parentCount > 0 ? countsPerCall : 0, 0.99, 1.01, 1.02);
854: PetscPrintXMLNestedLinePerfResults(viewer, "mflops", time >= timeMx * 0.001 ? 1e-6 * perfInfo.flops / time : 0, 0, 0.01, 1.05);
855: PetscPrintXMLNestedLinePerfResults(viewer, "mbps", time >= timeMx * 0.001 ? perfInfo.messageLength / (1024 * 1024 * time) : 0, 0, 0.01, 1.05);
856: PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", time >= timeMx * 0.001 ? perfInfo.numReductions / time : 0, 0, 0.01, 1.05);
857: }
858: return 0;
859: }
861: /* Count the number of times the parent event was called */
863: static int countParents(const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
864: {
865: if (tree[i].depth <= 1) {
866: return 1; /* Main event: only once */
867: } else if (!tree[i].own) {
868: return 1; /* This event didn't happen in this process, but did in another */
869: } else {
870: int iParent;
871: for (iParent = i - 1; iParent >= 0; iParent--) {
872: if (tree[iParent].depth == tree[i].depth - 1) break;
873: }
874: if (tree[iParent].depth != tree[i].depth - 1) {
875: /* ***** Internal error: cannot find parent */
876: return -2;
877: } else {
878: PetscLogEvent dftEvent = tree[iParent].dftEvent;
879: return eventPerfInfo[dftEvent].count;
880: }
881: }
882: }
884: typedef struct {
885: int id;
886: PetscLogDouble val;
887: } PetscSortItem;
889: static int compareSortItems(const void *item1_, const void *item2_)
890: {
891: PetscSortItem *item1 = (PetscSortItem *)item1_;
892: PetscSortItem *item2 = (PetscSortItem *)item2_;
893: if (item1->val > item2->val) return -1;
894: if (item1->val < item2->val) return +1;
895: return 0;
896: }
898: /*
899: * Find the number of child events.
900: */
901: static PetscErrorCode PetscLogNestedTreeGetChildrenCount(const PetscNestedEventTree *tree, int nTimers, int iStart, int depth, int *nChildren)
902: {
903: int n = 0;
905: for (int i = iStart + 1; i < nTimers; i++) {
906: if (tree[i].depth <= depth) break;
907: if (tree[i].depth == depth + 1) n++;
908: }
909: *nChildren = n;
910: return 0;
911: }
913: /*
914: * Initialize child event sort items with ID and times.
915: */
916: static PetscErrorCode PetscLogNestedTreeSetChildrenSortItems(const PetscViewer viewer, const PetscNestedEventTree *tree, int nTimers, int iStart, int depth, int nChildren, PetscSortItem **children)
917: {
918: MPI_Comm comm;
919: PetscLogDouble *times, *maxTimes;
920: PetscStageLog stageLog;
921: PetscEventPerfInfo *eventPerfInfo;
922: const int stage = MAINSTAGE;
924: PetscObjectGetComm((PetscObject)viewer, &comm);
925: PetscLogGetStageLog(&stageLog);
926: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
928: if (nChildren > 0) {
929: /* Create an array for the id-s and maxTimes of the children,
930: * leaving 2 spaces for self-time and other-time */
932: PetscMalloc1(nChildren + 2, children);
933: nChildren = 0;
934: for (int i = iStart + 1; i < nTimers; i++) {
935: if (tree[i].depth <= depth) break;
936: if (tree[i].depth == depth + 1) {
937: (*children)[nChildren].id = i;
938: (*children)[nChildren].val = eventPerfInfo[tree[i].dftEvent].time;
939: nChildren++;
940: }
941: }
943: /* Calculate the children's maximum times, to see whether children will be ignored or printed */
944: PetscMalloc1(nChildren, ×);
945: for (int i = 0; i < nChildren; i++) times[i] = (*children)[i].val;
947: PetscMalloc1(nChildren, &maxTimes);
948: MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
949: PetscFree(times);
951: for (int i = 0; i < nChildren; i++) (*children)[i].val = maxTimes[i];
952: PetscFree(maxTimes);
953: }
954: return 0;
955: }
957: /*
958: * Set 'self' and 'other' performance info.
959: */
960: static PetscErrorCode PetscLogNestedTreeSetSelfOtherPerfInfo(const PetscNestedEventTree *tree, int iStart, PetscLogDouble totalTime, const PetscSortItem *children, int nChildren, PetscEventPerfInfo *myPerfInfo, PetscEventPerfInfo *selfPerfInfo, PetscEventPerfInfo *otherPerfInfo, int *parentCount, PetscLogDouble *countsPerCall)
961: {
962: const int stage = MAINSTAGE;
963: PetscStageLog stageLog;
964: PetscEventPerfInfo *eventPerfInfo;
966: PetscLogGetStageLog(&stageLog);
967: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
968: if (!tree[iStart].own) {
969: /* Set values for a timer that was not activated in this process
970: * (but was, in other processes of this run) */
971: PetscMemzero(myPerfInfo, sizeof(*myPerfInfo));
973: *selfPerfInfo = *myPerfInfo;
974: *otherPerfInfo = *myPerfInfo;
976: *parentCount = 1;
977: *countsPerCall = 0;
978: } else {
979: /* Set the values for a timer that was activated in this process */
980: PetscLogEvent dftEvent = tree[iStart].dftEvent;
982: *parentCount = countParents(tree, eventPerfInfo, iStart);
983: *myPerfInfo = eventPerfInfo[dftEvent];
984: *countsPerCall = (PetscLogDouble)myPerfInfo->count / (PetscLogDouble)*parentCount;
986: *selfPerfInfo = *myPerfInfo;
987: otherPerfInfo->time = 0;
988: otherPerfInfo->flops = 0;
989: otherPerfInfo->numMessages = 0;
990: otherPerfInfo->messageLength = 0;
991: otherPerfInfo->numReductions = 0;
993: for (int i = 0; i < nChildren; i++) {
994: /* For all child counters: subtract the child values from self-timers */
996: PetscLogEvent dftChild = tree[children[i].id].dftEvent;
997: PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];
999: selfPerfInfo->time -= childPerfInfo.time;
1000: selfPerfInfo->flops -= childPerfInfo.flops;
1001: selfPerfInfo->numMessages -= childPerfInfo.numMessages;
1002: selfPerfInfo->messageLength -= childPerfInfo.messageLength;
1003: selfPerfInfo->numReductions -= childPerfInfo.numReductions;
1005: if ((children[i].val / totalTime) < THRESHOLD) {
1006: /* Add them to 'other' if the time is ignored in the output */
1007: otherPerfInfo->time += childPerfInfo.time;
1008: otherPerfInfo->flops += childPerfInfo.flops;
1009: otherPerfInfo->numMessages += childPerfInfo.numMessages;
1010: otherPerfInfo->messageLength += childPerfInfo.messageLength;
1011: otherPerfInfo->numReductions += childPerfInfo.numReductions;
1012: }
1013: }
1014: }
1015: return 0;
1016: }
1018: /*
1019: * Set max times across ranks for 'self' and 'other'.
1020: */
1021: static PetscErrorCode PetscLogNestedTreeSetMaxTimes(MPI_Comm comm, int nChildren, const PetscEventPerfInfo selfPerfInfo, const PetscEventPerfInfo otherPerfInfo, PetscSortItem *children)
1022: {
1023: PetscLogDouble times[2], maxTimes[2];
1025: times[0] = selfPerfInfo.time;
1026: times[1] = otherPerfInfo.time;
1028: MPIU_Allreduce(times, maxTimes, 2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1029: children[nChildren + 0].id = -1;
1030: children[nChildren + 0].val = maxTimes[0];
1031: children[nChildren + 1].id = -2;
1032: children[nChildren + 1].val = maxTimes[1];
1033: return 0;
1034: }
1036: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
1037: {
1038: int depth = tree[iStart].depth;
1039: const char *name;
1040: int parentCount = 1, nChildren;
1041: PetscSortItem *children;
1042: PetscStageLog stageLog;
1043: PetscEventRegInfo *eventRegInfo;
1044: PetscEventPerfInfo myPerfInfo = {0}, selfPerfInfo = {0}, otherPerfInfo = {0};
1045: PetscLogDouble countsPerCall = 0;
1046: PetscBool wasPrinted;
1047: PetscBool childWasPrinted;
1048: MPI_Comm comm;
1050: /* Look up the name of the event and its PerfInfo */
1051: PetscLogGetStageLog(&stageLog);
1052: eventRegInfo = stageLog->eventLog->eventInfo;
1053: name = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;
1054: PetscObjectGetComm((PetscObject)viewer, &comm);
1056: PetscLogNestedTreeGetChildrenCount(tree, nTimers, iStart, depth, &nChildren);
1057: PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, iStart, depth, nChildren, &children);
1058: PetscLogNestedTreeSetSelfOtherPerfInfo(tree, iStart, totalTime, children, nChildren, &myPerfInfo, &selfPerfInfo, &otherPerfInfo, &parentCount, &countsPerCall);
1060: /* Main output for this timer */
1061: PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);
1063: /* Now print the lines for the children */
1064: if (nChildren > 0) {
1065: int i;
1067: /* Calculate max-times for 'self' and 'other' */
1068: PetscLogNestedTreeSetMaxTimes(comm, nChildren, selfPerfInfo, otherPerfInfo, children);
1070: /* Now sort the children (including 'self' and 'other') on total time */
1071: qsort(children, nChildren + 2, sizeof(PetscSortItem), compareSortItems);
1073: /* Print (or ignore) the children in ascending order of total time */
1074: PetscViewerXMLStartSection(viewer, "events", NULL);
1075: for (i = 0; i < nChildren + 2; i++) {
1076: if ((children[i].val / totalTime) < THRESHOLD) {
1077: /* ignored: no output */
1078: } else if (children[i].id == -1) {
1079: PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth + 1, "self", totalTime, &childWasPrinted);
1080: if (childWasPrinted) PetscViewerXMLEndSection(viewer, "event");
1081: } else if (children[i].id == -2) {
1082: size_t len;
1083: char *otherName;
1085: PetscStrlen(name, &len);
1086: PetscMalloc1(len + 16, &otherName);
1087: PetscSNPrintf(otherName, len + 16, "%s: other-timed", name);
1088: PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth + 1, otherName, totalTime, &childWasPrinted);
1089: PetscFree(otherName);
1090: if (childWasPrinted) PetscViewerXMLEndSection(viewer, "event");
1091: } else {
1092: /* Print the child with a recursive call to this function */
1093: PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1094: }
1095: }
1096: PetscViewerXMLEndSection(viewer, "events");
1097: PetscFree(children);
1098: }
1100: if (wasPrinted) PetscViewerXMLEndSection(viewer, "event");
1101: return 0;
1102: }
1104: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1105: {
1106: int i, nChildren;
1107: PetscSortItem *children;
1108: MPI_Comm comm;
1110: PetscObjectGetComm((PetscObject)viewer, &comm);
1112: PetscLogNestedTreeGetChildrenCount(tree, nTimers, -1, 0, &nChildren);
1113: PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, -1, 0, nChildren, &children);
1115: if (nChildren > 0) {
1116: /* Now sort the children on total time */
1117: qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1118: /* Print (or ignore) the children in ascending order of total time */
1119: PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1120: PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1121: PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f");
1123: for (i = 0; i < nChildren; i++) {
1124: if ((children[i].val / totalTime) < THRESHOLD) {
1125: /* ignored: no output */
1126: } else {
1127: /* Print the child with a recursive call to this function */
1128: PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1129: }
1130: }
1131: PetscViewerXMLEndSection(viewer, "timertree");
1132: PetscFree(children);
1133: }
1134: return 0;
1135: }
1137: typedef struct {
1138: char *name;
1139: PetscLogDouble time;
1140: PetscLogDouble flops;
1141: PetscLogDouble numMessages;
1142: PetscLogDouble messageLength;
1143: PetscLogDouble numReductions;
1144: } PetscSelfTimer;
1146: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1147: {
1148: const int stage = MAINSTAGE;
1149: PetscStageLog stageLog;
1150: PetscEventRegInfo *eventRegInfo;
1151: PetscEventPerfInfo *eventPerfInfo;
1152: PetscSelfTimer *selftimes;
1153: PetscSelfTimer *totaltimes;
1154: NestedEventId *nstEvents;
1155: int i, j, maxDefaultTimer;
1156: NestedEventId nst;
1157: PetscLogEvent dft;
1158: int nstMax, nstMax_local;
1159: MPI_Comm comm;
1161: PetscObjectGetComm((PetscObject)viewer, &comm);
1162: PetscLogGetStageLog(&stageLog);
1163: eventRegInfo = stageLog->eventLog->eventInfo;
1164: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1166: /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1167: maxDefaultTimer = 0;
1168: for (i = 0; i < nNestedEvents; i++) {
1169: int nParents = nestedEvents[i].nParents;
1170: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1171: for (j = 0; j < nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j], maxDefaultTimer);
1172: }
1173: PetscMalloc1(maxDefaultTimer + 1, &nstEvents);
1174: for (dft = 0; dft < maxDefaultTimer; dft++) nstEvents[dft] = 0;
1175: for (i = 0; i < nNestedEvents; i++) {
1176: int nParents = nestedEvents[i].nParents;
1177: NestedEventId nstEvent = nestedEvents[i].nstEvent;
1178: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1179: for (j = 0; j < nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1180: }
1182: /* Calculate largest nested event-ID */
1183: nstMax_local = 0;
1184: for (i = 0; i < nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent, nstMax_local);
1185: MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);
1187: /* Initialize all total-times with zero */
1188: PetscMalloc1(nstMax + 1, &selftimes);
1189: PetscMalloc1(nstMax + 1, &totaltimes);
1190: for (nst = 0; nst <= nstMax; nst++) {
1191: totaltimes[nst].time = 0;
1192: totaltimes[nst].flops = 0;
1193: totaltimes[nst].numMessages = 0;
1194: totaltimes[nst].messageLength = 0;
1195: totaltimes[nst].numReductions = 0;
1196: totaltimes[nst].name = NULL;
1197: }
1199: /* Calculate total-times */
1200: for (i = 0; i < nNestedEvents; i++) {
1201: const int nParents = nestedEvents[i].nParents;
1202: const NestedEventId nstEvent = nestedEvents[i].nstEvent;
1203: const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1204: for (j = 0; j < nParents; j++) {
1205: const PetscLogEvent dftEvent = dftEvents[j];
1206: totaltimes[nstEvent].time += eventPerfInfo[dftEvent].time;
1207: totaltimes[nstEvent].flops += eventPerfInfo[dftEvent].flops;
1208: totaltimes[nstEvent].numMessages += eventPerfInfo[dftEvent].numMessages;
1209: totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1210: totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1211: }
1212: totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1213: }
1215: /* Initialize: self-times := totaltimes */
1216: for (nst = 0; nst <= nstMax; nst++) selftimes[nst] = totaltimes[nst];
1218: /* Subtract timed subprocesses from self-times */
1219: for (i = 0; i < nNestedEvents; i++) {
1220: const int nParents = nestedEvents[i].nParents;
1221: const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1222: const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1223: for (j = 0; j < nParents; j++) {
1224: if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1225: const PetscLogEvent dftEvent = dftEvents[j];
1226: const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1227: selftimes[nstParent].time -= eventPerfInfo[dftEvent].time;
1228: selftimes[nstParent].flops -= eventPerfInfo[dftEvent].flops;
1229: selftimes[nstParent].numMessages -= eventPerfInfo[dftEvent].numMessages;
1230: selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1231: selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1232: }
1233: }
1234: }
1236: PetscFree(nstEvents);
1237: PetscFree(totaltimes);
1239: /* Set outputs */
1240: *p_self = selftimes;
1241: *p_nstMax = nstMax;
1242: return 0;
1243: }
1245: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1246: {
1247: int i;
1248: NestedEventId nst;
1249: PetscSortItem *sortSelfTimes;
1250: PetscLogDouble *times, *maxTimes;
1251: PetscStageLog stageLog;
1252: PetscEventRegInfo *eventRegInfo;
1253: const int dum_depth = 1, dum_count = 1, dum_parentcount = 1;
1254: PetscBool wasPrinted;
1255: MPI_Comm comm;
1257: PetscObjectGetComm((PetscObject)viewer, &comm);
1258: PetscLogGetStageLog(&stageLog);
1259: eventRegInfo = stageLog->eventLog->eventInfo;
1261: PetscMalloc1(nstMax + 1, ×);
1262: PetscMalloc1(nstMax + 1, &maxTimes);
1263: for (nst = 0; nst <= nstMax; nst++) times[nst] = selftimes[nst].time;
1264: MPIU_Allreduce(times, maxTimes, nstMax + 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1265: PetscFree(times);
1267: PetscMalloc1(nstMax + 1, &sortSelfTimes);
1269: /* Sort the self-timers on basis of the largest time needed */
1270: for (nst = 0; nst <= nstMax; nst++) {
1271: sortSelfTimes[nst].id = nst;
1272: sortSelfTimes[nst].val = maxTimes[nst];
1273: }
1274: PetscFree(maxTimes);
1275: qsort(sortSelfTimes, nstMax + 1, sizeof(PetscSortItem), compareSortItems);
1277: PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1278: PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1280: for (i = 0; i <= nstMax; i++) {
1281: if ((sortSelfTimes[i].val / totalTime) >= THRESHOLD) {
1282: NestedEventId nstEvent = sortSelfTimes[i].id;
1283: const char *name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1284: PetscEventPerfInfo selfPerfInfo;
1286: selfPerfInfo.time = selftimes[nstEvent].time;
1287: selfPerfInfo.flops = selftimes[nstEvent].flops;
1288: selfPerfInfo.numMessages = selftimes[nstEvent].numMessages;
1289: selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1290: selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;
1292: PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1293: if (wasPrinted) PetscViewerXMLEndSection(viewer, "event");
1294: }
1295: }
1296: PetscViewerXMLEndSection(viewer, "selftimertable");
1297: PetscFree(sortSelfTimes);
1298: return 0;
1299: }
1301: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1302: {
1303: PetscLogDouble locTotalTime, globTotalTime;
1304: PetscNestedEventTree *tree = NULL;
1305: PetscSelfTimer *selftimers = NULL;
1306: int nTimers = 0, nstMax = 0;
1307: MPI_Comm comm;
1309: PetscObjectGetComm((PetscObject)viewer, &comm);
1310: PetscViewerInitASCII_XML(viewer);
1311: PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1312: PetscViewerXMLStartSection(viewer, "petscroot", NULL);
1314: /* Get the total elapsed time, local and global maximum */
1315: PetscTime(&locTotalTime);
1316: locTotalTime -= petsc_BaseTime;
1317: MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1319: /* Print global information about this run */
1320: PetscPrintExeSpecs(viewer);
1321: PetscPrintGlobalPerformance(viewer, locTotalTime);
1322: /* Collect nested timer tree info from all processes */
1323: PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1324: PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime);
1325: PetscLogNestedTreeDestroy(tree, nTimers);
1327: /* Calculate self-time for all (not-nested) events */
1328: PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1329: PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1330: PetscFree(selftimers);
1332: PetscViewerXMLEndSection(viewer, "petscroot");
1333: PetscViewerFinalASCII_XML(viewer);
1334: return 0;
1335: }
1337: /*
1338: * Get the name of a nested event.
1339: */
1340: static PetscErrorCode PetscGetNestedEventName(const PetscNestedEventTree *tree, int id, char **name)
1341: {
1342: PetscStageLog stageLog;
1344: PetscLogGetStageLog(&stageLog);
1345: *name = stageLog->eventLog->eventInfo[(PetscLogEvent)tree[id].nstEvent].name;
1346: return 0;
1347: }
1349: /*
1350: * Get the total time elapsed.
1351: */
1352: static PetscErrorCode PetscGetTotalTime(const PetscViewer viewer, PetscLogDouble *totalTime)
1353: {
1354: PetscLogDouble locTotalTime;
1355: MPI_Comm comm;
1357: PetscObjectGetComm((PetscObject)viewer, &comm);
1358: PetscTime(&locTotalTime);
1359: locTotalTime -= petsc_BaseTime;
1360: MPIU_Allreduce(&locTotalTime, totalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1361: return 0;
1362: }
1364: /*
1365: * Write a line to the flame graph output and then recurse into child events.
1366: */
1367: static PetscErrorCode PetscLogNestedTreePrintFlamegraph(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime, PetscIntStack eventStack)
1368: {
1369: int depth = tree[iStart].depth, parentCount = 1, i, nChildren;
1370: char *name = NULL;
1371: PetscEventPerfInfo myPerfInfo = {0}, selfPerfInfo = {0}, otherPerfInfo = {0};
1372: PetscLogDouble countsPerCall = 0, locTime, globTime;
1373: PetscSortItem *children;
1374: PetscStageLog stageLog;
1375: MPI_Comm comm;
1377: PetscLogGetStageLog(&stageLog);
1378: PetscObjectGetComm((PetscObject)viewer, &comm);
1380: /* Determine information about the child events as well as 'self' and 'other' */
1381: PetscLogNestedTreeGetChildrenCount(tree, nTimers, iStart, depth, &nChildren);
1382: PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, iStart, depth, nChildren, &children);
1383: PetscLogNestedTreeSetSelfOtherPerfInfo(tree, iStart, totalTime, children, nChildren, &myPerfInfo, &selfPerfInfo, &otherPerfInfo, &parentCount, &countsPerCall);
1385: /* Write line to the file. The time shown is 'self' + 'other' because each entry in the output
1386: * is the total time spent in the event minus the amount spent in child events. */
1387: locTime = selfPerfInfo.time + otherPerfInfo.time;
1388: MPIU_Allreduce(&locTime, &globTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1389: if (globTime / totalTime > THRESHOLD && tree[iStart].own) {
1390: /* Iterate over parent events in the stack and write them */
1391: for (i = 0; i <= eventStack->top; i++) {
1392: PetscGetNestedEventName(tree, eventStack->stack[i], &name);
1393: PetscViewerASCIIPrintf(viewer, "%s;", name);
1394: }
1395: PetscGetNestedEventName(tree, iStart, &name);
1396: /* The output is given as an integer in microseconds because otherwise the file cannot be read
1397: * by apps such as speedscope (https://speedscope.app/). */
1398: PetscViewerASCIIPrintf(viewer, "%s %" PetscInt64_FMT "\n", name, (PetscInt64)(globTime * 1e6));
1399: }
1401: /* Add the current event to the parent stack and write the child events */
1402: PetscIntStackPush(eventStack, iStart);
1403: for (i = 0; i < nChildren; i++) PetscLogNestedTreePrintFlamegraph(viewer, tree, nTimers, children[i].id, totalTime, eventStack);
1404: /* Pop the top item from the stack and immediately discard it */
1405: {
1406: int tmp;
1407: PetscIntStackPop(eventStack, &tmp);
1408: }
1409: return 0;
1410: }
1412: /*
1413: * Print nested logging information to a file suitable for reading into a Flame Graph.
1414: *
1415: * The format consists of a semicolon-separated list of events and the event duration in microseconds (which must be an integer).
1416: * An example output would look like:
1417: * MatAssemblyBegin 1
1418: * MatAssemblyEnd 10
1419: * MatView 302
1420: * KSPSetUp 98
1421: * KSPSetUp;VecSet 5
1422: * KSPSolve 150
1423: *
1424: * This option may be requested from the command line by passing in the flag `-log_view :<somefile>.txt:ascii_flamegraph`.
1425: */
1426: PetscErrorCode PetscLogView_Flamegraph(PetscViewer viewer)
1427: {
1428: int nTimers = 0, i, nChildren;
1429: PetscIntStack eventStack;
1430: PetscLogDouble totalTime;
1431: PetscNestedEventTree *tree = NULL;
1432: PetscSortItem *children;
1434: PetscGetTotalTime(viewer, &totalTime);
1435: PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1436: /* We use an integer stack to keep track of parent event IDs */
1437: PetscIntStackCreate(&eventStack);
1439: /* Initialize the child events and write them recursively */
1440: PetscLogNestedTreeGetChildrenCount(tree, nTimers, -1, 0, &nChildren);
1441: PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, -1, 0, nChildren, &children);
1442: for (i = 0; i < nChildren; i++) PetscLogNestedTreePrintFlamegraph(viewer, tree, nTimers, children[i].id, totalTime, eventStack);
1444: PetscLogNestedTreeDestroy(tree, nTimers);
1445: PetscIntStackDestroy(eventStack);
1446: return 0;
1447: }
1449: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1450: {
1451: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1452: #endif
1454: petsc_send_ct++;
1455: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1456: PetscMPITypeSize(count, MPI_Type_f2c((MPI_Fint)datatype), &petsc_send_len);
1457: #endif
1458: return 0;
1459: }
1461: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1462: {
1463: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1464: #endif
1466: petsc_recv_ct++;
1467: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1468: PetscMPITypeSize(count, MPI_Type_f2c((MPI_Fint)datatype), &petsc_recv_len);
1469: #endif
1470: return 0;
1471: }
1473: PETSC_EXTERN PetscErrorCode PetscAReduce(void)
1474: {
1475: petsc_allreduce_ct++;
1476: return 0;
1477: }
1479: #endif