Actual source code: gs.c


  2: /***********************************gs.c***************************************

  4: Author: Henry M. Tufo III

  6: e-mail: hmt@cs.brown.edu

  8: snail-mail:
  9: Division of Applied Mathematics
 10: Brown University
 11: Providence, RI 02912

 13: Last Modification:
 14: 6.21.97
 15: ************************************gs.c**************************************/

 17: /***********************************gs.c***************************************
 18: File Description:
 19: -----------------

 21: ************************************gs.c**************************************/

 23: #include <../src/ksp/pc/impls/tfs/tfs.h>

 25: /* default length of number of items via tree - doubles if exceeded */
 26: #define TREE_BUF_SZ 2048;
 27: #define GS_VEC_SZ   1

 29: /***********************************gs.c***************************************
 30: Type: struct gather_scatter_id
 31: ------------------------------

 33: ************************************gs.c**************************************/
 34: typedef struct gather_scatter_id {
 35:   PetscInt     id;
 36:   PetscInt     nel_min;
 37:   PetscInt     nel_max;
 38:   PetscInt     nel_sum;
 39:   PetscInt     negl;
 40:   PetscInt     gl_max;
 41:   PetscInt     gl_min;
 42:   PetscInt     repeats;
 43:   PetscInt     ordered;
 44:   PetscInt     positive;
 45:   PetscScalar *vals;

 47:   /* bit mask info */
 48:   PetscInt *my_proc_mask;
 49:   PetscInt  mask_sz;
 50:   PetscInt *ngh_buf;
 51:   PetscInt  ngh_buf_sz;
 52:   PetscInt *nghs;
 53:   PetscInt  num_nghs;
 54:   PetscInt  max_nghs;
 55:   PetscInt *pw_nghs;
 56:   PetscInt  num_pw_nghs;
 57:   PetscInt *tree_nghs;
 58:   PetscInt  num_tree_nghs;

 60:   PetscInt num_loads;

 62:   /* repeats == true -> local info */
 63:   PetscInt  nel;  /* number of unique elememts */
 64:   PetscInt *elms; /* of size nel */
 65:   PetscInt  nel_total;
 66:   PetscInt *local_elms; /* of size nel_total */
 67:   PetscInt *companion;  /* of size nel_total */

 69:   /* local info */
 70:   PetscInt   num_local_total;
 71:   PetscInt   local_strength;
 72:   PetscInt   num_local;
 73:   PetscInt  *num_local_reduce;
 74:   PetscInt **local_reduce;
 75:   PetscInt   num_local_gop;
 76:   PetscInt  *num_gop_local_reduce;
 77:   PetscInt **gop_local_reduce;

 79:   /* pairwise info */
 80:   PetscInt     level;
 81:   PetscInt     num_pairs;
 82:   PetscInt     max_pairs;
 83:   PetscInt     loc_node_pairs;
 84:   PetscInt     max_node_pairs;
 85:   PetscInt     min_node_pairs;
 86:   PetscInt     avg_node_pairs;
 87:   PetscInt    *pair_list;
 88:   PetscInt    *msg_sizes;
 89:   PetscInt   **node_list;
 90:   PetscInt     len_pw_list;
 91:   PetscInt    *pw_elm_list;
 92:   PetscScalar *pw_vals;

 94:   MPI_Request *msg_ids_in;
 95:   MPI_Request *msg_ids_out;

 97:   PetscScalar *out;
 98:   PetscScalar *in;
 99:   PetscInt     msg_total;

101:   /* tree - crystal accumulator info */
102:   PetscInt   max_left_over;
103:   PetscInt  *pre;
104:   PetscInt  *in_num;
105:   PetscInt  *out_num;
106:   PetscInt **in_list;
107:   PetscInt **out_list;

109:   /* new tree work*/
110:   PetscInt     tree_nel;
111:   PetscInt    *tree_elms;
112:   PetscScalar *tree_buf;
113:   PetscScalar *tree_work;

115:   PetscInt  tree_map_sz;
116:   PetscInt *tree_map_in;
117:   PetscInt *tree_map_out;

119:   /* current memory status */
120:   PetscInt gl_bss_min;
121:   PetscInt gl_perm_min;

123:   /* max segment size for PCTFS_gs_gop_vec() */
124:   PetscInt vec_sz;

126:   /* hack to make paul happy */
127:   MPI_Comm PCTFS_gs_comm;

129: } PCTFS_gs_id;

131: static PCTFS_gs_id   *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135: static PCTFS_gs_id   *gsi_new(void);
136: static PetscErrorCode set_tree(PCTFS_gs_id *gs);

138: /* same for all but vector flavor */
139: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140: /* vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

143: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

149: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

152: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

156: /* global vars */
157: /* from comm.c module */

159: static PetscInt num_gs_ids = 0;

161: /* should make this dynamic ... later */
162: static PetscInt  msg_buf     = MAX_MSG_BUF;
163: static PetscInt  vec_sz      = GS_VEC_SZ;
164: static PetscInt *tree_buf    = NULL;
165: static PetscInt  tree_buf_sz = 0;
166: static PetscInt  ntree       = 0;

168: /***************************************************************************/
169: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
170: {
171:   vec_sz = size;
172:   return 0;
173: }

175: /******************************************************************************/
176: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
177: {
178:   msg_buf = buf_size;
179:   return 0;
180: }

182: /******************************************************************************/
183: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
184: {
185:   PCTFS_gs_id *gs;
186:   MPI_Group    PCTFS_gs_group;
187:   MPI_Comm     PCTFS_gs_comm;

189:   /* ensure that communication package has been initialized */
190:   PCTFS_comm_init();

192:   /* determines if we have enough dynamic/semi-static memory */
193:   /* checks input, allocs and sets gd_id template            */
194:   gs = gsi_check_args(elms, nel, level);

196:   /* only bit mask version up and working for the moment    */
197:   /* LATER :: get int list version working for sparse pblms */
198:   PETSC_COMM_WORLD, gsi_via_bit_mask(gs);

200:   PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group);
201:   PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm);
202:   PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group);

204:   gs->PCTFS_gs_comm = PCTFS_gs_comm;

206:   return (gs);
207: }

209: /******************************************************************************/
210: static PCTFS_gs_id *gsi_new(void)
211: {
212:   PCTFS_gs_id *gs;
213:   gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
214:   PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id));
215:   return (gs);
216: }

218: /******************************************************************************/
219: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
220: {
221:   PetscInt     i, j, k, t2;
222:   PetscInt    *companion, *elms, *unique, *iptr;
223:   PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
224:   PetscInt     oprs[]    = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
225:   PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
226:   PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
227:   PCTFS_gs_id *gs;

229:   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
230:   if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");

232:   if (nel == 0) PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n");

234:   /* get space for gs template */
235:   gs     = gsi_new();
236:   gs->id = ++num_gs_ids;

238:   /* hmt 6.4.99                                            */
239:   /* caller can set global ids that don't participate to 0 */
240:   /* PCTFS_gs_init ignores all zeros in elm list                 */
241:   /* negative global ids are still invalid                 */
242:   for (i = j = 0; i < nel; i++) {
243:     if (in_elms[i] != 0) j++;
244:   }

246:   k   = nel;
247:   nel = j;

249:   /* copy over in_elms list and create inverse map */
250:   elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
251:   companion = (PetscInt *)malloc(nel * sizeof(PetscInt));

253:   for (i = j = 0; i < k; i++) {
254:     if (in_elms[i] != 0) {
255:       elms[j]        = in_elms[i];
256:       companion[j++] = i;
257:     }
258:   }

260:   if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");

262:   /* pre-pass ... check to see if sorted */
263:   elms[nel] = INT_MAX;
264:   iptr      = elms;
265:   unique    = elms + 1;
266:   j         = 0;
267:   while (*iptr != INT_MAX) {
268:     if (*iptr++ > *unique++) {
269:       j = 1;
270:       break;
271:     }
272:   }

274:   /* set up inverse map */
275:   if (j) {
276:     PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n");
277:     PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER);
278:   } else PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n");
279:   elms[nel] = INT_MIN;

281:   /* first pass */
282:   /* determine number of unique elements, check pd */
283:   for (i = k = 0; i < nel; i += j) {
284:     t2 = elms[i];
285:     j  = ++i;

287:     /* clump 'em for now */
288:     while (elms[j] == t2) j++;

290:     /* how many together and num local */
291:     if (j -= i) {
292:       num_local++;
293:       k += j;
294:     }
295:   }

297:   /* how many unique elements? */
298:   gs->repeats = k;
299:   gs->nel     = nel - k;

301:   /* number of repeats? */
302:   gs->num_local = num_local;
303:   num_local += 2;
304:   gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
305:   gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));

307:   unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
308:   gs->elms       = unique;
309:   gs->nel_total  = nel;
310:   gs->local_elms = elms;
311:   gs->companion  = companion;

313:   /* compess map as well as keep track of local ops */
314:   for (num_local = i = j = 0; i < gs->nel; i++) {
315:     k  = j;
316:     t2 = unique[i] = elms[j];
317:     companion[i]   = companion[j];

319:     while (elms[j] == t2) j++;

321:     if ((t2 = (j - k)) > 1) {
322:       /* number together */
323:       num_to_reduce[num_local] = t2++;

325:       iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));

327:       /* to use binary searching don't remap until we check intersection */
328:       *iptr++ = i;

330:       /* note that we're skipping the first one */
331:       while (++k < j) *(iptr++) = companion[k];
332:       *iptr = -1;
333:     }
334:   }

336:   /* sentinel for ngh_buf */
337:   unique[gs->nel] = INT_MAX;

339:   /* for two partition sort hack */
340:   num_to_reduce[num_local]   = 0;
341:   local_reduce[num_local]    = NULL;
342:   num_to_reduce[++num_local] = 0;
343:   local_reduce[num_local]    = NULL;

345:   /* load 'em up */
346:   /* note one extra to hold NON_UNIFORM flag!!! */
347:   vals[2] = vals[1] = vals[0] = nel;
348:   if (gs->nel > 0) {
349:     vals[3] = unique[0];
350:     vals[4] = unique[gs->nel - 1];
351:   } else {
352:     vals[3] = INT_MAX;
353:     vals[4] = INT_MIN;
354:   }
355:   vals[5] = level;
356:   vals[6] = num_gs_ids;

358:   /* GLOBAL: send 'em out */
359:   PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs);

361:   /* must be semi-pos def - only pairwise depends on this */
362:   /* LATER - remove this restriction */
363:   if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
364:   if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");

366:   gs->nel_min = vals[0];
367:   gs->nel_max = vals[1];
368:   gs->nel_sum = vals[2];
369:   gs->gl_min  = vals[3];
370:   gs->gl_max  = vals[4];
371:   gs->negl    = vals[4] - vals[3] + 1;

373:   if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);

375:   /* LATER :: add level == -1 -> program selects level */
376:   if (vals[5] < 0) vals[5] = 0;
377:   else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
378:   gs->level = vals[5];

380:   return (gs);
381: }

383: /******************************************************************************/
384: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
385: {
386:   PetscInt   i, nel, *elms;
387:   PetscInt   t1;
388:   PetscInt **reduce;
389:   PetscInt  *map;

391:   /* totally local removes ... PCTFS_ct_bits == 0 */
392:   get_ngh_buf(gs);

394:   if (gs->level) set_pairwise(gs);
395:   if (gs->max_left_over) set_tree(gs);

397:   /* intersection local and pairwise/tree? */
398:   gs->num_local_total      = gs->num_local;
399:   gs->gop_local_reduce     = gs->local_reduce;
400:   gs->num_gop_local_reduce = gs->num_local_reduce;

402:   map = gs->companion;

404:   /* is there any local compression */
405:   if (!gs->num_local) {
406:     gs->local_strength = NONE;
407:     gs->num_local_gop  = 0;
408:   } else {
409:     /* ok find intersection */
410:     map    = gs->companion;
411:     reduce = gs->local_reduce;
412:     for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
413:       if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
414:         t1++;
416:         gs->num_local_reduce[i] *= -1;
417:       }
418:       **reduce = map[**reduce];
419:     }

421:     /* intersection is empty */
422:     if (!t1) {
423:       gs->local_strength = FULL;
424:       gs->num_local_gop  = 0;
425:     } else { /* intersection not empty */
426:       gs->local_strength = PARTIAL;

428:       PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);

430:       gs->num_local_gop   = t1;
431:       gs->num_local_total = gs->num_local;
432:       gs->num_local -= t1;
433:       gs->gop_local_reduce     = gs->local_reduce;
434:       gs->num_gop_local_reduce = gs->num_local_reduce;

436:       for (i = 0; i < t1; i++) {
438:         gs->num_gop_local_reduce[i] *= -1;
439:         gs->local_reduce++;
440:         gs->num_local_reduce++;
441:       }
442:       gs->local_reduce++;
443:       gs->num_local_reduce++;
444:     }
445:   }

447:   elms = gs->pw_elm_list;
448:   nel  = gs->len_pw_list;
449:   for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

451:   elms = gs->tree_map_in;
452:   nel  = gs->tree_map_sz;
453:   for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

455:   /* clean up */
456:   free((void *)gs->local_elms);
457:   free((void *)gs->companion);
458:   free((void *)gs->elms);
459:   free((void *)gs->ngh_buf);
460:   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
461:   return 0;
462: }

464: /******************************************************************************/
465: static PetscErrorCode place_in_tree(PetscInt elm)
466: {
467:   PetscInt *tp, n;

469:   if (ntree == tree_buf_sz) {
470:     if (tree_buf_sz) {
471:       tp = tree_buf;
472:       n  = tree_buf_sz;
473:       tree_buf_sz <<= 1;
474:       tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
475:       PCTFS_ivec_copy(tree_buf, tp, n);
476:       free(tp);
477:     } else {
478:       tree_buf_sz = TREE_BUF_SZ;
479:       tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
480:     }
481:   }

483:   tree_buf[ntree++] = elm;
484:   return 0;
485: }

487: /******************************************************************************/
488: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
489: {
490:   PetscInt  i, j, npw = 0, ntree_map = 0;
491:   PetscInt  p_mask_size, ngh_buf_size, buf_size;
492:   PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
493:   PetscInt *ngh_buf, *buf1, *buf2;
494:   PetscInt  offset, per_load, num_loads, or_ct, start, end;
495:   PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
496:   PetscInt  oper = GL_B_OR;
497:   PetscInt *ptr3, *t_mask, level, ct1, ct2;

499:   /* to make life easier */
500:   nel   = gs->nel;
501:   elms  = gs->elms;
502:   level = gs->level;

504:   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
505:   p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
506:   PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id);

508:   /* allocate space for masks and info bufs */
509:   gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
510:   gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
511:   gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
512:   t_mask                        = (PetscInt *)malloc(p_mask_size);
513:   gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);

515:   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
516:   /* had thought I could exploit rendezvous threshold */

518:   /* default is one pass */
519:   per_load = negl = gs->negl;
520:   gs->num_loads = num_loads = 1;
521:   i                         = p_mask_size * negl;

523:   /* possible overflow on buffer size */
524:   /* overflow hack                    */
525:   if (i < 0) i = INT_MAX;

527:   buf_size = PetscMin(msg_buf, i);

529:   /* can we do it? */

532:   /* get PCTFS_giop buf space ... make *only* one malloc */
533:   buf1 = (PetscInt *)malloc(buf_size << 1);

535:   /* more than one gior exchange needed? */
536:   if (buf_size != i) {
537:     per_load      = buf_size / p_mask_size;
538:     buf_size      = per_load * p_mask_size;
539:     gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
540:   }

542:   /* convert buf sizes from #bytes to #ints - 32 bit only! */
543:   p_mask_size /= sizeof(PetscInt);
544:   ngh_buf_size /= sizeof(PetscInt);
545:   buf_size /= sizeof(PetscInt);

547:   /* find PCTFS_giop work space */
548:   buf2 = buf1 + buf_size;

550:   /* hold #ints needed for processor masks */
551:   gs->mask_sz = p_mask_size;

553:   /* init buffers */
554:   PCTFS_ivec_zero(sh_proc_mask, p_mask_size);
555:   PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size);
556:   PCTFS_ivec_zero(ngh_buf, ngh_buf_size);

558:   /* HACK reset tree info */
559:   tree_buf    = NULL;
560:   tree_buf_sz = ntree = 0;

562:   /* ok do it */
563:   for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
564:     /* identity for bitwise or is 000...000 */
565:     PCTFS_ivec_zero(buf1, buf_size);

567:     /* load msg buffer */
568:     for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
569:       offset = (offset - start) * p_mask_size;
570:       PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
571:     }

573:     /* GLOBAL: pass buffer */
574:     PCTFS_giop(buf1, buf2, buf_size, &oper);

576:     /* unload buffer into ngh_buf */
577:     ptr2 = (elms + i_start);
578:     for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
579:       /* I own it ... may have to pairwise it */
580:       if (j == *ptr2) {
581:         /* do i share it w/anyone? */
582:         ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
583:         /* guess not */
584:         if (ct1 < 2) {
585:           ptr2++;
586:           ptr1 += p_mask_size;
587:           continue;
588:         }

590:         /* i do ... so keep info and turn off my bit */
591:         PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
592:         PCTFS_ivec_xor(ptr1, p_mask, p_mask_size);
593:         PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size);

595:         /* is it to be done pairwise? */
596:         if (--ct1 <= level) {
597:           npw++;

599:           /* turn on high bit to indicate pw need to process */
600:           *ptr2++ |= TOP_BIT;
601:           PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size);
602:           ptr1 += p_mask_size;
603:           continue;
604:         }

606:         /* get set for next and note that I have a tree contribution */
607:         /* could save exact elm index for tree here -> save a search */
608:         ptr2++;
609:         ptr1 += p_mask_size;
610:         ntree_map++;
611:       } else { /* i don't but still might be involved in tree */

613:         /* shared by how many? */
614:         ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));

616:         /* none! */
617:         if (ct1 < 2) continue;

619:         /* is it going to be done pairwise? but not by me of course!*/
620:         if (--ct1 <= level) continue;
621:       }
622:       /* LATER we're going to have to process it NOW */
623:       /* nope ... tree it */
624:       place_in_tree(j);
625:     }
626:   }

628:   free((void *)t_mask);
629:   free((void *)buf1);

631:   gs->len_pw_list = npw;
632:   gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

634:   /* expand from bit mask list to int list and save ngh list */
635:   gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
636:   PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs);

638:   gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));

640:   oper = GL_MAX;
641:   ct1  = gs->num_nghs;
642:   PCTFS_giop(&ct1, &ct2, 1, &oper);
643:   gs->max_nghs = ct1;

645:   gs->tree_map_sz   = ntree_map;
646:   gs->max_left_over = ntree;

648:   free((void *)p_mask);
649:   free((void *)sh_proc_mask);
650:   return 0;
651: }

653: /******************************************************************************/
654: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
655: {
656:   PetscInt  i, j;
657:   PetscInt  p_mask_size;
658:   PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
659:   PetscInt *ngh_buf, *buf2;
660:   PetscInt  offset;
661:   PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
662:   PetscInt *pairwise_elm_list, len_pair_list = 0;
663:   PetscInt *iptr, t1, i_start, nel, *elms;
664:   PetscInt  ct;

666:   /* to make life easier */
667:   nel          = gs->nel;
668:   elms         = gs->elms;
669:   ngh_buf      = gs->ngh_buf;
670:   sh_proc_mask = gs->pw_nghs;

672:   /* need a few temp masks */
673:   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
674:   p_mask        = (PetscInt *)malloc(p_mask_size);
675:   tmp_proc_mask = (PetscInt *)malloc(p_mask_size);

677:   /* set mask to my PCTFS_my_id's bit mask */
678:   PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id);

680:   p_mask_size /= sizeof(PetscInt);

682:   len_pair_list   = gs->len_pw_list;
683:   gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));

685:   /* how many processors (nghs) do we have to exchange with? */
686:   nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

688:   /* allocate space for PCTFS_gs_gop() info */
689:   gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
690:   gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
691:   gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));

693:   /* init msg_size list */
694:   PCTFS_ivec_zero(msg_size, nprs);

696:   /* expand from bit mask list to int list */
697:   PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list);

699:   /* keep list of elements being handled pairwise */
700:   for (i = j = 0; i < nel; i++) {
701:     if (elms[i] & TOP_BIT) {
702:       elms[i] ^= TOP_BIT;
703:       pairwise_elm_list[j++] = i;
704:     }
705:   }
706:   pairwise_elm_list[j] = -1;

708:   gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
709:   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
710:   gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
711:   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
712:   gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);

714:   /* find who goes to each processor */
715:   for (i_start = i = 0; i < nprs; i++) {
716:     /* processor i's mask */
717:     PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]);

719:     /* det # going to processor i */
720:     for (ct = j = 0; j < len_pair_list; j++) {
721:       buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
722:       PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size);
723:       if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
724:     }
725:     msg_size[i] = ct;
726:     i_start     = PetscMax(i_start, ct);

728:     /*space to hold nodes in message to first neighbor */
729:     msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));

731:     for (j = 0; j < len_pair_list; j++) {
732:       buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
733:       PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size);
734:       if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
735:     }
736:     *iptr = -1;
737:   }
738:   msg_nodes[nprs] = NULL;

740:   j = gs->loc_node_pairs = i_start;
741:   t1                     = GL_MAX;
742:   PCTFS_giop(&i_start, &offset, 1, &t1);
743:   gs->max_node_pairs = i_start;

745:   i_start = j;
746:   t1      = GL_MIN;
747:   PCTFS_giop(&i_start, &offset, 1, &t1);
748:   gs->min_node_pairs = i_start;

750:   i_start = j;
751:   t1      = GL_ADD;
752:   PCTFS_giop(&i_start, &offset, 1, &t1);
753:   gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;

755:   i_start = nprs;
756:   t1      = GL_MAX;
757:   PCTFS_giop(&i_start, &offset, 1, &t1);
758:   gs->max_pairs = i_start;

760:   /* remap pairwise in tail of gsi_via_bit_mask() */
761:   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
762:   gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
763:   gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);

765:   /* reset malloc pool */
766:   free((void *)p_mask);
767:   free((void *)tmp_proc_mask);
768:   return 0;
769: }

771: /* to do pruned tree just save ngh buf copy for each one and decode here!
772: ******************************************************************************/
773: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
774: {
775:   PetscInt  i, j, n, nel;
776:   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

778:   /* local work ptrs */
779:   elms = gs->elms;
780:   nel  = gs->nel;

782:   /* how many via tree */
783:   gs->tree_nel = n = ntree;
784:   gs->tree_elms = tree_elms = iptr_in = tree_buf;
785:   gs->tree_buf                        = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
786:   gs->tree_work                       = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
787:   j                                   = gs->tree_map_sz;
788:   gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
789:   gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));

791:   /* search the longer of the two lists */
792:   /* note ... could save this info in get_ngh_buf and save searches */
793:   if (n <= nel) {
794:     /* bijective fct w/remap - search elm list */
795:     for (i = 0; i < n; i++) {
796:       if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
797:         *iptr_in++  = j;
798:         *iptr_out++ = i;
799:       }
800:     }
801:   } else {
802:     for (i = 0; i < nel; i++) {
803:       if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
804:         *iptr_in++  = i;
805:         *iptr_out++ = j;
806:       }
807:     }
808:   }

810:   /* sentinel */
811:   *iptr_in = *iptr_out = -1;
812:   return 0;
813: }

815: /******************************************************************************/
816: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
817: {
818:   PetscInt   *num, *map, **reduce;
819:   PetscScalar tmp;

821:   num    = gs->num_gop_local_reduce;
822:   reduce = gs->gop_local_reduce;
823:   while ((map = *reduce++)) {
824:     /* wall */
825:     if (*num == 2) {
826:       num++;
827:       vals[map[1]] = vals[map[0]];
828:     } else if (*num == 3) { /* corner shared by three elements */
829:       num++;
830:       vals[map[2]] = vals[map[1]] = vals[map[0]];
831:     } else if (*num == 4) { /* corner shared by four elements */
832:       num++;
833:       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
834:     } else { /* general case ... odd geoms ... 3D*/
835:       num++;
836:       tmp = *(vals + *map++);
837:       while (*map >= 0) *(vals + *map++) = tmp;
838:     }
839:   }
840:   return 0;
841: }

843: /******************************************************************************/
844: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
845: {
846:   PetscInt   *num, *map, **reduce;
847:   PetscScalar tmp;

849:   num    = gs->num_local_reduce;
850:   reduce = gs->local_reduce;
851:   while ((map = *reduce)) {
852:     /* wall */
853:     if (*num == 2) {
854:       num++;
855:       reduce++;
856:       vals[map[1]] = vals[map[0]] += vals[map[1]];
857:     } else if (*num == 3) { /* corner shared by three elements */
858:       num++;
859:       reduce++;
860:       vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
861:     } else if (*num == 4) { /* corner shared by four elements */
862:       num++;
863:       reduce++;
864:       vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
865:     } else { /* general case ... odd geoms ... 3D*/
866:       num++;
867:       tmp = 0.0;
868:       while (*map >= 0) tmp += *(vals + *map++);

870:       map = *reduce++;
871:       while (*map >= 0) *(vals + *map++) = tmp;
872:     }
873:   }
874:   return 0;
875: }

877: /******************************************************************************/
878: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
879: {
880:   PetscInt    *num, *map, **reduce;
881:   PetscScalar *base;

883:   num    = gs->num_gop_local_reduce;
884:   reduce = gs->gop_local_reduce;
885:   while ((map = *reduce++)) {
886:     /* wall */
887:     if (*num == 2) {
888:       num++;
889:       vals[map[0]] += vals[map[1]];
890:     } else if (*num == 3) { /* corner shared by three elements */
891:       num++;
892:       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
893:     } else if (*num == 4) { /* corner shared by four elements */
894:       num++;
895:       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
896:     } else { /* general case ... odd geoms ... 3D*/
897:       num++;
898:       base = vals + *map++;
899:       while (*map >= 0) *base += *(vals + *map++);
900:     }
901:   }
902:   return 0;
903: }

905: /******************************************************************************/
906: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
907: {
908:   PetscInt i;

910:   MPI_Comm_free(&gs->PCTFS_gs_comm);
911:   if (gs->nghs) free((void *)gs->nghs);
912:   if (gs->pw_nghs) free((void *)gs->pw_nghs);

914:   /* tree */
915:   if (gs->max_left_over) {
916:     if (gs->tree_elms) free((void *)gs->tree_elms);
917:     if (gs->tree_buf) free((void *)gs->tree_buf);
918:     if (gs->tree_work) free((void *)gs->tree_work);
919:     if (gs->tree_map_in) free((void *)gs->tree_map_in);
920:     if (gs->tree_map_out) free((void *)gs->tree_map_out);
921:   }

923:   /* pairwise info */
924:   if (gs->num_pairs) {
925:     /* should be NULL already */
926:     if (gs->ngh_buf) free((void *)gs->ngh_buf);
927:     if (gs->elms) free((void *)gs->elms);
928:     if (gs->local_elms) free((void *)gs->local_elms);
929:     if (gs->companion) free((void *)gs->companion);

931:     /* only set if pairwise */
932:     if (gs->vals) free((void *)gs->vals);
933:     if (gs->in) free((void *)gs->in);
934:     if (gs->out) free((void *)gs->out);
935:     if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
936:     if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
937:     if (gs->pw_vals) free((void *)gs->pw_vals);
938:     if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
939:     if (gs->node_list) {
940:       for (i = 0; i < gs->num_pairs; i++) {
941:         if (gs->node_list[i]) free((void *)gs->node_list[i]);
942:       }
943:       free((void *)gs->node_list);
944:     }
945:     if (gs->msg_sizes) free((void *)gs->msg_sizes);
946:     if (gs->pair_list) free((void *)gs->pair_list);
947:   }

949:   /* local info */
950:   if (gs->num_local_total >= 0) {
951:     for (i = 0; i < gs->num_local_total + 1; i++) {
952:       if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
953:     }
954:   }

956:   /* if intersection tree/pairwise and local isn't empty */
957:   if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
958:   if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);

960:   free((void *)gs);
961:   return 0;
962: }

964: /******************************************************************************/
965: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
966: {
967:   switch (*op) {
968:   case '+':
969:     PCTFS_gs_gop_vec_plus(gs, vals, step);
970:     break;
971:   default:
972:     PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]);
973:     PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n");
974:     PCTFS_gs_gop_vec_plus(gs, vals, step);
975:     break;
976:   }
977:   return 0;
978: }

980: /******************************************************************************/
981: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
982: {

985:   /* local only operations!!! */
986:   if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs, vals, step);

988:   /* if intersection tree/pairwise and local isn't empty */
989:   if (gs->num_local_gop) {
990:     PCTFS_gs_gop_vec_local_in_plus(gs, vals, step);

992:     /* pairwise */
993:     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step);

995:     /* tree */
996:     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, vals, step);

998:     PCTFS_gs_gop_vec_local_out(gs, vals, step);
999:   } else { /* if intersection tree/pairwise and local is empty */
1000:     /* pairwise */
1001:     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step);

1003:     /* tree */
1004:     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, vals, step);
1005:   }
1006:   return 0;
1007: }

1009: /******************************************************************************/
1010: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1011: {
1012:   PetscInt    *num, *map, **reduce;
1013:   PetscScalar *base;

1015:   num    = gs->num_local_reduce;
1016:   reduce = gs->local_reduce;
1017:   while ((map = *reduce)) {
1018:     base = vals + map[0] * step;

1020:     /* wall */
1021:     if (*num == 2) {
1022:       num++;
1023:       reduce++;
1024:       PCTFS_rvec_add(base, vals + map[1] * step, step);
1025:       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1026:     } else if (*num == 3) { /* corner shared by three elements */
1027:       num++;
1028:       reduce++;
1029:       PCTFS_rvec_add(base, vals + map[1] * step, step);
1030:       PCTFS_rvec_add(base, vals + map[2] * step, step);
1031:       PCTFS_rvec_copy(vals + map[2] * step, base, step);
1032:       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1033:     } else if (*num == 4) { /* corner shared by four elements */
1034:       num++;
1035:       reduce++;
1036:       PCTFS_rvec_add(base, vals + map[1] * step, step);
1037:       PCTFS_rvec_add(base, vals + map[2] * step, step);
1038:       PCTFS_rvec_add(base, vals + map[3] * step, step);
1039:       PCTFS_rvec_copy(vals + map[3] * step, base, step);
1040:       PCTFS_rvec_copy(vals + map[2] * step, base, step);
1041:       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1042:     } else { /* general case ... odd geoms ... 3D */
1043:       num++;
1044:       while (*++map >= 0) PCTFS_rvec_add(base, vals + *map * step, step);

1046:       map = *reduce;
1047:       while (*++map >= 0) PCTFS_rvec_copy(vals + *map * step, base, step);

1049:       reduce++;
1050:     }
1051:   }
1052:   return 0;
1053: }

1055: /******************************************************************************/
1056: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1057: {
1058:   PetscInt    *num, *map, **reduce;
1059:   PetscScalar *base;

1061:   num    = gs->num_gop_local_reduce;
1062:   reduce = gs->gop_local_reduce;
1063:   while ((map = *reduce++)) {
1064:     base = vals + map[0] * step;

1066:     /* wall */
1067:     if (*num == 2) {
1068:       num++;
1069:       PCTFS_rvec_add(base, vals + map[1] * step, step);
1070:     } else if (*num == 3) { /* corner shared by three elements */
1071:       num++;
1072:       PCTFS_rvec_add(base, vals + map[1] * step, step);
1073:       PCTFS_rvec_add(base, vals + map[2] * step, step);
1074:     } else if (*num == 4) { /* corner shared by four elements */
1075:       num++;
1076:       PCTFS_rvec_add(base, vals + map[1] * step, step);
1077:       PCTFS_rvec_add(base, vals + map[2] * step, step);
1078:       PCTFS_rvec_add(base, vals + map[3] * step, step);
1079:     } else { /* general case ... odd geoms ... 3D*/
1080:       num++;
1081:       while (*++map >= 0) PCTFS_rvec_add(base, vals + *map * step, step);
1082:     }
1083:   }
1084:   return 0;
1085: }

1087: /******************************************************************************/
1088: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1089: {
1090:   PetscInt    *num, *map, **reduce;
1091:   PetscScalar *base;

1093:   num    = gs->num_gop_local_reduce;
1094:   reduce = gs->gop_local_reduce;
1095:   while ((map = *reduce++)) {
1096:     base = vals + map[0] * step;

1098:     /* wall */
1099:     if (*num == 2) {
1100:       num++;
1101:       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1102:     } else if (*num == 3) { /* corner shared by three elements */
1103:       num++;
1104:       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1105:       PCTFS_rvec_copy(vals + map[2] * step, base, step);
1106:     } else if (*num == 4) { /* corner shared by four elements */
1107:       num++;
1108:       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1109:       PCTFS_rvec_copy(vals + map[2] * step, base, step);
1110:       PCTFS_rvec_copy(vals + map[3] * step, base, step);
1111:     } else { /* general case ... odd geoms ... 3D*/
1112:       num++;
1113:       while (*++map >= 0) PCTFS_rvec_copy(vals + *map * step, base, step);
1114:     }
1115:   }
1116:   return 0;
1117: }

1119: /******************************************************************************/
1120: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1121: {
1122:   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1123:   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1124:   PetscInt    *pw, *list, *size, **nodes;
1125:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1126:   MPI_Status   status;
1127:   PetscBLASInt i1 = 1, dstep;

1129:   /* strip and load s */
1130:   msg_list = list = gs->pair_list;
1131:   msg_size = size = gs->msg_sizes;
1132:   msg_nodes = nodes = gs->node_list;
1133:   iptr = pw = gs->pw_elm_list;
1134:   dptr1 = dptr3 = gs->pw_vals;
1135:   msg_ids_in = ids_in = gs->msg_ids_in;
1136:   msg_ids_out = ids_out = gs->msg_ids_out;
1137:   dptr2                 = gs->out;
1138:   in1 = in2 = gs->in;

1140:   /* post the receives */
1141:   /*  msg_nodes=nodes; */
1142:   do {
1143:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1144:         second one *list and do list++ afterwards */
1145:     MPI_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1146:     list++;
1147:     msg_ids_in++;
1148:     in1 += *size++ * step;
1149:   } while (*++msg_nodes);
1150:   msg_nodes = nodes;

1152:   /* load gs values into in out gs buffers */
1153:   while (*iptr >= 0) {
1154:     PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step);
1155:     dptr3 += step;
1156:     iptr++;
1157:   }

1159:   /* load out buffers and post the sends */
1160:   while ((iptr = *msg_nodes++)) {
1161:     dptr3 = dptr2;
1162:     while (*iptr >= 0) {
1163:       PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step);
1164:       dptr2 += step;
1165:       iptr++;
1166:     }
1167:     MPI_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1168:     msg_size++;
1169:     msg_list++;
1170:     msg_ids_out++;
1171:   }

1173:   /* tree */
1174:   if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step);

1176:   /* process the received data */
1177:   msg_nodes = nodes;
1178:   while ((iptr = *nodes++)) {
1179:     PetscScalar d1 = 1.0;

1181:     /* Should I check the return value of MPI_Wait() or status? */
1182:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1183:     MPI_Wait(ids_in, &status);
1184:     ids_in++;
1185:     while (*iptr >= 0) {
1186:       PetscBLASIntCast(step, &dstep);
1187:       PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1188:       in2 += step;
1189:       iptr++;
1190:     }
1191:   }

1193:   /* replace vals */
1194:   while (*pw >= 0) {
1195:     PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step);
1196:     dptr1 += step;
1197:     pw++;
1198:   }

1200:   /* clear isend message handles */
1201:   /* This changed for clarity though it could be the same */

1203:   /* Should I check the return value of MPI_Wait() or status? */
1204:   /* Can this loop be replaced by a call to MPI_Waitall()? */
1205:   while (*msg_nodes++) {
1206:     MPI_Wait(ids_out, &status);
1207:     ids_out++;
1208:   }
1209:   return 0;
1210: }

1212: /******************************************************************************/
1213: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1214: {
1215:   PetscInt     size, *in, *out;
1216:   PetscScalar *buf, *work;
1217:   PetscInt     op[] = {GL_ADD, 0};
1218:   PetscBLASInt i1   = 1;
1219:   PetscBLASInt dstep;

1221:   /* copy over to local variables */
1222:   in   = gs->tree_map_in;
1223:   out  = gs->tree_map_out;
1224:   buf  = gs->tree_buf;
1225:   work = gs->tree_work;
1226:   size = gs->tree_nel * step;

1228:   /* zero out collection buffer */
1229:   PCTFS_rvec_zero(buf, size);

1231:   /* copy over my contributions */
1232:   while (*in >= 0) {
1233:     PetscBLASIntCast(step, &dstep);
1234:     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1235:   }

1237:   /* perform fan in/out on full buffer */
1238:   /* must change PCTFS_grop to handle the blas */
1239:   PCTFS_grop(buf, work, size, op);

1241:   /* reset */
1242:   in  = gs->tree_map_in;
1243:   out = gs->tree_map_out;

1245:   /* get the portion of the results I need */
1246:   while (*in >= 0) {
1247:     PetscBLASIntCast(step, &dstep);
1248:     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1249:   }
1250:   return 0;
1251: }

1253: /******************************************************************************/
1254: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1255: {
1256:   switch (*op) {
1257:   case '+':
1258:     PCTFS_gs_gop_plus_hc(gs, vals, dim);
1259:     break;
1260:   default:
1261:     PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]);
1262:     PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n");
1263:     PCTFS_gs_gop_plus_hc(gs, vals, dim);
1264:     break;
1265:   }
1266:   return 0;
1267: }

1269: /******************************************************************************/
1270: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1271: {
1272:   /* if there's nothing to do return */
1273:   if (dim <= 0) return 0;

1275:   /* can't do more dimensions then exist */
1276:   dim = PetscMin(dim, PCTFS_i_log2_num_nodes);

1278:   /* local only operations!!! */
1279:   if (gs->num_local) PCTFS_gs_gop_local_plus(gs, vals);

1281:   /* if intersection tree/pairwise and local isn't empty */
1282:   if (gs->num_local_gop) {
1283:     PCTFS_gs_gop_local_in_plus(gs, vals);

1285:     /* pairwise will do tree inside ... */
1286:     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim); /* tree only */
1287:     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, vals, dim);

1289:     PCTFS_gs_gop_local_out(gs, vals);
1290:   } else { /* if intersection tree/pairwise and local is empty */
1291:     /* pairwise will do tree inside */
1292:     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim); /* tree */
1293:     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, vals, dim);
1294:   }
1295:   return 0;
1296: }

1298: /******************************************************************************/
1299: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1300: {
1301:   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1302:   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1303:   PetscInt    *pw, *list, *size, **nodes;
1304:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1305:   MPI_Status   status;
1306:   PetscInt     i, mask = 1;

1308:   for (i = 1; i < dim; i++) {
1309:     mask <<= 1;
1310:     mask++;
1311:   }

1313:   /* strip and load s */
1314:   msg_list = list = gs->pair_list;
1315:   msg_size = size = gs->msg_sizes;
1316:   msg_nodes = nodes = gs->node_list;
1317:   iptr = pw = gs->pw_elm_list;
1318:   dptr1 = dptr3 = gs->pw_vals;
1319:   msg_ids_in = ids_in = gs->msg_ids_in;
1320:   msg_ids_out = ids_out = gs->msg_ids_out;
1321:   dptr2                 = gs->out;
1322:   in1 = in2 = gs->in;

1324:   /* post the receives */
1325:   /*  msg_nodes=nodes; */
1326:   do {
1327:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1328:         second one *list and do list++ afterwards */
1329:     if ((PCTFS_my_id | mask) == (*list | mask)) {
1330:       MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1331:       list++;
1332:       msg_ids_in++;
1333:       in1 += *size++;
1334:     } else {
1335:       list++;
1336:       size++;
1337:     }
1338:   } while (*++msg_nodes);

1340:   /* load gs values into in out gs buffers */
1341:   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);

1343:   /* load out buffers and post the sends */
1344:   msg_nodes = nodes;
1345:   list      = msg_list;
1346:   while ((iptr = *msg_nodes++)) {
1347:     if ((PCTFS_my_id | mask) == (*list | mask)) {
1348:       dptr3 = dptr2;
1349:       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1350:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1351:       /* is msg_ids_out++ correct? */
1352:       MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1353:       msg_size++;
1354:       list++;
1355:       msg_ids_out++;
1356:     } else {
1357:       list++;
1358:       msg_size++;
1359:     }
1360:   }

1362:   /* do the tree while we're waiting */
1363:   if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim);

1365:   /* process the received data */
1366:   msg_nodes = nodes;
1367:   list      = msg_list;
1368:   while ((iptr = *nodes++)) {
1369:     if ((PCTFS_my_id | mask) == (*list | mask)) {
1370:       /* Should I check the return value of MPI_Wait() or status? */
1371:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1372:       MPI_Wait(ids_in, &status);
1373:       ids_in++;
1374:       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1375:     }
1376:     list++;
1377:   }

1379:   /* replace vals */
1380:   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;

1382:   /* clear isend message handles */
1383:   /* This changed for clarity though it could be the same */
1384:   while (*msg_nodes++) {
1385:     if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
1386:       /* Should I check the return value of MPI_Wait() or status? */
1387:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1388:       MPI_Wait(ids_out, &status);
1389:       ids_out++;
1390:     }
1391:     msg_list++;
1392:   }
1393:   return 0;
1394: }

1396: /******************************************************************************/
1397: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1398: {
1399:   PetscInt     size;
1400:   PetscInt    *in, *out;
1401:   PetscScalar *buf, *work;
1402:   PetscInt     op[] = {GL_ADD, 0};

1404:   in   = gs->tree_map_in;
1405:   out  = gs->tree_map_out;
1406:   buf  = gs->tree_buf;
1407:   work = gs->tree_work;
1408:   size = gs->tree_nel;

1410:   PCTFS_rvec_zero(buf, size);

1412:   while (*in >= 0) *(buf + *out++) = *(vals + *in++);

1414:   in  = gs->tree_map_in;
1415:   out = gs->tree_map_out;

1417:   PCTFS_grop_hc(buf, work, size, op, dim);

1419:   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1420:   return 0;
1421: }