Actual source code: gs.c
2: /***********************************gs.c***************************************
4: Author: Henry M. Tufo III
6: e-mail: hmt@cs.brown.edu
8: snail-mail:
9: Division of Applied Mathematics
10: Brown University
11: Providence, RI 02912
13: Last Modification:
14: 6.21.97
15: ************************************gs.c**************************************/
17: /***********************************gs.c***************************************
18: File Description:
19: -----------------
21: ************************************gs.c**************************************/
23: #include <../src/ksp/pc/impls/tfs/tfs.h>
25: /* default length of number of items via tree - doubles if exceeded */
26: #define TREE_BUF_SZ 2048;
27: #define GS_VEC_SZ 1
29: /***********************************gs.c***************************************
30: Type: struct gather_scatter_id
31: ------------------------------
33: ************************************gs.c**************************************/
34: typedef struct gather_scatter_id {
35: PetscInt id;
36: PetscInt nel_min;
37: PetscInt nel_max;
38: PetscInt nel_sum;
39: PetscInt negl;
40: PetscInt gl_max;
41: PetscInt gl_min;
42: PetscInt repeats;
43: PetscInt ordered;
44: PetscInt positive;
45: PetscScalar *vals;
47: /* bit mask info */
48: PetscInt *my_proc_mask;
49: PetscInt mask_sz;
50: PetscInt *ngh_buf;
51: PetscInt ngh_buf_sz;
52: PetscInt *nghs;
53: PetscInt num_nghs;
54: PetscInt max_nghs;
55: PetscInt *pw_nghs;
56: PetscInt num_pw_nghs;
57: PetscInt *tree_nghs;
58: PetscInt num_tree_nghs;
60: PetscInt num_loads;
62: /* repeats == true -> local info */
63: PetscInt nel; /* number of unique elememts */
64: PetscInt *elms; /* of size nel */
65: PetscInt nel_total;
66: PetscInt *local_elms; /* of size nel_total */
67: PetscInt *companion; /* of size nel_total */
69: /* local info */
70: PetscInt num_local_total;
71: PetscInt local_strength;
72: PetscInt num_local;
73: PetscInt *num_local_reduce;
74: PetscInt **local_reduce;
75: PetscInt num_local_gop;
76: PetscInt *num_gop_local_reduce;
77: PetscInt **gop_local_reduce;
79: /* pairwise info */
80: PetscInt level;
81: PetscInt num_pairs;
82: PetscInt max_pairs;
83: PetscInt loc_node_pairs;
84: PetscInt max_node_pairs;
85: PetscInt min_node_pairs;
86: PetscInt avg_node_pairs;
87: PetscInt *pair_list;
88: PetscInt *msg_sizes;
89: PetscInt **node_list;
90: PetscInt len_pw_list;
91: PetscInt *pw_elm_list;
92: PetscScalar *pw_vals;
94: MPI_Request *msg_ids_in;
95: MPI_Request *msg_ids_out;
97: PetscScalar *out;
98: PetscScalar *in;
99: PetscInt msg_total;
101: /* tree - crystal accumulator info */
102: PetscInt max_left_over;
103: PetscInt *pre;
104: PetscInt *in_num;
105: PetscInt *out_num;
106: PetscInt **in_list;
107: PetscInt **out_list;
109: /* new tree work*/
110: PetscInt tree_nel;
111: PetscInt *tree_elms;
112: PetscScalar *tree_buf;
113: PetscScalar *tree_work;
115: PetscInt tree_map_sz;
116: PetscInt *tree_map_in;
117: PetscInt *tree_map_out;
119: /* current memory status */
120: PetscInt gl_bss_min;
121: PetscInt gl_perm_min;
123: /* max segment size for PCTFS_gs_gop_vec() */
124: PetscInt vec_sz;
126: /* hack to make paul happy */
127: MPI_Comm PCTFS_gs_comm;
129: } PCTFS_gs_id;
131: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135: static PCTFS_gs_id *gsi_new(void);
136: static PetscErrorCode set_tree(PCTFS_gs_id *gs);
138: /* same for all but vector flavor */
139: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140: /* vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
143: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
152: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
156: /* global vars */
157: /* from comm.c module */
159: static PetscInt num_gs_ids = 0;
161: /* should make this dynamic ... later */
162: static PetscInt msg_buf = MAX_MSG_BUF;
163: static PetscInt vec_sz = GS_VEC_SZ;
164: static PetscInt *tree_buf = NULL;
165: static PetscInt tree_buf_sz = 0;
166: static PetscInt ntree = 0;
168: /***************************************************************************/
169: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
170: {
171: vec_sz = size;
172: return 0;
173: }
175: /******************************************************************************/
176: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
177: {
178: msg_buf = buf_size;
179: return 0;
180: }
182: /******************************************************************************/
183: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
184: {
185: PCTFS_gs_id *gs;
186: MPI_Group PCTFS_gs_group;
187: MPI_Comm PCTFS_gs_comm;
189: /* ensure that communication package has been initialized */
190: PCTFS_comm_init();
192: /* determines if we have enough dynamic/semi-static memory */
193: /* checks input, allocs and sets gd_id template */
194: gs = gsi_check_args(elms, nel, level);
196: /* only bit mask version up and working for the moment */
197: /* LATER :: get int list version working for sparse pblms */
198: PETSC_COMM_WORLD, gsi_via_bit_mask(gs);
200: PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group);
201: PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm);
202: PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group);
204: gs->PCTFS_gs_comm = PCTFS_gs_comm;
206: return (gs);
207: }
209: /******************************************************************************/
210: static PCTFS_gs_id *gsi_new(void)
211: {
212: PCTFS_gs_id *gs;
213: gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
214: PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id));
215: return (gs);
216: }
218: /******************************************************************************/
219: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
220: {
221: PetscInt i, j, k, t2;
222: PetscInt *companion, *elms, *unique, *iptr;
223: PetscInt num_local = 0, *num_to_reduce, **local_reduce;
224: PetscInt oprs[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
225: PetscInt vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
226: PetscInt work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
227: PCTFS_gs_id *gs;
229: if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
230: if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");
232: if (nel == 0) PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n");
234: /* get space for gs template */
235: gs = gsi_new();
236: gs->id = ++num_gs_ids;
238: /* hmt 6.4.99 */
239: /* caller can set global ids that don't participate to 0 */
240: /* PCTFS_gs_init ignores all zeros in elm list */
241: /* negative global ids are still invalid */
242: for (i = j = 0; i < nel; i++) {
243: if (in_elms[i] != 0) j++;
244: }
246: k = nel;
247: nel = j;
249: /* copy over in_elms list and create inverse map */
250: elms = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
251: companion = (PetscInt *)malloc(nel * sizeof(PetscInt));
253: for (i = j = 0; i < k; i++) {
254: if (in_elms[i] != 0) {
255: elms[j] = in_elms[i];
256: companion[j++] = i;
257: }
258: }
260: if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");
262: /* pre-pass ... check to see if sorted */
263: elms[nel] = INT_MAX;
264: iptr = elms;
265: unique = elms + 1;
266: j = 0;
267: while (*iptr != INT_MAX) {
268: if (*iptr++ > *unique++) {
269: j = 1;
270: break;
271: }
272: }
274: /* set up inverse map */
275: if (j) {
276: PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n");
277: PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER);
278: } else PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n");
279: elms[nel] = INT_MIN;
281: /* first pass */
282: /* determine number of unique elements, check pd */
283: for (i = k = 0; i < nel; i += j) {
284: t2 = elms[i];
285: j = ++i;
287: /* clump 'em for now */
288: while (elms[j] == t2) j++;
290: /* how many together and num local */
291: if (j -= i) {
292: num_local++;
293: k += j;
294: }
295: }
297: /* how many unique elements? */
298: gs->repeats = k;
299: gs->nel = nel - k;
301: /* number of repeats? */
302: gs->num_local = num_local;
303: num_local += 2;
304: gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
305: gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));
307: unique = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
308: gs->elms = unique;
309: gs->nel_total = nel;
310: gs->local_elms = elms;
311: gs->companion = companion;
313: /* compess map as well as keep track of local ops */
314: for (num_local = i = j = 0; i < gs->nel; i++) {
315: k = j;
316: t2 = unique[i] = elms[j];
317: companion[i] = companion[j];
319: while (elms[j] == t2) j++;
321: if ((t2 = (j - k)) > 1) {
322: /* number together */
323: num_to_reduce[num_local] = t2++;
325: iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));
327: /* to use binary searching don't remap until we check intersection */
328: *iptr++ = i;
330: /* note that we're skipping the first one */
331: while (++k < j) *(iptr++) = companion[k];
332: *iptr = -1;
333: }
334: }
336: /* sentinel for ngh_buf */
337: unique[gs->nel] = INT_MAX;
339: /* for two partition sort hack */
340: num_to_reduce[num_local] = 0;
341: local_reduce[num_local] = NULL;
342: num_to_reduce[++num_local] = 0;
343: local_reduce[num_local] = NULL;
345: /* load 'em up */
346: /* note one extra to hold NON_UNIFORM flag!!! */
347: vals[2] = vals[1] = vals[0] = nel;
348: if (gs->nel > 0) {
349: vals[3] = unique[0];
350: vals[4] = unique[gs->nel - 1];
351: } else {
352: vals[3] = INT_MAX;
353: vals[4] = INT_MIN;
354: }
355: vals[5] = level;
356: vals[6] = num_gs_ids;
358: /* GLOBAL: send 'em out */
359: PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs);
361: /* must be semi-pos def - only pairwise depends on this */
362: /* LATER - remove this restriction */
363: if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
364: if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");
366: gs->nel_min = vals[0];
367: gs->nel_max = vals[1];
368: gs->nel_sum = vals[2];
369: gs->gl_min = vals[3];
370: gs->gl_max = vals[4];
371: gs->negl = vals[4] - vals[3] + 1;
373: if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);
375: /* LATER :: add level == -1 -> program selects level */
376: if (vals[5] < 0) vals[5] = 0;
377: else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
378: gs->level = vals[5];
380: return (gs);
381: }
383: /******************************************************************************/
384: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
385: {
386: PetscInt i, nel, *elms;
387: PetscInt t1;
388: PetscInt **reduce;
389: PetscInt *map;
391: /* totally local removes ... PCTFS_ct_bits == 0 */
392: get_ngh_buf(gs);
394: if (gs->level) set_pairwise(gs);
395: if (gs->max_left_over) set_tree(gs);
397: /* intersection local and pairwise/tree? */
398: gs->num_local_total = gs->num_local;
399: gs->gop_local_reduce = gs->local_reduce;
400: gs->num_gop_local_reduce = gs->num_local_reduce;
402: map = gs->companion;
404: /* is there any local compression */
405: if (!gs->num_local) {
406: gs->local_strength = NONE;
407: gs->num_local_gop = 0;
408: } else {
409: /* ok find intersection */
410: map = gs->companion;
411: reduce = gs->local_reduce;
412: for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
413: if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
414: t1++;
416: gs->num_local_reduce[i] *= -1;
417: }
418: **reduce = map[**reduce];
419: }
421: /* intersection is empty */
422: if (!t1) {
423: gs->local_strength = FULL;
424: gs->num_local_gop = 0;
425: } else { /* intersection not empty */
426: gs->local_strength = PARTIAL;
428: PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);
430: gs->num_local_gop = t1;
431: gs->num_local_total = gs->num_local;
432: gs->num_local -= t1;
433: gs->gop_local_reduce = gs->local_reduce;
434: gs->num_gop_local_reduce = gs->num_local_reduce;
436: for (i = 0; i < t1; i++) {
438: gs->num_gop_local_reduce[i] *= -1;
439: gs->local_reduce++;
440: gs->num_local_reduce++;
441: }
442: gs->local_reduce++;
443: gs->num_local_reduce++;
444: }
445: }
447: elms = gs->pw_elm_list;
448: nel = gs->len_pw_list;
449: for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
451: elms = gs->tree_map_in;
452: nel = gs->tree_map_sz;
453: for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
455: /* clean up */
456: free((void *)gs->local_elms);
457: free((void *)gs->companion);
458: free((void *)gs->elms);
459: free((void *)gs->ngh_buf);
460: gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
461: return 0;
462: }
464: /******************************************************************************/
465: static PetscErrorCode place_in_tree(PetscInt elm)
466: {
467: PetscInt *tp, n;
469: if (ntree == tree_buf_sz) {
470: if (tree_buf_sz) {
471: tp = tree_buf;
472: n = tree_buf_sz;
473: tree_buf_sz <<= 1;
474: tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
475: PCTFS_ivec_copy(tree_buf, tp, n);
476: free(tp);
477: } else {
478: tree_buf_sz = TREE_BUF_SZ;
479: tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
480: }
481: }
483: tree_buf[ntree++] = elm;
484: return 0;
485: }
487: /******************************************************************************/
488: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
489: {
490: PetscInt i, j, npw = 0, ntree_map = 0;
491: PetscInt p_mask_size, ngh_buf_size, buf_size;
492: PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
493: PetscInt *ngh_buf, *buf1, *buf2;
494: PetscInt offset, per_load, num_loads, or_ct, start, end;
495: PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
496: PetscInt oper = GL_B_OR;
497: PetscInt *ptr3, *t_mask, level, ct1, ct2;
499: /* to make life easier */
500: nel = gs->nel;
501: elms = gs->elms;
502: level = gs->level;
504: /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
505: p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
506: PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id);
508: /* allocate space for masks and info bufs */
509: gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
510: gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
511: gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
512: t_mask = (PetscInt *)malloc(p_mask_size);
513: gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);
515: /* comm buffer size ... memory usage bounded by ~2*msg_buf */
516: /* had thought I could exploit rendezvous threshold */
518: /* default is one pass */
519: per_load = negl = gs->negl;
520: gs->num_loads = num_loads = 1;
521: i = p_mask_size * negl;
523: /* possible overflow on buffer size */
524: /* overflow hack */
525: if (i < 0) i = INT_MAX;
527: buf_size = PetscMin(msg_buf, i);
529: /* can we do it? */
532: /* get PCTFS_giop buf space ... make *only* one malloc */
533: buf1 = (PetscInt *)malloc(buf_size << 1);
535: /* more than one gior exchange needed? */
536: if (buf_size != i) {
537: per_load = buf_size / p_mask_size;
538: buf_size = per_load * p_mask_size;
539: gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
540: }
542: /* convert buf sizes from #bytes to #ints - 32 bit only! */
543: p_mask_size /= sizeof(PetscInt);
544: ngh_buf_size /= sizeof(PetscInt);
545: buf_size /= sizeof(PetscInt);
547: /* find PCTFS_giop work space */
548: buf2 = buf1 + buf_size;
550: /* hold #ints needed for processor masks */
551: gs->mask_sz = p_mask_size;
553: /* init buffers */
554: PCTFS_ivec_zero(sh_proc_mask, p_mask_size);
555: PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size);
556: PCTFS_ivec_zero(ngh_buf, ngh_buf_size);
558: /* HACK reset tree info */
559: tree_buf = NULL;
560: tree_buf_sz = ntree = 0;
562: /* ok do it */
563: for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
564: /* identity for bitwise or is 000...000 */
565: PCTFS_ivec_zero(buf1, buf_size);
567: /* load msg buffer */
568: for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
569: offset = (offset - start) * p_mask_size;
570: PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
571: }
573: /* GLOBAL: pass buffer */
574: PCTFS_giop(buf1, buf2, buf_size, &oper);
576: /* unload buffer into ngh_buf */
577: ptr2 = (elms + i_start);
578: for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
579: /* I own it ... may have to pairwise it */
580: if (j == *ptr2) {
581: /* do i share it w/anyone? */
582: ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
583: /* guess not */
584: if (ct1 < 2) {
585: ptr2++;
586: ptr1 += p_mask_size;
587: continue;
588: }
590: /* i do ... so keep info and turn off my bit */
591: PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
592: PCTFS_ivec_xor(ptr1, p_mask, p_mask_size);
593: PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size);
595: /* is it to be done pairwise? */
596: if (--ct1 <= level) {
597: npw++;
599: /* turn on high bit to indicate pw need to process */
600: *ptr2++ |= TOP_BIT;
601: PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size);
602: ptr1 += p_mask_size;
603: continue;
604: }
606: /* get set for next and note that I have a tree contribution */
607: /* could save exact elm index for tree here -> save a search */
608: ptr2++;
609: ptr1 += p_mask_size;
610: ntree_map++;
611: } else { /* i don't but still might be involved in tree */
613: /* shared by how many? */
614: ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
616: /* none! */
617: if (ct1 < 2) continue;
619: /* is it going to be done pairwise? but not by me of course!*/
620: if (--ct1 <= level) continue;
621: }
622: /* LATER we're going to have to process it NOW */
623: /* nope ... tree it */
624: place_in_tree(j);
625: }
626: }
628: free((void *)t_mask);
629: free((void *)buf1);
631: gs->len_pw_list = npw;
632: gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
634: /* expand from bit mask list to int list and save ngh list */
635: gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
636: PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs);
638: gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));
640: oper = GL_MAX;
641: ct1 = gs->num_nghs;
642: PCTFS_giop(&ct1, &ct2, 1, &oper);
643: gs->max_nghs = ct1;
645: gs->tree_map_sz = ntree_map;
646: gs->max_left_over = ntree;
648: free((void *)p_mask);
649: free((void *)sh_proc_mask);
650: return 0;
651: }
653: /******************************************************************************/
654: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
655: {
656: PetscInt i, j;
657: PetscInt p_mask_size;
658: PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
659: PetscInt *ngh_buf, *buf2;
660: PetscInt offset;
661: PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
662: PetscInt *pairwise_elm_list, len_pair_list = 0;
663: PetscInt *iptr, t1, i_start, nel, *elms;
664: PetscInt ct;
666: /* to make life easier */
667: nel = gs->nel;
668: elms = gs->elms;
669: ngh_buf = gs->ngh_buf;
670: sh_proc_mask = gs->pw_nghs;
672: /* need a few temp masks */
673: p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes);
674: p_mask = (PetscInt *)malloc(p_mask_size);
675: tmp_proc_mask = (PetscInt *)malloc(p_mask_size);
677: /* set mask to my PCTFS_my_id's bit mask */
678: PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id);
680: p_mask_size /= sizeof(PetscInt);
682: len_pair_list = gs->len_pw_list;
683: gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));
685: /* how many processors (nghs) do we have to exchange with? */
686: nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
688: /* allocate space for PCTFS_gs_gop() info */
689: gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
690: gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
691: gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));
693: /* init msg_size list */
694: PCTFS_ivec_zero(msg_size, nprs);
696: /* expand from bit mask list to int list */
697: PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list);
699: /* keep list of elements being handled pairwise */
700: for (i = j = 0; i < nel; i++) {
701: if (elms[i] & TOP_BIT) {
702: elms[i] ^= TOP_BIT;
703: pairwise_elm_list[j++] = i;
704: }
705: }
706: pairwise_elm_list[j] = -1;
708: gs->msg_ids_out = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
709: gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
710: gs->msg_ids_in = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
711: gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
712: gs->pw_vals = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);
714: /* find who goes to each processor */
715: for (i_start = i = 0; i < nprs; i++) {
716: /* processor i's mask */
717: PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]);
719: /* det # going to processor i */
720: for (ct = j = 0; j < len_pair_list; j++) {
721: buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
722: PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size);
723: if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
724: }
725: msg_size[i] = ct;
726: i_start = PetscMax(i_start, ct);
728: /*space to hold nodes in message to first neighbor */
729: msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));
731: for (j = 0; j < len_pair_list; j++) {
732: buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
733: PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size);
734: if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
735: }
736: *iptr = -1;
737: }
738: msg_nodes[nprs] = NULL;
740: j = gs->loc_node_pairs = i_start;
741: t1 = GL_MAX;
742: PCTFS_giop(&i_start, &offset, 1, &t1);
743: gs->max_node_pairs = i_start;
745: i_start = j;
746: t1 = GL_MIN;
747: PCTFS_giop(&i_start, &offset, 1, &t1);
748: gs->min_node_pairs = i_start;
750: i_start = j;
751: t1 = GL_ADD;
752: PCTFS_giop(&i_start, &offset, 1, &t1);
753: gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;
755: i_start = nprs;
756: t1 = GL_MAX;
757: PCTFS_giop(&i_start, &offset, 1, &t1);
758: gs->max_pairs = i_start;
760: /* remap pairwise in tail of gsi_via_bit_mask() */
761: gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
762: gs->out = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
763: gs->in = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
765: /* reset malloc pool */
766: free((void *)p_mask);
767: free((void *)tmp_proc_mask);
768: return 0;
769: }
771: /* to do pruned tree just save ngh buf copy for each one and decode here!
772: ******************************************************************************/
773: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
774: {
775: PetscInt i, j, n, nel;
776: PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
778: /* local work ptrs */
779: elms = gs->elms;
780: nel = gs->nel;
782: /* how many via tree */
783: gs->tree_nel = n = ntree;
784: gs->tree_elms = tree_elms = iptr_in = tree_buf;
785: gs->tree_buf = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
786: gs->tree_work = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
787: j = gs->tree_map_sz;
788: gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
789: gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
791: /* search the longer of the two lists */
792: /* note ... could save this info in get_ngh_buf and save searches */
793: if (n <= nel) {
794: /* bijective fct w/remap - search elm list */
795: for (i = 0; i < n; i++) {
796: if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
797: *iptr_in++ = j;
798: *iptr_out++ = i;
799: }
800: }
801: } else {
802: for (i = 0; i < nel; i++) {
803: if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
804: *iptr_in++ = i;
805: *iptr_out++ = j;
806: }
807: }
808: }
810: /* sentinel */
811: *iptr_in = *iptr_out = -1;
812: return 0;
813: }
815: /******************************************************************************/
816: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
817: {
818: PetscInt *num, *map, **reduce;
819: PetscScalar tmp;
821: num = gs->num_gop_local_reduce;
822: reduce = gs->gop_local_reduce;
823: while ((map = *reduce++)) {
824: /* wall */
825: if (*num == 2) {
826: num++;
827: vals[map[1]] = vals[map[0]];
828: } else if (*num == 3) { /* corner shared by three elements */
829: num++;
830: vals[map[2]] = vals[map[1]] = vals[map[0]];
831: } else if (*num == 4) { /* corner shared by four elements */
832: num++;
833: vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
834: } else { /* general case ... odd geoms ... 3D*/
835: num++;
836: tmp = *(vals + *map++);
837: while (*map >= 0) *(vals + *map++) = tmp;
838: }
839: }
840: return 0;
841: }
843: /******************************************************************************/
844: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
845: {
846: PetscInt *num, *map, **reduce;
847: PetscScalar tmp;
849: num = gs->num_local_reduce;
850: reduce = gs->local_reduce;
851: while ((map = *reduce)) {
852: /* wall */
853: if (*num == 2) {
854: num++;
855: reduce++;
856: vals[map[1]] = vals[map[0]] += vals[map[1]];
857: } else if (*num == 3) { /* corner shared by three elements */
858: num++;
859: reduce++;
860: vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
861: } else if (*num == 4) { /* corner shared by four elements */
862: num++;
863: reduce++;
864: vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
865: } else { /* general case ... odd geoms ... 3D*/
866: num++;
867: tmp = 0.0;
868: while (*map >= 0) tmp += *(vals + *map++);
870: map = *reduce++;
871: while (*map >= 0) *(vals + *map++) = tmp;
872: }
873: }
874: return 0;
875: }
877: /******************************************************************************/
878: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
879: {
880: PetscInt *num, *map, **reduce;
881: PetscScalar *base;
883: num = gs->num_gop_local_reduce;
884: reduce = gs->gop_local_reduce;
885: while ((map = *reduce++)) {
886: /* wall */
887: if (*num == 2) {
888: num++;
889: vals[map[0]] += vals[map[1]];
890: } else if (*num == 3) { /* corner shared by three elements */
891: num++;
892: vals[map[0]] += (vals[map[1]] + vals[map[2]]);
893: } else if (*num == 4) { /* corner shared by four elements */
894: num++;
895: vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
896: } else { /* general case ... odd geoms ... 3D*/
897: num++;
898: base = vals + *map++;
899: while (*map >= 0) *base += *(vals + *map++);
900: }
901: }
902: return 0;
903: }
905: /******************************************************************************/
906: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
907: {
908: PetscInt i;
910: MPI_Comm_free(&gs->PCTFS_gs_comm);
911: if (gs->nghs) free((void *)gs->nghs);
912: if (gs->pw_nghs) free((void *)gs->pw_nghs);
914: /* tree */
915: if (gs->max_left_over) {
916: if (gs->tree_elms) free((void *)gs->tree_elms);
917: if (gs->tree_buf) free((void *)gs->tree_buf);
918: if (gs->tree_work) free((void *)gs->tree_work);
919: if (gs->tree_map_in) free((void *)gs->tree_map_in);
920: if (gs->tree_map_out) free((void *)gs->tree_map_out);
921: }
923: /* pairwise info */
924: if (gs->num_pairs) {
925: /* should be NULL already */
926: if (gs->ngh_buf) free((void *)gs->ngh_buf);
927: if (gs->elms) free((void *)gs->elms);
928: if (gs->local_elms) free((void *)gs->local_elms);
929: if (gs->companion) free((void *)gs->companion);
931: /* only set if pairwise */
932: if (gs->vals) free((void *)gs->vals);
933: if (gs->in) free((void *)gs->in);
934: if (gs->out) free((void *)gs->out);
935: if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
936: if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
937: if (gs->pw_vals) free((void *)gs->pw_vals);
938: if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
939: if (gs->node_list) {
940: for (i = 0; i < gs->num_pairs; i++) {
941: if (gs->node_list[i]) free((void *)gs->node_list[i]);
942: }
943: free((void *)gs->node_list);
944: }
945: if (gs->msg_sizes) free((void *)gs->msg_sizes);
946: if (gs->pair_list) free((void *)gs->pair_list);
947: }
949: /* local info */
950: if (gs->num_local_total >= 0) {
951: for (i = 0; i < gs->num_local_total + 1; i++) {
952: if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
953: }
954: }
956: /* if intersection tree/pairwise and local isn't empty */
957: if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
958: if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);
960: free((void *)gs);
961: return 0;
962: }
964: /******************************************************************************/
965: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
966: {
967: switch (*op) {
968: case '+':
969: PCTFS_gs_gop_vec_plus(gs, vals, step);
970: break;
971: default:
972: PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]);
973: PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n");
974: PCTFS_gs_gop_vec_plus(gs, vals, step);
975: break;
976: }
977: return 0;
978: }
980: /******************************************************************************/
981: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
982: {
985: /* local only operations!!! */
986: if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs, vals, step);
988: /* if intersection tree/pairwise and local isn't empty */
989: if (gs->num_local_gop) {
990: PCTFS_gs_gop_vec_local_in_plus(gs, vals, step);
992: /* pairwise */
993: if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step);
995: /* tree */
996: else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, vals, step);
998: PCTFS_gs_gop_vec_local_out(gs, vals, step);
999: } else { /* if intersection tree/pairwise and local is empty */
1000: /* pairwise */
1001: if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step);
1003: /* tree */
1004: else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, vals, step);
1005: }
1006: return 0;
1007: }
1009: /******************************************************************************/
1010: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1011: {
1012: PetscInt *num, *map, **reduce;
1013: PetscScalar *base;
1015: num = gs->num_local_reduce;
1016: reduce = gs->local_reduce;
1017: while ((map = *reduce)) {
1018: base = vals + map[0] * step;
1020: /* wall */
1021: if (*num == 2) {
1022: num++;
1023: reduce++;
1024: PCTFS_rvec_add(base, vals + map[1] * step, step);
1025: PCTFS_rvec_copy(vals + map[1] * step, base, step);
1026: } else if (*num == 3) { /* corner shared by three elements */
1027: num++;
1028: reduce++;
1029: PCTFS_rvec_add(base, vals + map[1] * step, step);
1030: PCTFS_rvec_add(base, vals + map[2] * step, step);
1031: PCTFS_rvec_copy(vals + map[2] * step, base, step);
1032: PCTFS_rvec_copy(vals + map[1] * step, base, step);
1033: } else if (*num == 4) { /* corner shared by four elements */
1034: num++;
1035: reduce++;
1036: PCTFS_rvec_add(base, vals + map[1] * step, step);
1037: PCTFS_rvec_add(base, vals + map[2] * step, step);
1038: PCTFS_rvec_add(base, vals + map[3] * step, step);
1039: PCTFS_rvec_copy(vals + map[3] * step, base, step);
1040: PCTFS_rvec_copy(vals + map[2] * step, base, step);
1041: PCTFS_rvec_copy(vals + map[1] * step, base, step);
1042: } else { /* general case ... odd geoms ... 3D */
1043: num++;
1044: while (*++map >= 0) PCTFS_rvec_add(base, vals + *map * step, step);
1046: map = *reduce;
1047: while (*++map >= 0) PCTFS_rvec_copy(vals + *map * step, base, step);
1049: reduce++;
1050: }
1051: }
1052: return 0;
1053: }
1055: /******************************************************************************/
1056: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1057: {
1058: PetscInt *num, *map, **reduce;
1059: PetscScalar *base;
1061: num = gs->num_gop_local_reduce;
1062: reduce = gs->gop_local_reduce;
1063: while ((map = *reduce++)) {
1064: base = vals + map[0] * step;
1066: /* wall */
1067: if (*num == 2) {
1068: num++;
1069: PCTFS_rvec_add(base, vals + map[1] * step, step);
1070: } else if (*num == 3) { /* corner shared by three elements */
1071: num++;
1072: PCTFS_rvec_add(base, vals + map[1] * step, step);
1073: PCTFS_rvec_add(base, vals + map[2] * step, step);
1074: } else if (*num == 4) { /* corner shared by four elements */
1075: num++;
1076: PCTFS_rvec_add(base, vals + map[1] * step, step);
1077: PCTFS_rvec_add(base, vals + map[2] * step, step);
1078: PCTFS_rvec_add(base, vals + map[3] * step, step);
1079: } else { /* general case ... odd geoms ... 3D*/
1080: num++;
1081: while (*++map >= 0) PCTFS_rvec_add(base, vals + *map * step, step);
1082: }
1083: }
1084: return 0;
1085: }
1087: /******************************************************************************/
1088: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1089: {
1090: PetscInt *num, *map, **reduce;
1091: PetscScalar *base;
1093: num = gs->num_gop_local_reduce;
1094: reduce = gs->gop_local_reduce;
1095: while ((map = *reduce++)) {
1096: base = vals + map[0] * step;
1098: /* wall */
1099: if (*num == 2) {
1100: num++;
1101: PCTFS_rvec_copy(vals + map[1] * step, base, step);
1102: } else if (*num == 3) { /* corner shared by three elements */
1103: num++;
1104: PCTFS_rvec_copy(vals + map[1] * step, base, step);
1105: PCTFS_rvec_copy(vals + map[2] * step, base, step);
1106: } else if (*num == 4) { /* corner shared by four elements */
1107: num++;
1108: PCTFS_rvec_copy(vals + map[1] * step, base, step);
1109: PCTFS_rvec_copy(vals + map[2] * step, base, step);
1110: PCTFS_rvec_copy(vals + map[3] * step, base, step);
1111: } else { /* general case ... odd geoms ... 3D*/
1112: num++;
1113: while (*++map >= 0) PCTFS_rvec_copy(vals + *map * step, base, step);
1114: }
1115: }
1116: return 0;
1117: }
1119: /******************************************************************************/
1120: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1121: {
1122: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1123: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1124: PetscInt *pw, *list, *size, **nodes;
1125: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1126: MPI_Status status;
1127: PetscBLASInt i1 = 1, dstep;
1129: /* strip and load s */
1130: msg_list = list = gs->pair_list;
1131: msg_size = size = gs->msg_sizes;
1132: msg_nodes = nodes = gs->node_list;
1133: iptr = pw = gs->pw_elm_list;
1134: dptr1 = dptr3 = gs->pw_vals;
1135: msg_ids_in = ids_in = gs->msg_ids_in;
1136: msg_ids_out = ids_out = gs->msg_ids_out;
1137: dptr2 = gs->out;
1138: in1 = in2 = gs->in;
1140: /* post the receives */
1141: /* msg_nodes=nodes; */
1142: do {
1143: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1144: second one *list and do list++ afterwards */
1145: MPI_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1146: list++;
1147: msg_ids_in++;
1148: in1 += *size++ * step;
1149: } while (*++msg_nodes);
1150: msg_nodes = nodes;
1152: /* load gs values into in out gs buffers */
1153: while (*iptr >= 0) {
1154: PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step);
1155: dptr3 += step;
1156: iptr++;
1157: }
1159: /* load out buffers and post the sends */
1160: while ((iptr = *msg_nodes++)) {
1161: dptr3 = dptr2;
1162: while (*iptr >= 0) {
1163: PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step);
1164: dptr2 += step;
1165: iptr++;
1166: }
1167: MPI_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1168: msg_size++;
1169: msg_list++;
1170: msg_ids_out++;
1171: }
1173: /* tree */
1174: if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step);
1176: /* process the received data */
1177: msg_nodes = nodes;
1178: while ((iptr = *nodes++)) {
1179: PetscScalar d1 = 1.0;
1181: /* Should I check the return value of MPI_Wait() or status? */
1182: /* Can this loop be replaced by a call to MPI_Waitall()? */
1183: MPI_Wait(ids_in, &status);
1184: ids_in++;
1185: while (*iptr >= 0) {
1186: PetscBLASIntCast(step, &dstep);
1187: PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1188: in2 += step;
1189: iptr++;
1190: }
1191: }
1193: /* replace vals */
1194: while (*pw >= 0) {
1195: PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step);
1196: dptr1 += step;
1197: pw++;
1198: }
1200: /* clear isend message handles */
1201: /* This changed for clarity though it could be the same */
1203: /* Should I check the return value of MPI_Wait() or status? */
1204: /* Can this loop be replaced by a call to MPI_Waitall()? */
1205: while (*msg_nodes++) {
1206: MPI_Wait(ids_out, &status);
1207: ids_out++;
1208: }
1209: return 0;
1210: }
1212: /******************************************************************************/
1213: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1214: {
1215: PetscInt size, *in, *out;
1216: PetscScalar *buf, *work;
1217: PetscInt op[] = {GL_ADD, 0};
1218: PetscBLASInt i1 = 1;
1219: PetscBLASInt dstep;
1221: /* copy over to local variables */
1222: in = gs->tree_map_in;
1223: out = gs->tree_map_out;
1224: buf = gs->tree_buf;
1225: work = gs->tree_work;
1226: size = gs->tree_nel * step;
1228: /* zero out collection buffer */
1229: PCTFS_rvec_zero(buf, size);
1231: /* copy over my contributions */
1232: while (*in >= 0) {
1233: PetscBLASIntCast(step, &dstep);
1234: PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1235: }
1237: /* perform fan in/out on full buffer */
1238: /* must change PCTFS_grop to handle the blas */
1239: PCTFS_grop(buf, work, size, op);
1241: /* reset */
1242: in = gs->tree_map_in;
1243: out = gs->tree_map_out;
1245: /* get the portion of the results I need */
1246: while (*in >= 0) {
1247: PetscBLASIntCast(step, &dstep);
1248: PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1249: }
1250: return 0;
1251: }
1253: /******************************************************************************/
1254: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1255: {
1256: switch (*op) {
1257: case '+':
1258: PCTFS_gs_gop_plus_hc(gs, vals, dim);
1259: break;
1260: default:
1261: PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]);
1262: PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n");
1263: PCTFS_gs_gop_plus_hc(gs, vals, dim);
1264: break;
1265: }
1266: return 0;
1267: }
1269: /******************************************************************************/
1270: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1271: {
1272: /* if there's nothing to do return */
1273: if (dim <= 0) return 0;
1275: /* can't do more dimensions then exist */
1276: dim = PetscMin(dim, PCTFS_i_log2_num_nodes);
1278: /* local only operations!!! */
1279: if (gs->num_local) PCTFS_gs_gop_local_plus(gs, vals);
1281: /* if intersection tree/pairwise and local isn't empty */
1282: if (gs->num_local_gop) {
1283: PCTFS_gs_gop_local_in_plus(gs, vals);
1285: /* pairwise will do tree inside ... */
1286: if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim); /* tree only */
1287: else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, vals, dim);
1289: PCTFS_gs_gop_local_out(gs, vals);
1290: } else { /* if intersection tree/pairwise and local is empty */
1291: /* pairwise will do tree inside */
1292: if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim); /* tree */
1293: else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, vals, dim);
1294: }
1295: return 0;
1296: }
1298: /******************************************************************************/
1299: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1300: {
1301: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1302: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1303: PetscInt *pw, *list, *size, **nodes;
1304: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1305: MPI_Status status;
1306: PetscInt i, mask = 1;
1308: for (i = 1; i < dim; i++) {
1309: mask <<= 1;
1310: mask++;
1311: }
1313: /* strip and load s */
1314: msg_list = list = gs->pair_list;
1315: msg_size = size = gs->msg_sizes;
1316: msg_nodes = nodes = gs->node_list;
1317: iptr = pw = gs->pw_elm_list;
1318: dptr1 = dptr3 = gs->pw_vals;
1319: msg_ids_in = ids_in = gs->msg_ids_in;
1320: msg_ids_out = ids_out = gs->msg_ids_out;
1321: dptr2 = gs->out;
1322: in1 = in2 = gs->in;
1324: /* post the receives */
1325: /* msg_nodes=nodes; */
1326: do {
1327: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1328: second one *list and do list++ afterwards */
1329: if ((PCTFS_my_id | mask) == (*list | mask)) {
1330: MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1331: list++;
1332: msg_ids_in++;
1333: in1 += *size++;
1334: } else {
1335: list++;
1336: size++;
1337: }
1338: } while (*++msg_nodes);
1340: /* load gs values into in out gs buffers */
1341: while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1343: /* load out buffers and post the sends */
1344: msg_nodes = nodes;
1345: list = msg_list;
1346: while ((iptr = *msg_nodes++)) {
1347: if ((PCTFS_my_id | mask) == (*list | mask)) {
1348: dptr3 = dptr2;
1349: while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1350: /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1351: /* is msg_ids_out++ correct? */
1352: MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1353: msg_size++;
1354: list++;
1355: msg_ids_out++;
1356: } else {
1357: list++;
1358: msg_size++;
1359: }
1360: }
1362: /* do the tree while we're waiting */
1363: if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim);
1365: /* process the received data */
1366: msg_nodes = nodes;
1367: list = msg_list;
1368: while ((iptr = *nodes++)) {
1369: if ((PCTFS_my_id | mask) == (*list | mask)) {
1370: /* Should I check the return value of MPI_Wait() or status? */
1371: /* Can this loop be replaced by a call to MPI_Waitall()? */
1372: MPI_Wait(ids_in, &status);
1373: ids_in++;
1374: while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1375: }
1376: list++;
1377: }
1379: /* replace vals */
1380: while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1382: /* clear isend message handles */
1383: /* This changed for clarity though it could be the same */
1384: while (*msg_nodes++) {
1385: if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
1386: /* Should I check the return value of MPI_Wait() or status? */
1387: /* Can this loop be replaced by a call to MPI_Waitall()? */
1388: MPI_Wait(ids_out, &status);
1389: ids_out++;
1390: }
1391: msg_list++;
1392: }
1393: return 0;
1394: }
1396: /******************************************************************************/
1397: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1398: {
1399: PetscInt size;
1400: PetscInt *in, *out;
1401: PetscScalar *buf, *work;
1402: PetscInt op[] = {GL_ADD, 0};
1404: in = gs->tree_map_in;
1405: out = gs->tree_map_out;
1406: buf = gs->tree_buf;
1407: work = gs->tree_work;
1408: size = gs->tree_nel;
1410: PCTFS_rvec_zero(buf, size);
1412: while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1414: in = gs->tree_map_in;
1415: out = gs->tree_map_out;
1417: PCTFS_grop_hc(buf, work, size, op, dim);
1419: while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1420: return 0;
1421: }