Actual source code: sfimpl.h

  1: #if !defined(PETSCSFIMPL_H)
  2: #define PETSCSFIMPL_H

  4: #include <petscvec.h>
  5: #include <petscsf.h>
  6: #include <petscdevice.h>
  7: #include <petsc/private/petscimpl.h>

  9: PETSC_EXTERN PetscLogEvent PETSCSF_SetGraph;
 10: PETSC_EXTERN PetscLogEvent PETSCSF_SetUp;
 11: PETSC_EXTERN PetscLogEvent PETSCSF_BcastBegin;
 12: PETSC_EXTERN PetscLogEvent PETSCSF_BcastEnd;
 15: PETSC_EXTERN PetscLogEvent PETSCSF_ReduceBegin;
 16: PETSC_EXTERN PetscLogEvent PETSCSF_ReduceEnd;
 17: PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpBegin;
 18: PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpEnd;
 19: PETSC_EXTERN PetscLogEvent PETSCSF_EmbedSF;
 20: PETSC_EXTERN PetscLogEvent PETSCSF_DistSect;
 21: PETSC_EXTERN PetscLogEvent PETSCSF_SectSF;
 22: PETSC_EXTERN PetscLogEvent PETSCSF_RemoteOff;
 23: PETSC_EXTERN PetscLogEvent PETSCSF_Pack;
 24: PETSC_EXTERN PetscLogEvent PETSCSF_Unpack;

 26: typedef enum {PETSCSF_ROOT2LEAF=0, PETSCSF_LEAF2ROOT} PetscSFDirection;
 27: typedef enum {PETSCSF_BCAST=0, PETSCSF_REDUCE, PETSCSF_FETCH} PetscSFOperation;
 28: /* When doing device-aware MPI, a backend refers to the SF/device interface */
 29: typedef enum {PETSCSF_BACKEND_INVALID=0,PETSCSF_BACKEND_CUDA,PETSCSF_BACKEND_HIP,PETSCSF_BACKEND_KOKKOS} PetscSFBackend;

 31: struct _PetscSFOps {
 32:   PetscErrorCode (*Reset)(PetscSF);
 33:   PetscErrorCode (*Destroy)(PetscSF);
 34:   PetscErrorCode (*SetUp)(PetscSF);
 35:   PetscErrorCode (*SetFromOptions)(PetscOptionItems*,PetscSF);
 36:   PetscErrorCode (*View)(PetscSF,PetscViewer);
 37:   PetscErrorCode (*Duplicate)(PetscSF,PetscSFDuplicateOption,PetscSF);
 38:   PetscErrorCode (*BcastBegin)     (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);
 39:   PetscErrorCode (*BcastEnd)       (PetscSF,MPI_Datatype,const void*,void*,MPI_Op);
 40:   PetscErrorCode (*ReduceBegin)    (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);
 41:   PetscErrorCode (*ReduceEnd)      (PetscSF,MPI_Datatype,const void*,void*,MPI_Op);
 42:   PetscErrorCode (*FetchAndOpBegin)(PetscSF,MPI_Datatype,PetscMemType,void*,PetscMemType,const void*,void*,MPI_Op);
 43:   PetscErrorCode (*FetchAndOpEnd)  (PetscSF,MPI_Datatype,void*,const void*,void*,MPI_Op);
 44:   PetscErrorCode (*BcastToZero)    (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,      void*); /* For internal use only */
 45:   PetscErrorCode (*GetRootRanks)(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**,const PetscInt**);
 46:   PetscErrorCode (*GetLeafRanks)(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**);
 47:   PetscErrorCode (*CreateLocalSF)(PetscSF,PetscSF*);
 48:   PetscErrorCode (*GetGraph)(PetscSF,PetscInt*,PetscInt*,const PetscInt**,const PetscSFNode**);
 49:   PetscErrorCode (*CreateEmbeddedRootSF)(PetscSF,PetscInt,const PetscInt*,PetscSF*);
 50:   PetscErrorCode (*CreateEmbeddedLeafSF)(PetscSF,PetscInt,const PetscInt*,PetscSF*);

 52:   PetscErrorCode (*Malloc)(PetscMemType,size_t,void**);
 53:   PetscErrorCode (*Free)(PetscMemType,void*);
 54: };
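
/* Illustrative sketch (not part of this header, hence guarded out): an SF implementation
   fills in the function table above from its create routine; the _MyImpl names below are
   hypothetical placeholders, not existing PETSc functions. */
#if 0
static PetscErrorCode PetscSFCreate_MyImpl(PetscSF sf)
{
  PetscFunctionBegin;
  sf->ops->SetUp      = PetscSFSetUp_MyImpl;      /* build rank lists and buffers     */
  sf->ops->BcastBegin = PetscSFBcastBegin_MyImpl; /* start root-to-leaf communication */
  sf->ops->BcastEnd   = PetscSFBcastEnd_MyImpl;   /* finish communication and unpack  */
  sf->ops->Reset      = PetscSFReset_MyImpl;
  sf->ops->Destroy    = PetscSFDestroy_MyImpl;
  sf->data            = NULL;                     /* implementation-private state     */
  PetscFunctionReturn(0);
}
#endif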

 56: typedef struct _n_PetscSFPackOpt *PetscSFPackOpt;

 58: struct _p_PetscSF {
 59:   PETSCHEADER(struct _PetscSFOps);
 60:   struct { /* Fields needed to implement VecScatter behavior */
 61:     PetscInt          from_n,to_n;   /* Recorded local sizes of the input from/to vectors in VecScatterCreate(). Used subsequently for error checking. */
 62:     PetscBool         beginandendtogether;  /* Indicates that the scatter begin and end functions are called together; VecScatterEnd() is then treated as a no-op */
 63:     PetscBool         packongpu;     /* For GPU vectors, pack needed entries on GPU instead of pulling the whole vector down to CPU and then packing on CPU */
 64:     const PetscScalar *xdata;        /* Vector data to read from */
 65:     PetscScalar       *ydata;        /* Vector data to write to. The two pointers are recorded in VecScatterBegin. Memory is not managed by SF. */
 66:     PetscSF           lsf;           /* The local part of the scatter, used in SCATTER_LOCAL. Built on demand. */
 67:     PetscInt          bs;            /* Block size, determined by IS passed to VecScatterCreate */
 68:     MPI_Datatype      unit;          /* one unit = bs PetscScalars */
 69:     PetscBool         logging;       /* Indicate if vscat log events are happening. If yes, avoid duplicated SF logging to have clear -log_view */
 70:   } vscat;

 72:   /* Fields for generic PetscSF functionality */
 73:   PetscInt        nroots;          /* Number of root vertices on current process (candidates for incoming edges) */
 74:   PetscInt        nleaves;         /* Number of leaf vertices on current process (this process specifies a root for each leaf) */
 75:   PetscInt        *mine;           /* Location of leaves in leafdata arrays provided to the communication routines */
 76:   PetscInt        *mine_alloc;
 77:   PetscInt        minleaf,maxleaf;
 78:   PetscSFNode     *remote;         /* Remote references to roots for each local leaf */
 79:   PetscSFNode     *remote_alloc;
 80:   PetscInt        nranks;          /* Number of ranks owning roots connected to my leaves */
 81:   PetscInt        ndranks;         /* Number of ranks in distinguished group holding roots connected to my leaves */
 82:   PetscMPIInt     *ranks;          /* List of ranks referenced by "remote" */
 83:   PetscInt        *roffset;        /* Array of length nranks+1, offset in rmine/rremote for each rank */
 84:   PetscInt        *rmine;          /* Concatenated array holding local indices referencing each remote rank */
 85:   PetscInt        *rmine_d[2];     /* A copy of rmine[local/remote] in device memory if needed */

 87:   /* Some results useful in packing by analyzing rmine[] */
 88:   PetscInt        leafbuflen[2];   /* Length (in unit) of leaf buffers, in layout of [PETSCSF_LOCAL/REMOTE] */
 89:   PetscBool       leafcontig[2];   /* True means indices in rmine[self part] or rmine[remote part] are contiguous, and they start from ... */
 90:   PetscInt        leafstart[2];    /* ... leafstart[0] and leafstart[1] respectively */
 91:   PetscSFPackOpt  leafpackopt[2];  /* Optimization plans to (un)pack leaves connected to remote roots, based on index patterns in rmine[]. NULL for no optimization */
 92:   PetscSFPackOpt  leafpackopt_d[2];/* Copy of leafpackopt[] on device if needed */
 93:   PetscBool       leafdups[2];     /* Indices in rmine[] for self(0)/remote(1) communication have dups respectively? TRUE implies threads working on them in parallel may have a data race. */

 95:   PetscInt        nleafreqs;       /* Number of MPI requests for leaves */
 96:   PetscInt        *rremote;        /* Concatenated array holding remote indices referenced for each remote rank */
 97:   PetscBool       degreeknown;     /* The degree is currently known, do not have to recompute */
 98:   PetscInt        *degree;         /* Degree of each of my root vertices */
 99:   PetscInt        *degreetmp;      /* Temporary local array for computing degree */
100:   PetscBool       rankorder;       /* Sort ranks for gather and scatter operations */
101:   MPI_Group       ingroup;         /* Group of processes connected to my roots */
102:   MPI_Group       outgroup;        /* Group of processes connected to my leaves */
103:   PetscSF         multi;           /* Internal graph used to implement gather and scatter operations */
104:   PetscBool       graphset;        /* Flag indicating that the graph has been set, required before calling communication routines */
105:   PetscBool       setupcalled;     /* Type and communication structures have been set up */
106:   PetscSFPattern  pattern;         /* Pattern of the graph */
107:   PetscBool       persistent;      /* Does this SF use MPI persistent requests for communication */
108:   PetscLayout     map;             /* Layout of leaves over all processes when building a patterned graph */
109:   PetscBool       unknown_input_stream;/* If true, SF does not know which streams root/leafdata is on. Default is false, since we only use the default PETSc stream */
110:   PetscBool       use_gpu_aware_mpi;   /* If true, SF assumes it can pass GPU pointers to MPI */
111:   PetscBool       use_stream_aware_mpi;/* If true, SF assumes the underlying MPI is CUDA stream-aware and we won't sync streams for send/recv buffers passed to MPI */
112:   PetscInt        maxResidentThreadsPerGPU;
113:   PetscSFBackend  backend;         /* The device backend (if any) SF will use */
114:   void *data;                      /* Pointer to implementation */

116:  #if defined(PETSC_HAVE_NVSHMEM)
117:   PetscBool       use_nvshmem;     /* Try to use NVSHMEM on CUDA devices with this SF when possible */
118:   PetscBool       use_nvshmem_get; /* If true, use an nvshmem_get based protocol; otherwise, use an nvshmem_put based protocol */
119:   PetscBool       checked_nvshmem_eligibility; /* Have we checked eligibility of using NVSHMEM on this sf? */
120:   PetscBool       setup_nvshmem;   /* Have we already set up NVSHMEM related fields below? These fields are built on-demand */
121:   PetscInt        leafbuflen_rmax; /* max leafbuflen[REMOTE] over comm */
122:   PetscInt        nRemoteRootRanks;/* nranks - ndranks */
123:   PetscInt        nRemoteRootRanksMax; /* max nranks-ndranks over comm */

125:   /* The following two fields look confusing but actually make sense: They are offsets of buffers at the remote side. We're doing one-sided communication! */
126:   PetscInt        *rootsigdisp;    /* [nRemoteRootRanks]. For my i-th remote root rank, I will access its rootsigdisp[i]-th root signal */
127:   PetscInt        *rootbufdisp;    /* [nRemoteRootRanks]. For my i-th remote root rank, I will access its root buf at offset rootbufdisp[i], in <unit> to be set */

129:   PetscInt        *rootbufdisp_d;
130:   PetscInt        *rootsigdisp_d;  /* Copy of rootsigdisp[] on device */
131:   PetscMPIInt     *ranks_d;        /* Copy of the remote part of (root) ranks[] on device */
132:   PetscInt        *roffset_d;      /* Copy of the remote part of roffset[] on device */
133:  #endif
134: };
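
/* Illustrative sketch (not part of this header, hence guarded out): the fields
   nroots/nleaves/mine/remote above are populated through the public PetscSFSetGraph().
   The sizes and indices below are made up for illustration. */
#if 0
PetscSF        sf;
PetscSFNode    iremote[2];
PetscErrorCode ierr;

ierr = PetscSFCreate(PETSC_COMM_WORLD,&sf);CHKERRQ(ierr);
iremote[0].rank = 0; iremote[0].index = 3;  /* leaf 0 mirrors root 3 on rank 0 */
iremote[1].rank = 1; iremote[1].index = 0;  /* leaf 1 mirrors root 0 on rank 1 */
/* 5 local roots, 2 leaves; ilocal=NULL means leaves are contiguous 0..1, so sf->mine stays NULL */
ierr = PetscSFSetGraph(sf,5,2,NULL,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
#endif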

136: PETSC_EXTERN PetscBool PetscSFRegisterAllCalled;
137: PETSC_EXTERN PetscErrorCode PetscSFRegisterAll(void);

139: PETSC_INTERN PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF,PetscSF*);
140: PETSC_INTERN PetscErrorCode PetscSFBcastToZero_Private(PetscSF,MPI_Datatype,const void*,void*);

142: PETSC_EXTERN PetscErrorCode MPIPetsc_Type_unwrap(MPI_Datatype,MPI_Datatype*,PetscBool*);
143: PETSC_EXTERN PetscErrorCode MPIPetsc_Type_compare(MPI_Datatype,MPI_Datatype,PetscBool*);
144: PETSC_EXTERN PetscErrorCode MPIPetsc_Type_compare_contig(MPI_Datatype,MPI_Datatype,PetscInt*);

146: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
147: #define MPIU_Iscatter(a,b,c,d,e,f,g,h,req)     MPI_Iscatter(a,b,c,d,e,f,g,h,req)
148: #define MPIU_Iscatterv(a,b,c,d,e,f,g,h,i,req)  MPI_Iscatterv(a,b,c,d,e,f,g,h,i,req)
149: #define MPIU_Igather(a,b,c,d,e,f,g,h,req)      MPI_Igather(a,b,c,d,e,f,g,h,req)
150: #define MPIU_Igatherv(a,b,c,d,e,f,g,h,i,req)   MPI_Igatherv(a,b,c,d,e,f,g,h,i,req)
151: #define MPIU_Iallgather(a,b,c,d,e,f,g,req)     MPI_Iallgather(a,b,c,d,e,f,g,req)
152: #define MPIU_Iallgatherv(a,b,c,d,e,f,g,h,req)  MPI_Iallgatherv(a,b,c,d,e,f,g,h,req)
153: #define MPIU_Ialltoall(a,b,c,d,e,f,g,req)      MPI_Ialltoall(a,b,c,d,e,f,g,req)
154: #else
155: /* Ignore req, the MPI_Request argument, and use MPI blocking collectives. One should initialize req
156:    to MPI_REQUEST_NULL so that one can call MPI_Wait(req,status) regardless of whether the call was blocking or not.
157:  */
158: #define MPIU_Iscatter(a,b,c,d,e,f,g,h,req)     MPI_Scatter(a,b,c,d,e,f,g,h)
159: #define MPIU_Iscatterv(a,b,c,d,e,f,g,h,i,req)  MPI_Scatterv(a,b,c,d,e,f,g,h,i)
160: #define MPIU_Igather(a,b,c,d,e,f,g,h,req)      MPI_Gather(a,b,c,d,e,f,g,h)
161: #define MPIU_Igatherv(a,b,c,d,e,f,g,h,i,req)   MPI_Gatherv(a,b,c,d,e,f,g,h,i)
162: #define MPIU_Iallgather(a,b,c,d,e,f,g,req)     MPI_Allgather(a,b,c,d,e,f,g)
163: #define MPIU_Iallgatherv(a,b,c,d,e,f,g,h,req)  MPI_Allgatherv(a,b,c,d,e,f,g,h)
164: #define MPIU_Ialltoall(a,b,c,d,e,f,g,req)      MPI_Alltoall(a,b,c,d,e,f,g)
165: #endif
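
/* Illustrative sketch (not part of this header, hence guarded out): initializing req to
   MPI_REQUEST_NULL lets callers MPI_Wait() uniformly, whether MPIU_Iallgather() expanded to
   the nonblocking or the blocking form; sendv, recvv and comm are assumed to exist. */
#if 0
MPI_Request req = MPI_REQUEST_NULL;

MPIU_Iallgather(&sendv,1,MPIU_INT,recvv,1,MPIU_INT,comm,&req);
/* ... overlap independent work here when nonblocking collectives are available ... */
MPI_Wait(&req,MPI_STATUS_IGNORE); /* returns immediately on MPI_REQUEST_NULL in the blocking case */
#endif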

167: PETSC_EXTERN PetscErrorCode VecScatterGetRemoteCount_Private(VecScatter,PetscBool,PetscInt*,PetscInt*);
168: PETSC_EXTERN PetscErrorCode VecScatterGetRemote_Private(VecScatter,PetscBool,PetscInt*,const PetscInt**,const PetscInt**,const PetscMPIInt**,PetscInt*);
169: PETSC_EXTERN PetscErrorCode VecScatterGetRemoteOrdered_Private(VecScatter,PetscBool,PetscInt*,const PetscInt**,const PetscInt**,const PetscMPIInt**,PetscInt*);
170: PETSC_EXTERN PetscErrorCode VecScatterRestoreRemote_Private(VecScatter,PetscBool,PetscInt*,const PetscInt**,const PetscInt**,const PetscMPIInt**,PetscInt*);
171: PETSC_EXTERN PetscErrorCode VecScatterRestoreRemoteOrdered_Private(VecScatter,PetscBool,PetscInt*,const PetscInt**,const PetscInt**,const PetscMPIInt**,PetscInt*);

173: #if defined(PETSC_HAVE_CUDA)
174: PETSC_EXTERN PetscErrorCode PetscSFMalloc_CUDA(PetscMemType,size_t,void**);
175: PETSC_EXTERN PetscErrorCode PetscSFFree_CUDA(PetscMemType,void*);
176: #endif
177: #if defined(PETSC_HAVE_HIP)
178: PETSC_EXTERN PetscErrorCode PetscSFMalloc_HIP(PetscMemType,size_t,void**);
179: PETSC_EXTERN PetscErrorCode PetscSFFree_HIP(PetscMemType,void*);
180: #endif

182: #if defined(PETSC_HAVE_KOKKOS)
183: PETSC_EXTERN PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType,size_t,void**);
184: PETSC_EXTERN PetscErrorCode PetscSFFree_Kokkos(PetscMemType,void*);
185: #endif

187: /* SF only supports CUDA, HIP and Kokkos devices. Even though VIENNACL is a device type, its device pointers are invisible to SF:
188:    through VecGetArray(), data of a VECVIENNACL is copied from device to host and host pointers are passed to SF.
189:  */
190: #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_KOKKOS) || defined(PETSC_HAVE_HIP)
191:   #define PetscSFMalloc(sf,mtype,sz,ptr)  ((*(sf)->ops->Malloc)(mtype,sz,ptr))
192:   /* Free memory and set ptr to NULL on success */
193:   #define PetscSFFree(sf,mtype,ptr)       ((ptr) && ((*(sf)->ops->Free)(mtype,ptr) || ((ptr)=NULL,0)))
194: #else
195:   /* In pure host code, use less indirection */
196:   #define PetscSFMalloc(sf,mtype,sz,ptr)  PetscMalloc(sz,ptr)
197:   #define PetscSFFree(sf,mtype,ptr)       PetscFree(ptr)
198: #endif
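
/* Illustrative sketch (not part of this header, hence guarded out): buffers used for
   packing are allocated and freed through these macros in the memory space given by the
   mtype argument; the size below is made up for illustration and sf is assumed to exist. */
#if 0
PetscScalar    *buf = NULL;
PetscErrorCode ierr;

ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,100*sizeof(PetscScalar),(void**)&buf);CHKERRQ(ierr);
/* ... pack leaf/root entries into buf ... */
ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,buf);CHKERRQ(ierr); /* buf is reset to NULL on success */
#endif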

200: #endif