Actual source code: veccuda2.cu
1: /*
2: Implements the sequential CUDA vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <petsc/private/vecimpl.h>
9: #include <../src/vec/vec/impls/dvecimpl.h>
10: #include <petsc/private/cudavecimpl.h>
12: #include <cuda_runtime.h>
13: #include <thrust/device_ptr.h>
14: #include <thrust/transform.h>
15: #include <thrust/functional.h>
16: #include <thrust/reduce.h>
18: /*
19: Allocates space for the vector array on the GPU if it does not exist.
20: Does NOT change the offload mask (v->offloadmask) of the vector
21: Does NOT zero the CUDA array
23: */
24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
25: {
27: cudaError_t err;
28: Vec_CUDA *veccuda;
29: PetscBool option_set;
32: if (!v->spptr) {
33: PetscReal pinned_memory_min;
34: PetscCalloc(sizeof(Vec_CUDA),&v->spptr);
35: veccuda = (Vec_CUDA*)v->spptr;
36: err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
37: veccuda->GPUarray = veccuda->GPUarray_allocated;
38: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
39: if (v->data && ((Vec_Seq*)v->data)->array) {
40: v->offloadmask = PETSC_OFFLOAD_CPU;
41: } else {
42: v->offloadmask = PETSC_OFFLOAD_GPU;
43: }
44: }
45: pinned_memory_min = 0;
47: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
48: Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
49: PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
50: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
51: if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
52: PetscOptionsEnd();
53: }
54: return(0);
55: }
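/*
   Example of the option registered above (illustrative; "./app" is a placeholder executable):

       ./app -vec_type seqcuda -vec_pinned_memory_min 1048576

   requests that host allocations of at least 1 MB for this vector use pinned (page-locked) memory,
   while smaller host buffers keep using regular pageable memory.
*/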
57: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
58: PetscErrorCode VecCUDACopyToGPU(Vec v)
59: {
61: cudaError_t err;
62: Vec_CUDA *veccuda;
63: PetscScalar *varray;
67: VecCUDAAllocateCheck(v);
68: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
69: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
70: veccuda = (Vec_CUDA*)v->spptr;
71: varray = veccuda->GPUarray;
72: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
73: PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
74: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
75: v->offloadmask = PETSC_OFFLOAD_BOTH;
76: }
77: return(0);
78: }
80: /*
81: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
82: */
83: PetscErrorCode VecCUDACopyFromGPU(Vec v)
84: {
86: cudaError_t err;
87: Vec_CUDA *veccuda;
88: PetscScalar *varray;
92: VecCUDAAllocateCheckHost(v);
93: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
94: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
95: veccuda = (Vec_CUDA*)v->spptr;
96: varray = veccuda->GPUarray;
97: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
98: PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
99: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
100: v->offloadmask = PETSC_OFFLOAD_BOTH;
101: }
102: return(0);
103: }
105: /*MC
106: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
108: Options Database Keys:
109: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
111: Level: beginner
113: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
114: M*/
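/*
   Minimal usage sketch for VECSEQCUDA (illustrative; error checking omitted):

     Vec v;
     VecCreate(PETSC_COMM_SELF,&v);
     VecSetSizes(v,PETSC_DECIDE,100);
     VecSetType(v,VECSEQCUDA);      // or -vec_type seqcuda combined with VecSetFromOptions()
     VecSet(v,1.0);                 // executed on the GPU by VecSet_SeqCUDA() below
     VecDestroy(&v);
*/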
116: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
117: {
118: const PetscScalar *xarray;
119: PetscScalar *yarray;
120: PetscErrorCode ierr;
121: PetscBLASInt one = 1,bn = 0;
122: PetscScalar sone = 1.0;
123: cublasHandle_t cublasv2handle;
124: cublasStatus_t cberr;
125: cudaError_t err;
128: PetscCUBLASGetHandle(&cublasv2handle);
129: PetscBLASIntCast(yin->map->n,&bn);
130: VecCUDAGetArrayRead(xin,&xarray);
131: VecCUDAGetArray(yin,&yarray);
132: PetscLogGpuTimeBegin();
133: if (alpha == (PetscScalar)0.0) {
134: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
135: } else if (alpha == (PetscScalar)1.0) {
136: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
137: PetscLogGpuFlops(1.0*yin->map->n);
138: } else {
139: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
140: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
141: PetscLogGpuFlops(2.0*yin->map->n);
142: }
143: PetscLogGpuTimeEnd();
144: VecCUDARestoreArrayRead(xin,&xarray);
145: VecCUDARestoreArray(yin,&yarray);
146: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
147: return(0);
148: }
150: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
151: {
152: const PetscScalar *xarray;
153: PetscScalar *yarray;
154: PetscErrorCode ierr;
155: PetscBLASInt one = 1,bn = 0;
156: cublasHandle_t cublasv2handle;
157: cublasStatus_t cberr;
158: PetscBool xiscuda;
161: if (alpha == (PetscScalar)0.0) return(0);
162: PetscCUBLASGetHandle(&cublasv2handle);
163: PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
164: if (xiscuda) {
165: PetscBLASIntCast(yin->map->n,&bn);
166: VecCUDAGetArrayRead(xin,&xarray);
167: VecCUDAGetArray(yin,&yarray);
168: PetscLogGpuTimeBegin();
169: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
170: PetscLogGpuTimeEnd();
171: VecCUDARestoreArrayRead(xin,&xarray);
172: VecCUDARestoreArray(yin,&yarray);
173: PetscLogGpuFlops(2.0*yin->map->n);
174: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
175: } else {
176: VecAXPY_Seq(yin,alpha,xin);
177: }
178: return(0);
179: }
181: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
182: {
183: PetscInt n = xin->map->n;
184: const PetscScalar *xarray=NULL,*yarray=NULL;
185: PetscScalar *warray=NULL;
186: thrust::device_ptr<const PetscScalar> xptr,yptr;
187: thrust::device_ptr<PetscScalar> wptr;
188: PetscErrorCode ierr;
191: if (xin->boundtocpu || yin->boundtocpu) {
192: VecPointwiseDivide_Seq(win,xin,yin);
193: return(0);
194: }
195: VecCUDAGetArrayWrite(win,&warray);
196: VecCUDAGetArrayRead(xin,&xarray);
197: VecCUDAGetArrayRead(yin,&yarray);
198: PetscLogGpuTimeBegin();
199: try {
200: wptr = thrust::device_pointer_cast(warray);
201: xptr = thrust::device_pointer_cast(xarray);
202: yptr = thrust::device_pointer_cast(yarray);
203: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
204: } catch (char *ex) {
205: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
206: }
207: PetscLogGpuTimeEnd();
208: PetscLogGpuFlops(n);
209: VecCUDARestoreArrayRead(xin,&xarray);
210: VecCUDARestoreArrayRead(yin,&yarray);
211: VecCUDARestoreArrayWrite(win,&warray);
212: return(0);
213: }
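/*
   The Thrust pattern above (also used by VecPointwiseMult_SeqCUDA(), VecReciprocal_SeqCUDA() and
   VecShift_SeqCUDA() below) wraps the raw device pointers from VecCUDAGetArray*() with
   thrust::device_pointer_cast() and applies an elementwise functor on the device; here
   thrust::divides yields w[i] = x[i]/y[i] for i = 0,...,n-1 in a single transform call.
*/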
215: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
216: {
217: const PetscScalar *xarray=NULL,*yarray=NULL;
218: PetscScalar *warray=NULL;
219: PetscErrorCode ierr;
220: PetscBLASInt one = 1,bn = 0;
221: cublasHandle_t cublasv2handle;
222: cublasStatus_t stat;
223: cudaError_t cerr;
224: cudaStream_t stream;
227: PetscCUBLASGetHandle(&cublasv2handle);
228: PetscBLASIntCast(win->map->n,&bn);
229: if (alpha == (PetscScalar)0.0) {
230: VecCopy_SeqCUDA(yin,win);
231: } else {
232: VecCUDAGetArrayRead(xin,&xarray);
233: VecCUDAGetArrayRead(yin,&yarray);
234: VecCUDAGetArrayWrite(win,&warray);
235: PetscLogGpuTimeBegin();
236: stat = cublasGetStream(cublasv2handle,&stream);CHKERRCUBLAS(stat);
237: cerr = cudaMemcpyAsync(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,stream);CHKERRCUDA(cerr);
238: stat = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(stat);
239: PetscLogGpuTimeEnd();
240: PetscLogGpuFlops(2*win->map->n);
241: VecCUDARestoreArrayRead(xin,&xarray);
242: VecCUDARestoreArrayRead(yin,&yarray);
243: VecCUDARestoreArrayWrite(win,&warray);
244: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
245: }
246: return(0);
247: }
249: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
250: {
251: PetscErrorCode ierr;
252: PetscInt n = xin->map->n,j;
253: PetscScalar *xarray;
254: const PetscScalar *yarray;
255: PetscBLASInt one = 1,bn = 0;
256: cublasHandle_t cublasv2handle;
257: cublasStatus_t cberr;
260: PetscLogGpuFlops(nv*2.0*n);
261: PetscLogCpuToGpuScalar(nv*sizeof(PetscScalar));
262: PetscCUBLASGetHandle(&cublasv2handle);
263: PetscBLASIntCast(n,&bn);
264: VecCUDAGetArray(xin,&xarray);
265: PetscLogGpuTimeBegin();
266: for (j=0; j<nv; j++) {
267: VecCUDAGetArrayRead(y[j],&yarray);
268: cberr = cublasXaxpy(cublasv2handle,bn,alpha+j,yarray,one,xarray,one);CHKERRCUBLAS(cberr);
269: VecCUDARestoreArrayRead(y[j],&yarray);
270: }
271: PetscLogGpuTimeEnd();
272: VecCUDARestoreArray(xin,&xarray);
273: return(0);
274: }
276: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
277: {
278: const PetscScalar *xarray,*yarray;
279: PetscErrorCode ierr;
280: PetscBLASInt one = 1,bn = 0;
281: cublasHandle_t cublasv2handle;
282: cublasStatus_t cerr;
285: PetscCUBLASGetHandle(&cublasv2handle);
286: PetscBLASIntCast(yin->map->n,&bn);
287: VecCUDAGetArrayRead(xin,&xarray);
288: VecCUDAGetArrayRead(yin,&yarray);
289: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
290: PetscLogGpuTimeBegin();
291: cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
292: PetscLogGpuTimeEnd();
293: if (xin->map->n > 0) {
294: PetscLogGpuFlops(2.0*xin->map->n-1);
295: }
296: PetscLogGpuToCpuScalar(sizeof(PetscScalar));
297: VecCUDARestoreArrayRead(xin,&xarray);
298: VecCUDARestoreArrayRead(yin,&yarray);
299: return(0);
300: }
302: //
303: // CUDA kernels for MDot to follow
304: //
306: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
307: #define MDOT_WORKGROUP_SIZE 128
308: #define MDOT_WORKGROUP_NUM 128
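// Layout of the scratch array group_results used by the kernels below: each of the
// MDOT_WORKGROUP_NUM blocks reduces its slice of the vectors in shared memory and writes one
// partial sum per y-vector, i.e. for a kernel handling M vectors
//     group_results[blockIdx.x + k*gridDim.x] = partial dot(x,y_k) from block blockIdx.x,  k = 0,...,M-1
// The host part of VecMDot_SeqCUDA() finishes the reduction by summing the MDOT_WORKGROUP_NUM
// partial results that belong to each vector.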
310: #if !defined(PETSC_USE_COMPLEX)
311: // M = 2:
312: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
313: PetscInt size, PetscScalar *group_results)
314: {
315: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
316: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
317: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
318: PetscInt vec_start_index = blockIdx.x * entries_per_group;
319: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
321: PetscScalar entry_x = 0;
322: PetscScalar group_sum0 = 0;
323: PetscScalar group_sum1 = 0;
324: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
325: entry_x = x[i]; // load only once from global memory!
326: group_sum0 += entry_x * y0[i];
327: group_sum1 += entry_x * y1[i];
328: }
329: tmp_buffer[threadIdx.x] = group_sum0;
330: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
332: // parallel reduction
333: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
334: __syncthreads();
335: if (threadIdx.x < stride) {
336: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
337: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
338: }
339: }
341: // write result of group to group_results
342: if (threadIdx.x == 0) {
343: group_results[blockIdx.x] = tmp_buffer[0];
344: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
345: }
346: }
348: // M = 3:
349: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
350: PetscInt size, PetscScalar *group_results)
351: {
352: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
353: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
354: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
355: PetscInt vec_start_index = blockIdx.x * entries_per_group;
356: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
358: PetscScalar entry_x = 0;
359: PetscScalar group_sum0 = 0;
360: PetscScalar group_sum1 = 0;
361: PetscScalar group_sum2 = 0;
362: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
363: entry_x = x[i]; // load only once from global memory!
364: group_sum0 += entry_x * y0[i];
365: group_sum1 += entry_x * y1[i];
366: group_sum2 += entry_x * y2[i];
367: }
368: tmp_buffer[threadIdx.x] = group_sum0;
369: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
370: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
372: // parallel reduction
373: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
374: __syncthreads();
375: if (threadIdx.x < stride) {
376: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
377: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
378: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
379: }
380: }
382: // write result of group to group_results
383: if (threadIdx.x == 0) {
384: group_results[blockIdx.x ] = tmp_buffer[0];
385: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
386: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
387: }
388: }
390: // M = 4:
391: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
392: PetscInt size, PetscScalar *group_results)
393: {
394: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
395: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
396: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
397: PetscInt vec_start_index = blockIdx.x * entries_per_group;
398: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
400: PetscScalar entry_x = 0;
401: PetscScalar group_sum0 = 0;
402: PetscScalar group_sum1 = 0;
403: PetscScalar group_sum2 = 0;
404: PetscScalar group_sum3 = 0;
405: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
406: entry_x = x[i]; // load only once from global memory!
407: group_sum0 += entry_x * y0[i];
408: group_sum1 += entry_x * y1[i];
409: group_sum2 += entry_x * y2[i];
410: group_sum3 += entry_x * y3[i];
411: }
412: tmp_buffer[threadIdx.x] = group_sum0;
413: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
414: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
415: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
417: // parallel reduction
418: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
419: __syncthreads();
420: if (threadIdx.x < stride) {
421: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
422: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
423: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
424: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
425: }
426: }
428: // write result of group to group_results
429: if (threadIdx.x == 0) {
430: group_results[blockIdx.x ] = tmp_buffer[0];
431: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
432: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
433: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
434: }
435: }
437: // M = 8:
438: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
439: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
440: PetscInt size, PetscScalar *group_results)
441: {
442: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
443: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
444: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
445: PetscInt vec_start_index = blockIdx.x * entries_per_group;
446: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
448: PetscScalar entry_x = 0;
449: PetscScalar group_sum0 = 0;
450: PetscScalar group_sum1 = 0;
451: PetscScalar group_sum2 = 0;
452: PetscScalar group_sum3 = 0;
453: PetscScalar group_sum4 = 0;
454: PetscScalar group_sum5 = 0;
455: PetscScalar group_sum6 = 0;
456: PetscScalar group_sum7 = 0;
457: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
458: entry_x = x[i]; // load only once from global memory!
459: group_sum0 += entry_x * y0[i];
460: group_sum1 += entry_x * y1[i];
461: group_sum2 += entry_x * y2[i];
462: group_sum3 += entry_x * y3[i];
463: group_sum4 += entry_x * y4[i];
464: group_sum5 += entry_x * y5[i];
465: group_sum6 += entry_x * y6[i];
466: group_sum7 += entry_x * y7[i];
467: }
468: tmp_buffer[threadIdx.x] = group_sum0;
469: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
470: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
471: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
472: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
473: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
474: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
475: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
477: // parallel reduction
478: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
479: __syncthreads();
480: if (threadIdx.x < stride) {
481: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
482: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
483: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
484: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
485: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
486: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
487: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
488: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
489: }
490: }
492: // write result of group to group_results
493: if (threadIdx.x == 0) {
494: group_results[blockIdx.x ] = tmp_buffer[0];
495: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
496: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
497: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
498: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
499: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
500: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
501: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
502: }
503: }
504: #endif /* !defined(PETSC_USE_COMPLEX) */
506: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
507: {
508: PetscErrorCode ierr;
509: PetscInt i,n = xin->map->n,current_y_index = 0;
510: const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
511: #if !defined(PETSC_USE_COMPLEX)
512: PetscInt nv1 = ((nv % 4) == 1) ? nv-1: nv,j;
513: PetscScalar *group_results_gpu,*group_results_cpu;
514: cudaError_t cuda_ierr;
515: #endif
516: PetscBLASInt one = 1,bn = 0;
517: cublasHandle_t cublasv2handle;
518: cublasStatus_t cberr;
521: PetscCUBLASGetHandle(&cublasv2handle);
522: PetscBLASIntCast(xin->map->n,&bn);
523: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
524: /* Handle the case of local size zero first */
525: if (!xin->map->n) {
526: for (i=0; i<nv; ++i) z[i] = 0;
527: return(0);
528: }
530: #if !defined(PETSC_USE_COMPLEX)
531: // allocate scratchpad memory for the results of individual work groups:
532: cuda_ierr = cudaMalloc((void**)&group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);CHKERRCUDA(cuda_ierr);
533: PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
534: #endif
535: VecCUDAGetArrayRead(xin,&xptr);
536: PetscLogGpuTimeBegin();
538: while (current_y_index < nv)
539: {
540: switch (nv - current_y_index) {
542: case 7:
543: case 6:
544: case 5:
545: case 4:
546: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
547: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
548: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
549: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
550: #if defined(PETSC_USE_COMPLEX)
551: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
552: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
553: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
554: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
555: #else
556: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
557: #endif
558: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
559: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
560: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
561: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
562: current_y_index += 4;
563: break;
565: case 3:
566: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
567: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
568: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
570: #if defined(PETSC_USE_COMPLEX)
571: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
572: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
573: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
574: #else
575: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
576: #endif
577: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
578: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
579: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
580: current_y_index += 3;
581: break;
583: case 2:
584: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
585: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
586: #if defined(PETSC_USE_COMPLEX)
587: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
588: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
589: #else
590: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
591: #endif
592: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
593: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
594: current_y_index += 2;
595: break;
597: case 1:
598: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
599: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
600: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
601: current_y_index += 1;
602: break;
604: default: // 8 or more vectors left
605: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
606: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
607: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
608: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
609: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
610: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
611: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
612: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
613: #if defined(PETSC_USE_COMPLEX)
614: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
615: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
616: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
617: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
618: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
619: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
620: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
621: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
622: #else
623: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
624: #endif
625: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
626: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
627: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
628: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
629: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
630: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
631: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
632: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
633: current_y_index += 8;
634: break;
635: }
636: }
637: PetscLogGpuTimeEnd();
638: VecCUDARestoreArrayRead(xin,&xptr);
640: #if defined(PETSC_USE_COMPLEX)
641: PetscLogGpuToCpuScalar(nv*sizeof(PetscScalar));
642: #else
643: // copy results to CPU
644: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
646: // sum group results into z
647: for (j=0; j<nv1; ++j) {
648: z[j] = 0;
649: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
650: }
651: PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
652: cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
653: PetscFree(group_results_cpu);
654: PetscLogGpuToCpuScalar(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
655: #endif
656: PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
657: return(0);
658: }
660: #undef MDOT_WORKGROUP_SIZE
661: #undef MDOT_WORKGROUP_NUM
663: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
664: {
665: PetscInt n = xin->map->n;
666: PetscScalar *xarray = NULL;
667: thrust::device_ptr<PetscScalar> xptr;
668: PetscErrorCode ierr;
669: cudaError_t err;
672: VecCUDAGetArrayWrite(xin,&xarray);
673: PetscLogGpuTimeBegin();
674: if (alpha == (PetscScalar)0.0) {
675: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
676: } else {
677: try {
678: xptr = thrust::device_pointer_cast(xarray);
679: thrust::fill(xptr,xptr+n,alpha);
680: } catch (char *ex) {
681: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
682: }
683: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
684: }
685: PetscLogGpuTimeEnd();
686: VecCUDARestoreArrayWrite(xin,&xarray);
687: return(0);
688: }
690: struct PetscScalarReciprocal
691: {
692: __host__ __device__
693: PetscScalar operator()(const PetscScalar& s)
694: {
695: return (s != (PetscScalar)0.0) ? (PetscScalar)1.0/s : 0.0;
696: }
697: };
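/* Note: the functor above maps zero entries to zero instead of producing Inf, so
   VecReciprocal_SeqCUDA() leaves zero components of the vector unchanged. */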
699: PetscErrorCode VecReciprocal_SeqCUDA(Vec v)
700: {
702: PetscInt n;
703: PetscScalar *x;
706: VecGetLocalSize(v,&n);
707: VecCUDAGetArray(v,&x);
708: PetscLogGpuTimeBegin();
709: try {
710: auto xptr = thrust::device_pointer_cast(x);
711: thrust::transform(xptr,xptr+n,xptr,PetscScalarReciprocal());
712: } catch (char *ex) {
713: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
714: }
715: PetscLogGpuTimeEnd();
716: VecCUDARestoreArray(v,&x);
717: return(0);
718: }
720: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
721: {
722: PetscScalar *xarray;
724: PetscBLASInt one = 1,bn = 0;
725: cublasHandle_t cublasv2handle;
726: cublasStatus_t cberr;
729: if (alpha == (PetscScalar)0.0) {
730: VecSet_SeqCUDA(xin,alpha);
731: } else if (alpha != (PetscScalar)1.0) {
732: PetscCUBLASGetHandle(&cublasv2handle);
733: PetscBLASIntCast(xin->map->n,&bn);
734: VecCUDAGetArray(xin,&xarray);
735: PetscLogGpuTimeBegin();
736: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
737: VecCUDARestoreArray(xin,&xarray);
738: PetscLogGpuTimeEnd();
739: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
740: PetscLogGpuFlops(xin->map->n);
741: }
742: return(0);
743: }
745: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
746: {
747: const PetscScalar *xarray,*yarray;
748: PetscErrorCode ierr;
749: PetscBLASInt one = 1,bn = 0;
750: cublasHandle_t cublasv2handle;
751: cublasStatus_t cerr;
754: PetscCUBLASGetHandle(&cublasv2handle);
755: PetscBLASIntCast(xin->map->n,&bn);
756: VecCUDAGetArrayRead(xin,&xarray);
757: VecCUDAGetArrayRead(yin,&yarray);
758: PetscLogGpuTimeBegin();
759: cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
760: PetscLogGpuTimeEnd();
761: if (xin->map->n > 0) {
762: PetscLogGpuFlops(2.0*xin->map->n-1);
763: }
764: PetscLogGpuToCpuScalar(sizeof(PetscScalar));
765: VecCUDARestoreArrayRead(yin,&yarray);
766: VecCUDARestoreArrayRead(xin,&xarray);
767: return(0);
768: }
770: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
771: {
772: const PetscScalar *xarray;
773: PetscScalar *yarray;
774: PetscErrorCode ierr;
775: cudaError_t err;
778: if (xin != yin) {
779: if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
780: PetscBool yiscuda;
782: PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
783: VecCUDAGetArrayRead(xin,&xarray);
784: if (yiscuda) {
785: VecCUDAGetArrayWrite(yin,&yarray);
786: } else {
787: VecGetArrayWrite(yin,&yarray);
788: }
789: PetscLogGpuTimeBegin();
790: if (yiscuda) {
791: err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
792: } else {
793: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
794: }
795: PetscLogGpuTimeEnd();
796: VecCUDARestoreArrayRead(xin,&xarray);
797: if (yiscuda) {
798: VecCUDARestoreArrayWrite(yin,&yarray);
799: } else {
800: VecRestoreArrayWrite(yin,&yarray);
801: }
802: } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
803: /* copy in CPU if we are on the CPU */
804: VecCopy_SeqCUDA_Private(xin,yin);
805: } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
806: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
807: if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
808: /* copy in CPU */
809: VecCopy_SeqCUDA_Private(xin,yin);
810: } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
811: /* copy in GPU */
812: VecCUDAGetArrayRead(xin,&xarray);
813: VecCUDAGetArrayWrite(yin,&yarray);
814: PetscLogGpuTimeBegin();
815: err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
816: PetscLogGpuTimeEnd();
817: VecCUDARestoreArrayRead(xin,&xarray);
818: VecCUDARestoreArrayWrite(yin,&yarray);
819: } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
820: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
821: default to copy in GPU (this is an arbitrary choice) */
822: VecCUDAGetArrayRead(xin,&xarray);
823: VecCUDAGetArrayWrite(yin,&yarray);
824: PetscLogGpuTimeBegin();
825: err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
826: PetscLogGpuTimeEnd();
827: VecCUDARestoreArrayRead(xin,&xarray);
828: VecCUDARestoreArrayWrite(yin,&yarray);
829: } else {
830: VecCopy_SeqCUDA_Private(xin,yin);
831: }
832: }
833: }
834: return(0);
835: }
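/*
   Copy-location summary for VecCopy_SeqCUDA() above: when xin is valid only on the GPU the copy is
   done on the device (or device->host if yin is not a CUDA vector); when xin is valid only on the
   CPU the copy is done on the host by VecCopy_SeqCUDA_Private(); when xin is valid in both places
   the copy follows wherever yin currently lives, defaulting to the GPU.
*/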
837: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
838: {
840: PetscBLASInt one = 1,bn = 0;
841: PetscScalar *xarray,*yarray;
842: cublasHandle_t cublasv2handle;
843: cublasStatus_t cberr;
846: PetscCUBLASGetHandle(&cublasv2handle);
847: PetscBLASIntCast(xin->map->n,&bn);
848: if (xin != yin) {
849: VecCUDAGetArray(xin,&xarray);
850: VecCUDAGetArray(yin,&yarray);
851: PetscLogGpuTimeBegin();
852: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
853: PetscLogGpuTimeEnd();
854: VecCUDARestoreArray(xin,&xarray);
855: VecCUDARestoreArray(yin,&yarray);
856: }
857: return(0);
858: }
860: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
861: {
862: PetscErrorCode ierr;
863: PetscScalar a = alpha,b = beta;
864: const PetscScalar *xarray;
865: PetscScalar *yarray;
866: PetscBLASInt one = 1, bn = 0;
867: cublasHandle_t cublasv2handle;
868: cublasStatus_t cberr;
869: cudaError_t err;
872: PetscCUBLASGetHandle(&cublasv2handle);
873: PetscBLASIntCast(yin->map->n,&bn);
874: if (a == (PetscScalar)0.0) {
875: VecScale_SeqCUDA(yin,beta);
876: } else if (b == (PetscScalar)1.0) {
877: VecAXPY_SeqCUDA(yin,alpha,xin);
878: } else if (a == (PetscScalar)1.0) {
879: VecAYPX_SeqCUDA(yin,beta,xin);
880: } else if (b == (PetscScalar)0.0) {
881: VecCUDAGetArrayRead(xin,&xarray);
882: VecCUDAGetArray(yin,&yarray);
883: PetscLogGpuTimeBegin();
884: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
885: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
886: PetscLogGpuTimeEnd();
887: PetscLogGpuFlops(xin->map->n);
888: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
889: VecCUDARestoreArrayRead(xin,&xarray);
890: VecCUDARestoreArray(yin,&yarray);
891: } else {
892: VecCUDAGetArrayRead(xin,&xarray);
893: VecCUDAGetArray(yin,&yarray);
894: PetscLogGpuTimeBegin();
895: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
896: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
897: VecCUDARestoreArrayRead(xin,&xarray);
898: VecCUDARestoreArray(yin,&yarray);
899: PetscLogGpuTimeEnd();
900: PetscLogGpuFlops(3.0*xin->map->n);
901: PetscLogCpuToGpuScalar(2*sizeof(PetscScalar));
902: }
903: return(0);
904: }
906: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
907: {
909: PetscInt n = zin->map->n;
912: if (gamma == (PetscScalar)1.0) {
913: /* z = ax + b*y + z */
914: VecAXPY_SeqCUDA(zin,alpha,xin);
915: VecAXPY_SeqCUDA(zin,beta,yin);
916: PetscLogGpuFlops(4.0*n);
917: } else {
918: /* z = a*x + b*y + c*z */
919: VecScale_SeqCUDA(zin,gamma);
920: VecAXPY_SeqCUDA(zin,alpha,xin);
921: VecAXPY_SeqCUDA(zin,beta,yin);
922: PetscLogGpuFlops(5.0*n);
923: }
924: return(0);
925: }
927: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
928: {
929: PetscInt n = win->map->n;
930: const PetscScalar *xarray,*yarray;
931: PetscScalar *warray;
932: thrust::device_ptr<const PetscScalar> xptr,yptr;
933: thrust::device_ptr<PetscScalar> wptr;
934: PetscErrorCode ierr;
937: if (xin->boundtocpu || yin->boundtocpu) {
938: VecPointwiseMult_Seq(win,xin,yin);
939: return(0);
940: }
941: VecCUDAGetArrayRead(xin,&xarray);
942: VecCUDAGetArrayRead(yin,&yarray);
943: VecCUDAGetArrayWrite(win,&warray);
944: PetscLogGpuTimeBegin();
945: try {
946: wptr = thrust::device_pointer_cast(warray);
947: xptr = thrust::device_pointer_cast(xarray);
948: yptr = thrust::device_pointer_cast(yarray);
949: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
950: } catch (char *ex) {
951: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
952: }
953: PetscLogGpuTimeEnd();
954: VecCUDARestoreArrayRead(xin,&xarray);
955: VecCUDARestoreArrayRead(yin,&yarray);
956: VecCUDARestoreArrayWrite(win,&warray);
957: PetscLogGpuFlops(n);
958: return(0);
959: }
961: /* should do infinity norm in cuda */
963: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
964: {
965: PetscErrorCode ierr;
966: PetscInt n = xin->map->n;
967: PetscBLASInt one = 1, bn = 0;
968: const PetscScalar *xarray;
969: cublasHandle_t cublasv2handle;
970: cublasStatus_t cberr;
971: cudaError_t err;
974: PetscCUBLASGetHandle(&cublasv2handle);
975: PetscBLASIntCast(n,&bn);
976: if (type == NORM_2 || type == NORM_FROBENIUS) {
977: VecCUDAGetArrayRead(xin,&xarray);
978: PetscLogGpuTimeBegin();
979: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
980: PetscLogGpuTimeEnd();
981: VecCUDARestoreArrayRead(xin,&xarray);
982: PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
983: } else if (type == NORM_INFINITY) {
984: int i;
985: VecCUDAGetArrayRead(xin,&xarray);
986: PetscLogGpuTimeBegin();
987: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
988: PetscLogGpuTimeEnd();
989: if (bn) {
990: PetscScalar zs;
991: err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
992: *z = PetscAbsScalar(zs);
993: } else *z = 0.0;
994: VecCUDARestoreArrayRead(xin,&xarray);
995: } else if (type == NORM_1) {
996: VecCUDAGetArrayRead(xin,&xarray);
997: PetscLogGpuTimeBegin();
998: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
999: VecCUDARestoreArrayRead(xin,&xarray);
1000: PetscLogGpuTimeEnd();
1001: PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1002: } else if (type == NORM_1_AND_2) {
1003: VecNorm_SeqCUDA(xin,NORM_1,z);
1004: VecNorm_SeqCUDA(xin,NORM_2,z+1);
1005: }
1006: PetscLogGpuToCpuScalar(sizeof(PetscReal));
1007: return(0);
1008: }
1010: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1011: {
1015: VecDot_SeqCUDA(s,t,dp);
1016: VecDot_SeqCUDA(t,t,nm);
1017: return(0);
1018: }
1020: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1021: {
1023: cudaError_t cerr;
1024: Vec_CUDA *veccuda = (Vec_CUDA*)v->spptr;
1027: if (v->spptr) {
1028: if (veccuda->GPUarray_allocated) {
1029: #if defined(PETSC_HAVE_NVSHMEM)
1030: if (veccuda->nvshmem) {
1031: PetscNvshmemFree(veccuda->GPUarray_allocated);
1032: veccuda->nvshmem = PETSC_FALSE;
1033: }
1034: else
1035: #endif
1036: {cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);}
1037: veccuda->GPUarray_allocated = NULL;
1038: }
1039: if (veccuda->stream) {
1040: cerr = cudaStreamDestroy(veccuda->stream);CHKERRCUDA(cerr);
1041: }
1042: }
1043: VecDestroy_SeqCUDA_Private(v);
1044: PetscFree(v->spptr);
1045: return(0);
1046: }
1048: #if defined(PETSC_USE_COMPLEX)
1049: struct conjugate
1050: {
1051: __host__ __device__
1052: PetscScalar operator()(const PetscScalar& x)
1053: {
1054: return PetscConj(x);
1055: }
1056: };
1057: #endif
1059: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1060: {
1061: #if defined(PETSC_USE_COMPLEX)
1062: PetscScalar *xarray;
1063: PetscErrorCode ierr;
1064: PetscInt n = xin->map->n;
1065: thrust::device_ptr<PetscScalar> xptr;
1068: VecCUDAGetArray(xin,&xarray);
1069: PetscLogGpuTimeBegin();
1070: try {
1071: xptr = thrust::device_pointer_cast(xarray);
1072: thrust::transform(xptr,xptr+n,xptr,conjugate());
1073: } catch (char *ex) {
1074: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1075: }
1076: PetscLogGpuTimeEnd();
1077: VecCUDARestoreArray(xin,&xarray);
1078: #else
1080: #endif
1081: return(0);
1082: }
1084: PETSC_STATIC_INLINE PetscErrorCode VecGetLocalVectorK_SeqCUDA(Vec v,Vec w,PetscBool read)
1085: {
1087: cudaError_t err;
1088: PetscBool wisseqcuda;
1094: PetscObjectTypeCompare((PetscObject)w,VECSEQCUDA,&wisseqcuda);
1095: if (w->data && wisseqcuda) {
1096: if (((Vec_Seq*)w->data)->array_allocated) {
1097: if (w->pinned_memory) {
1098: PetscMallocSetCUDAHost();
1099: }
1100: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1101: if (w->pinned_memory) {
1102: PetscMallocResetCUDAHost();
1103: w->pinned_memory = PETSC_FALSE;
1104: }
1105: }
1106: ((Vec_Seq*)w->data)->array = NULL;
1107: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1108: }
1109: if (w->spptr && wisseqcuda) {
1110: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1111: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1112: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1113: }
1114: if (((Vec_CUDA*)w->spptr)->stream) {
1115: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1116: }
1117: PetscFree(w->spptr);
1118: }
1120: if (v->petscnative && wisseqcuda) {
1121: PetscFree(w->data);
1122: w->data = v->data;
1123: w->offloadmask = v->offloadmask;
1124: w->pinned_memory = v->pinned_memory;
1125: w->spptr = v->spptr;
1126: PetscObjectStateIncrease((PetscObject)w);
1127: } else {
1128: if (read) {
1129: VecGetArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
1130: } else {
1131: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1132: }
1133: w->offloadmask = PETSC_OFFLOAD_CPU;
1134: if (wisseqcuda) {
1135: VecCUDAAllocateCheck(w);
1136: }
1137: }
1138: return(0);
1139: }
1141: PETSC_STATIC_INLINE PetscErrorCode VecRestoreLocalVectorK_SeqCUDA(Vec v,Vec w,PetscBool read)
1142: {
1144: cudaError_t err;
1145: PetscBool wisseqcuda;
1151: PetscObjectTypeCompare((PetscObject)w,VECSEQCUDA,&wisseqcuda);
1152: if (v->petscnative && wisseqcuda) {
1153: v->data = w->data;
1154: v->offloadmask = w->offloadmask;
1155: v->pinned_memory = w->pinned_memory;
1156: v->spptr = w->spptr;
1157: w->data = 0;
1158: w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1159: w->spptr = 0;
1160: } else {
1161: if (read) {
1162: VecRestoreArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
1163: } else {
1164: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1165: }
1166: if ((Vec_CUDA*)w->spptr && wisseqcuda) {
1167: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1168: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1169: if (((Vec_CUDA*)v->spptr)->stream) {
1170: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1171: }
1172: PetscFree(w->spptr);
1173: }
1174: }
1175: return(0);
1176: }
1178: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1179: {
1183: VecGetLocalVectorK_SeqCUDA(v,w,PETSC_FALSE);
1184: return(0);
1185: }
1187: PetscErrorCode VecGetLocalVectorRead_SeqCUDA(Vec v,Vec w)
1188: {
1192: VecGetLocalVectorK_SeqCUDA(v,w,PETSC_TRUE);
1193: return(0);
1194: }
1196: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1197: {
1201: VecRestoreLocalVectorK_SeqCUDA(v,w,PETSC_FALSE);
1202: return(0);
1203: }
1205: PetscErrorCode VecRestoreLocalVectorRead_SeqCUDA(Vec v,Vec w)
1206: {
1210: VecRestoreLocalVectorK_SeqCUDA(v,w,PETSC_TRUE);
1211: return(0);
1212: }
1214: struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
1215: {
1216: __host__ __device__
1217: PetscReal operator()(const PetscScalar& x) {
1218: return PetscRealPart(x);
1219: }
1220: };
1222: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1223: {
1224: __host__ __device__
1225: thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt>& x) {
1226: return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
1227: }
1228: };
1230: struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1231: {
1232: __host__ __device__
1233: PetscReal operator()(const PetscReal& x, const PetscReal& y) {
1234: return x < y ? y : x;
1235: }
1236: };
1238: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1239: {
1240: __host__ __device__
1241: thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
1242: return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1243: (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1244: (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1245: }
1246: };
1248: struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1249: {
1250: __host__ __device__
1251: PetscReal operator()(const PetscReal& x, const PetscReal& y) {
1252: return x < y ? x : y;
1253: }
1254: };
1256: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1257: {
1258: __host__ __device__
1259: thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
1260: return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1261: (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1262: (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1263: }
1264: };
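/*
   The functors above implement the reductions used by VecMax_SeqCUDA() and VecMin_SeqCUDA() below.
   The *i variants operate on (value,index) tuples produced by a zip_iterator over the data and a
   counting_iterator; on ties they keep the smaller index, so the returned location is the first
   entry attaining the extremum.  In complex builds only the real part is compared, via
   petscrealpart/petscrealparti.
*/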
1266: PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1267: {
1268: PetscErrorCode ierr;
1269: PetscInt n = v->map->n;
1270: const PetscScalar *av;
1271: thrust::device_ptr<const PetscScalar> avpt;
1275: if (!n) {
1276: *m = PETSC_MIN_REAL;
1277: if (p) *p = -1;
1278: return(0);
1279: }
1280: VecCUDAGetArrayRead(v,&av);
1281: avpt = thrust::device_pointer_cast(av);
1282: PetscLogGpuTimeBegin();
1283: if (p) {
1284: thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
1285: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1286: try {
1287: #if defined(PETSC_USE_COMPLEX)
1288: res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
1289: #else
1290: res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
1291: #endif
1292: } catch (char *ex) {
1293: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1294: }
1295: *m = res.get<0>();
1296: *p = res.get<1>();
1297: } else {
1298: try {
1299: #if defined(PETSC_USE_COMPLEX)
1300: *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
1301: #else
1302: *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
1303: #endif
1304: } catch (char *ex) {
1305: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1306: }
1307: }
1308: PetscLogGpuTimeEnd();
1309: VecCUDARestoreArrayRead(v,&av);
1310: return(0);
1311: }
1313: PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1314: {
1315: PetscErrorCode ierr;
1316: PetscInt n = v->map->n;
1317: const PetscScalar *av;
1318: thrust::device_ptr<const PetscScalar> avpt;
1322: if (!n) {
1323: *m = PETSC_MAX_REAL;
1324: if (p) *p = -1;
1325: return(0);
1326: }
1327: VecCUDAGetArrayRead(v,&av);
1328: avpt = thrust::device_pointer_cast(av);
1329: PetscLogGpuTimeBegin();
1330: if (p) {
1331: thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
1332: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1333: try {
1334: #if defined(PETSC_USE_COMPLEX)
1335: res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
1336: #else
1337: res = thrust::reduce(zibit,zibit+n,res,petscmini());
1338: #endif
1339: } catch (char *ex) {
1340: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1341: }
1342: *m = res.get<0>();
1343: *p = res.get<1>();
1344: } else {
1345: try {
1346: #if defined(PETSC_USE_COMPLEX)
1347: *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
1348: #else
1349: *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
1350: #endif
1351: } catch (char *ex) {
1352: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1353: }
1354: }
1355: PetscLogGpuTimeEnd();
1356: VecCUDARestoreArrayRead(v,&av);
1357: return(0);
1358: }
1360: PetscErrorCode VecSum_SeqCUDA(Vec v,PetscScalar *sum)
1361: {
1362: PetscErrorCode ierr;
1363: PetscInt n = v->map->n;
1364: const PetscScalar *a;
1365: thrust::device_ptr<const PetscScalar> dptr;
1369: VecCUDAGetArrayRead(v,&a);
1370: dptr = thrust::device_pointer_cast(a);
1371: PetscLogGpuTimeBegin();
1372: try {
1373: *sum = thrust::reduce(dptr,dptr+n,PetscScalar(0.0));
1374: } catch (char *ex) {
1375: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1376: }
1377: PetscLogGpuTimeEnd();
1378: VecCUDARestoreArrayRead(v,&a);
1379: return(0);
1380: }
1382: struct petscshift : public thrust::unary_function<PetscScalar,PetscScalar>
1383: {
1384: const PetscScalar shift_;
1385: petscshift(PetscScalar shift) : shift_(shift){}
1386: __host__ __device__
1387: PetscScalar operator()(PetscScalar x) {return x + shift_;}
1388: };
1390: PetscErrorCode VecShift_SeqCUDA(Vec v,PetscScalar shift)
1391: {
1392: PetscErrorCode ierr;
1393: PetscInt n = v->map->n;
1394: PetscScalar *a;
1395: thrust::device_ptr<PetscScalar> dptr;
1399: VecCUDAGetArray(v,&a);
1400: dptr = thrust::device_pointer_cast(a);
1401: PetscLogGpuTimeBegin();
1402: try {
1403: thrust::transform(dptr,dptr+n,dptr,petscshift(shift)); /* in-place transform */
1404: } catch (char *ex) {
1405: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1406: }
1407: PetscLogGpuTimeEnd();
1408: VecCUDARestoreArray(v,&a);
1409: return(0);
1410: }
1412: #if defined(PETSC_HAVE_NVSHMEM)
1413: /* Free old CUDA array and re-allocate a new one from nvshmem symmetric heap.
1414: New array does not retain values in the old array. The offload mask is not changed.
1416: Note: the function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
1417: */
1418: PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec v)
1419: {
1421: cudaError_t cerr;
1422: Vec_CUDA *veccuda = (Vec_CUDA*)v->spptr;
1423: PetscInt n;
1426: cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);
1427: VecGetLocalSize(v,&n);
1428: MPIU_Allreduce(MPI_IN_PLACE,&n,1,MPIU_INT,MPI_MAX,PETSC_COMM_WORLD);
1429: PetscNvshmemMalloc(n*sizeof(PetscScalar),(void**)&veccuda->GPUarray_allocated);
1430: veccuda->GPUarray = veccuda->GPUarray_allocated;
1431: veccuda->nvshmem = PETSC_TRUE;
1432: return(0);
1433: }
1434: #endif