Actual source code: bvcuda.cu

slepc-3.20.2 2024-03-15
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   CUDA-related code common to several BV impls
*/

#include <slepc/private/bvimpl.h>
#include <slepccublas.h>

#define BLOCKSIZE 64

/*
    C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar beta,PetscScalar *d_C,PetscInt ldc_)
{
  PetscCuBLASInt    m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(m_,&m));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscCuBLASIntCast(ldc_,&ldc));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
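/*
   Illustration only (an editorial sketch, not part of the SLEPc sources): a
   minimal way to exercise BVMult_BLAS_CUDA in isolation. The sizes, the NULL
   first argument (the BV parameter is unused above), and the cudaMalloc
   scaffolding are assumptions for the example; it is kept disabled so it does
   not affect the build.
*/
#if 0
static PetscErrorCode ExampleBVMult(void)
{
  PetscScalar *d_A,*d_B,*d_C;
  PetscInt    m=100,n=8,k=4;   /* illustrative sizes */

  PetscFunctionBegin;
  PetscCallCUDA(cudaMalloc((void**)&d_A,m*k*sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void**)&d_B,k*n*sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void**)&d_C,m*n*sizeof(PetscScalar)));
  /* ... fill d_A (m x k), d_B (k x n), d_C (m x n), column-major ... */
  PetscCall(BVMult_BLAS_CUDA(NULL,m,n,k,1.0,d_A,m,d_B,k,0.0,d_C,m));  /* C := A*B */
  PetscCallCUDA(cudaFree(d_A));
  PetscCallCUDA(cudaFree(d_B));
  PetscCallCUDA(cudaFree(d_C));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif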

/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar beta,PetscScalar *d_y)
{
  PetscCuBLASInt    n=0,k=0,lda=0,one=1;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,one,&beta,d_y,one));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_BLAS_CUDA(BV,PetscInt m_,PetscInt k_,PetscInt s,PetscInt e,PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscBool btrans)
{
  const PetscScalar *d_B1;
  PetscScalar       *d_work,sone=1.0,szero=0.0;
  PetscCuBLASInt    m=0,n=0,k=0,l=0,lda=0,ldb=0,bs=BLOCKSIZE;
  size_t            freemem,totmem;
  cublasHandle_t    cublasv2handle;
  cublasOperation_t bt;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(m_,&m));
  PetscCall(PetscCuBLASIntCast(e-s,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscLogGpuTimeBegin());
  if (PetscUnlikely(btrans)) {
    d_B1 = d_B+s;
    bt   = CUBLAS_OP_C;
  } else {
    d_B1 = d_B+s*ldb;
    bt   = CUBLAS_OP_N;
  }
  /* try to allocate the whole matrix */
  PetscCallCUDA(cudaMemGetInfo(&freemem,&totmem));
  if (freemem>=lda*n*sizeof(PetscScalar)) {
    PetscCallCUDA(cudaMalloc((void**)&d_work,lda*n*sizeof(PetscScalar)));
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,m,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,lda));
    PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
  } else {
    PetscCall(PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs));
    PetscCallCUDA(cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar)));
    PetscCall(PetscCuBLASIntCast(m % bs,&l));
    if (l) {
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,l,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,l));
      PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
    }
    for (;l<m;l+=bs) {
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,bs,n,k,&sone,d_A+l,lda,d_B1,ldb,&szero,d_work,bs));
      PetscCallCUDA(cudaMemcpy2D(d_A+l+s*lda,lda*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
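/*
   Editorial note: cuBLAS gemm does not allow the output matrix to alias an
   input, so the product above is formed in the workspace d_work and then
   copied back into A(:,s:e-1) with cudaMemcpy2D. When free device memory
   cannot hold the full result, the rows are processed in chunks of bs rows:
   the remainder l = m % bs first, then (m-l)/bs full blocks. For instance,
   with m=1000 and bs=64 the remainder gemm covers 40 rows and the loop then
   runs 15 times over 64 rows each.
*/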

/*
    B := alpha*A + beta*B
*/
PetscErrorCode BVAXPY_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,PetscScalar beta,PetscScalar *d_B,PetscInt ldb_)
{
  PetscCuBLASInt n=0,k=0,lda=0,ldb=0;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgeam(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops((beta==(PetscScalar)1.0)?2.0*n*k:3.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    C := A'*B

    C is a CPU array
*/
PetscErrorCode BVDot_BLAS_CUDA(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar *C,PetscInt ldc_,PetscBool mpi)
{
  PetscScalar       *d_work,sone=1.0,szero=0.0,*CC;
  PetscInt          j;
  PetscCuBLASInt    m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
  PetscMPIInt       len;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(m_,&m));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscCuBLASIntCast(ldc_,&ldc));
  PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
  if (mpi) {
    if (ldc==m) {
      PetscCall(BVAllocateWork_Private(bv,m*n));
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,ldc));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(bv->work,m*n));
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCall(MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
    } else {
      PetscCall(BVAllocateWork_Private(bv,2*m*n));
      CC = bv->work+m*n;
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(bv->work,m*n));
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCall(MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,CC+j*m,m));
    }
  } else {
    if (k) {
      PetscCall(BVAllocateWork_Private(bv,m*n));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,bv->work+j*m,m));
    }
  }
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
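/*
   Editorial note: in the parallel case the local Gram block is copied to the
   host and summed across processes with MPI_Allreduce, which needs a
   contiguous buffer. When ldc==m the output C is already contiguous and can
   receive the reduction directly; otherwise the reduction goes into the
   contiguous scratch CC and the m-long columns are copied out one by one with
   the caller's leading dimension ldc.
*/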

/*
    y := A'*x computed as y' := x'*A

    y is a CPU array, if NULL bv->buffer is used as a workspace
*/
PetscErrorCode BVDotVec_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar *y,PetscBool mpi)
{
  PetscScalar       *d_work,szero=0.0,sone=1.0,*yy=y;
  PetscCuBLASInt    n=0,k=0,lda=0,one=1;
  PetscMPIInt       len;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  if (!y) PetscCall(VecCUDAGetArrayWrite(bv->buffer,&d_work));
  else PetscCallCUDA(cudaMalloc((void**)&d_work,k*sizeof(PetscScalar)));
  if (mpi) {
    PetscCall(BVAllocateWork_Private(bv,k));
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
#if defined(PETSC_USE_COMPLEX)
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,lda,&szero,d_work,one));
      PetscCall(BV_ConjugateCUDAArray(d_work,k));
#else
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,lda,&szero,d_work,one));
#endif
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
    } else PetscCall(PetscArrayzero(bv->work,k));
    if (!y) {
      PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work));
      PetscCall(VecGetArray(bv->buffer,&yy));
    } else PetscCallCUDA(cudaFree(d_work));
    PetscCall(PetscMPIIntCast(k,&len));
    PetscCall(MPIU_Allreduce(bv->work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
  } else {
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
#if defined(PETSC_USE_COMPLEX)
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,lda,&szero,d_work,one));
      PetscCall(BV_ConjugateCUDAArray(d_work,k));
#else
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,lda,&szero,d_work,one));
#endif
      PetscCall(PetscLogGpuTimeEnd());
    }
    if (!y) PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work));
    else {
      PetscCallCUDA(cudaMemcpy(y,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
      PetscCallCUDA(cudaFree(d_work));
    }
  }
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
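/*
   Editorial note on the complex branch above: with CUBLAS_OP_C applied to x,
   the gemm computes the 1 x k row w = x^H*A, i.e.
   w_i = sum_j conj(x_j)*A(j,i) = conj((A^H*x)_i),
   so conjugating the k entries with BV_ConjugateCUDAArray yields y = A'*x.
   In the real branch w = x^T*A equals (A^T*x)^T directly and no fixup is
   needed.
*/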

/*
    Scale n scalars
*/
PetscErrorCode BVScale_BLAS_CUDA(BV,PetscInt n_,PetscScalar *d_A,PetscScalar alpha)
{
  PetscCuBLASInt n=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(n_,&n));
  if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscCallCUDA(cudaMemset(d_A,0,n*sizeof(PetscScalar)));
  else if (alpha != (PetscScalar)1.0) {
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXscal(cublasv2handle,n,&alpha,d_A,one));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*n));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>

struct conjugate
{
  __host__ __device__
    PetscScalar operator()(PetscScalar x)
    {
      return PetscConj(x);
    }
};

PetscErrorCode BV_ConjugateCUDAArray(PetscScalar *a,PetscInt n)
{
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
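/*
   Illustration only (an editorial sketch, not part of the SLEPc sources): the
   Thrust transform above is equivalent to a one-line CUDA kernel such as the
   following, launched with any 1-D grid covering n entries. Kept disabled.
*/
#if 0
static __global__ void Conjugate_kernel(PetscScalar *a,PetscInt n)
{
  PetscInt x = blockIdx.x*blockDim.x + threadIdx.x;

  if (x<n) a[x] = PetscConj(a[x]);   /* conjugate one entry per thread */
}
#endif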

/*
   BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
*/
PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
{
  PetscScalar    *d_hh,*d_a;
  PetscInt       i;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArray(bv->buffer,&d_a));
    PetscCall(PetscLogGpuTimeBegin());
    d_hh = d_a + j*(bv->nc+bv->m);
    PetscCallCUDA(cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArray(bv->buffer,&d_a));
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
   into column j of the bv buffer
 */
PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
{
  PetscScalar    *d_h,*d_c,sone=1.0;
  PetscInt       i;
  PetscCuBLASInt idx=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(VecCUDAGetArray(bv->buffer,&d_c));
    d_h = d_c + j*(bv->nc+bv->m);
    PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
    PetscCall(VecCUDARestoreArray(bv->buffer,&d_c));
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] += c[i];
    PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
   of the coefficients array
*/
PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
{
  PetscScalar    *d_h,*a;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArray(bv->buffer,&a));
    PetscCall(PetscLogGpuTimeBegin());
    d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
    PetscCallCUDA(cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArray(bv->buffer,&a));
  } else { /* cpu memory */
    h[bv->nc+j] = value;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
   coefficients array (up to position j)
*/
PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
{
  const PetscScalar *d_h;
  PetscScalar       dot;
  PetscInt          i;
  PetscCuBLASInt    idx=0,one=1;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
    PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXdotc(cublasv2handle,idx,d_h,one,d_h,one,&dot));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j)));
    *sum = PetscRealPart(dot);
    PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
  } else { /* cpu memory */
    *sum = 0.0;
    for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
    PetscCall(PetscLogFlops(2.0*(bv->nc+j)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* pointwise multiplication */
static __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  if (x<n) a[x] *= PetscRealPart(b[x]);
}

/* pointwise division */
static __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  if (x<n) a[x] /= PetscRealPart(b[x]);
}
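/*
   Editorial note (inferred from the usage below): the first kernel argument is
   a pass index. SlepcKernelSetGrid1D returns in xcount the number of launches
   needed when n exceeds what a single 1-D grid can cover, and pass i handles
   the indices i*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x that
   fall below n.
*/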

/*
   BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
   the contents of the coefficients array (up to position j) and omega is the signature;
   if inverse=TRUE then the operation is h/omega
*/
PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
{
  PetscScalar       *d_h;
  const PetscScalar *d_omega,*omega;
  PetscInt          i,xcount;
  dim3              blocks3d, threads3d;

  PetscFunctionBegin;
  if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS);
  if (!h) {
    PetscCall(VecCUDAGetArray(bv->buffer,&d_h));
    PetscCall(VecCUDAGetArrayRead(bv->omega,&d_omega));
    PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount));
    PetscCall(PetscLogGpuTimeBegin());
    if (inverse) {
      for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
    } else {
      for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
    }
    PetscCallCUDA(cudaGetLastError());
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
    PetscCall(VecCUDARestoreArrayRead(bv->omega,&d_omega));
    PetscCall(VecCUDARestoreArray(bv->buffer,&d_h));
  } else {
    PetscCall(VecGetArrayRead(bv->omega,&omega));
    if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
    else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
    PetscCall(VecRestoreArrayRead(bv->omega,&omega));
    PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
   of the coefficients array
*/
PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
{
  const PetscScalar *d_h;
  PetscScalar       hh;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUDA(cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(BV_SafeSqrt(bv,hh,beta));
    PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
  } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
   provided by the caller (only values from l to j are copied)
*/
PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
{
  const PetscScalar *d_h,*d_a;
  PetscInt          i;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_a));
    PetscCall(PetscLogGpuTimeBegin());
    d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
    PetscCallCUDA(cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_a));
  } else {
    for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}