INLA_DIST
BTA.H
1#ifndef __BTA
2#define __BTA
3
4#include <string.h>
5#include <omp.h>
6
7#include "cublas_v2.h"
8#include "Types.H"
9#include "Utilities.H"
10#include "CWC_utility.H"
11#include "Blas.H" // to inherit #define CUDA_POTRF
12
13
14// debug ...
15#include <iostream>
16#include <iomanip>
17#include <fstream>
18#include <ctime>
19
20//#include "helper_functions.h"
21
22#include "nvToolsExt.h"
23
24//#define PRINT_MSG
25
31template <class T>
32class BTA{
33
34public:
42 BTA(size_t ns, size_t nt, size_t nd, int GPU_rank_);
46 ~BTA();
47
56 double factorize(size_t* ia, size_t* ja, T* a, double& t_firstStageFactor);
57
66 double factorize_noCopyHost(size_t* ia, size_t* ja, T* a, T &logDet);
67
81 double factorizeSolve(size_t* ia, size_t* ja, T* a, T* x, T* rhs,size_t nrhs, double& t_firstSecondStage, double& t_SecondStageBackPass);
82
94 double solve(size_t* ia, size_t* ja, T* a, T* rhs, size_t nrhs, double& t_secondStageForwardPass, double& t_secondStageBackwardPass);
95
107 double solve(size_t*, size_t*, T*, T*,T*,size_t, double& t_secondStageForwardPass, double& t_secondStageBackwardPass);
108
119 double solve_s(size_t* ia, size_t* ja, double* a, double* x, double* rhs, size_t nrhs);
120
131 double solve_d(size_t*, size_t*, float*, float*,float*,size_t);
132
141 double BTAdiag(size_t* ia, size_t* ja, T* a, T* diag);
142
143
144 double BTAinvBlks(size_t*, size_t*, T*, T*);
145
155 double BTAselInv(size_t *ia, size_t *ja, T *a, T *invQ);
156
164 T logDet(size_t* ia, size_t* ja, T* a);
165
172 double residualNorm(T* x,T* b);
173
180 double residualNormNormalized(T* x,T* b);
181 double flop_count_factorise();
182
183private:
184
185 size_t *matrix_ia;
186 size_t *matrix_ja;
187 size_t matrix_n_nonzeros;
188 size_t matrix_size;
190 size_t matrix_ns;
191 size_t matrix_nt;
192 size_t matrix_nd;
193 size_t *Bmin;
194 size_t *Bmax;
195 size_t NBlock;
196 size_t *diag_pos;
197
198 size_t max_supernode_nnz = 0;
199
200 size_t ind_invBlks_fi;
201 size_t mem_alloc_dev = 0; // keep track of allocated memory on GPU
202
203 int GPU_rank;
205 //size_t b_size;
206 bool MF_allocated = false;
207 bool invBlks_allocated = false;
208 bool MF_dev_allocated = false;
209 bool factorization_completed = false;
210 int cpy_indicator; // 0: copy back diagonal, 1: copy back nnzQ, 2: copy back invBlks
211
212 // add copy stream, add compute stream
213 magma_queue_t magma_queue_1;
214 magma_queue_t magma_queue_2;
215 cudaStream_t stream_c;
216 cudaStream_t copyStream = NULL;
217 cudaStream_t magma_cudaStream_1 = NULL;
218 cudaStream_t magma_cudaStream_2 = NULL;
219
220 cudaEvent_t initBlock_dev_ev;
221 cudaEvent_t potrf_dev_ev;
222
223 magma_event_t potrf_dev_magma_ev;
224
225 void *cublas_handle;
226
227#ifdef CUDA_POTRF
228 int* info_cuda = NULL;
229 int *cuda_buffer_flag_potrf;
230 int *cuda_buffer_flag_trtri;
231
232 cusolverDnHandle_t *handle;
233 cusolverDnParams_t *params;
234
235 size_t *dev_size;
236 size_t *host_size;
237 double *mem_cuda_dev;
238 double *mem_cuda_host;
239#endif // end CUDA_POTRF
240
241#ifdef MAGMA_EXPERT
242 int magma_potrf_init_flag;
243
244 int magma_info[1];
245 //magma_mode_t mode = MagmaHybrid;
246 magma_mode_t mode; // = MagmaNative;
247 int subN; // = 256; // nb = 64 and recnb = 32 for 4000
248 int subSubN; // = 32; // nb = 512 and recnb = 128 for 20000
249 void* host_work;
250 int lwork_host;
251 void* device_work;
252 int lwork_device;
253 magma_event_t magma_events[2];
254 magma_queue_t magma_queues[2];
255#endif // end MAGMA_EXPERT
256
257 T *MF;
258 T *invBlks;
259 T *inv_a;
260 T *blockR_dev;
261 T *blockM_dev;
262 T *blockDense_dev;
263 T *rhs;
264 T *rhs_dev;
265
266 size_t *ia_dev;
267 size_t *ja_dev;
268 T *a_dev;
269
270 size_t *inv_ia_dev;
271 size_t *inv_ja_dev;
272 T *inv_a_dev;
273
274 T *diag_dev;
275 size_t *diag_pos_dev;
276
277 inline size_t mf_block_index(size_t, size_t);
278 inline size_t mf_block_lda(size_t, size_t);
279 inline size_t mf_dense_block_index(size_t);
280 inline size_t mf_dense_block_offset(size_t);
281 inline size_t mf_dense_block_lda(size_t);
282
283 inline size_t invblks_diag_block_index(size_t);
284 inline size_t invblks_dense_block_index(size_t);
285
286 double FirstStageFactor();
287 double FirstSecondStageFactor(size_t nhrs);
288 double FirstStageFactor_noCopyHost(T &logDet);
289 double FirstStageFactor_noCopyHost_testV(double &logDet);
290 double ForwardPassSolve(size_t);
291 double BackwardPassSolve(size_t);
292 double SecondStageSolve(size_t, double& t_secondStageForwardPass, double& t_secondStageBackwardPass);
293 // single precision solve with double precision factor
294 double SecondStageSolve_s(size_t, float* rhs_s);
295 // double precision solve with single precision factor
296 double SecondStageSolve_d(size_t, double* rhs_d);
297 double ThirdStageBTA(T*,T*, int);
298 //void create_blocks();
299 void initialize_MF_host();
300 void initialize_invBlks_host();
301
302 // new: compute maximum number of nonzeros over all supernodes
303 inline void get_max_supernode_nnz();
304
305 inline void init_supernode(T *M_dev, size_t supernode, cudaStream_t stream);
306 //inline void init_supernode(T *M_dev, size_t supernode);
307
308 inline void copy_supernode_to_host(T *M_dev, size_t supernode, cudaStream_t stream);
309 //inline void copy_supernode_to_host(T *M_dev, size_t supernode);
310
311 inline void extract_nnzA(T *M_dev, size_t supernode); // new to extract nnzQ(Qinv)
312 inline void copy_supernode_to_host_write(T *M_dev, size_t supernode); // debugging ...
313
314 inline void copy_supernode_to_device(T *M_dev, size_t supernode, cudaStream_t stream);
315 //inline void copy_supernode_diag(T *src, size_t supernode, cudaStream_t stream);
316 inline void copy_supernode_diag(T *src, size_t supernode);
317 inline void swap_pointers(T **ptr1, T **ptr2);
318
319 inline T f_one();
320 inline T f_zero();
321};
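
/* Minimal usage sketch (illustrative only; the actual call sites live in the INLA_DIST
   solver layer, and the sparse index arrays ia/ja, the values a, and the buffers
   x, rhs, invQ are assumed to be allocated and filled elsewhere):

       size_t ns = 100, nt = 50, nd = 4;                  // spatial, temporal, fixed-effect sizes
       BTA<double> bta(ns, nt, nd, 0);                    // run on GPU 0
       double t_chol, t_fwd, t_bwd;
       double gflops = bta.factorize(ia, ja, a, t_chol);  // Cholesky Q = L*L^T
       double logdet = bta.logDet(ia, ja, a);             // log-determinant of Q
       bta.solve(ia, ja, a, x, rhs, 1, t_fwd, t_bwd);     // solve Q*x = rhs
       bta.BTAselInv(ia, ja, a, invQ);                    // selected entries of Q^{-1}
*/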
322
323/************************************************************************************************/
324
325template <class T>
326BTA<T>::BTA(size_t ns, size_t nt, size_t nd, int GPU_rank_)
327{
328 if(ns == 0){
329 printf("ns = %ld, nt = %ld. No spatial field, only fixed effects. Consider using a different solver!\n", ns, nt);
330 exit(1);
331 }
332 // only set up indices etc. no values yet.
333 // set device from GPU rank
334 GPU_rank = GPU_rank_;
335
336#ifdef CUDA_POTRF
337 printf("using CUDA POTRF.\n");
338#elif defined(MAGMA_EXPERT)
339 printf("using MAGMA EXPERT POTRF.\n");
340#else
341 printf("using MAGMA POTRF.\n");
342#endif
343
344 matrix_ns = ns;
345 matrix_nt = nt;
346 matrix_nd = nd;
347
348 matrix_size = ns*nt + nd;
349 matrix_n_nonzeros = 2*(nt-1)*ns*ns + ns*ns + matrix_size*nd;
350
351 if (nd > 0)
352 NBlock = nt + 1;
353 else
354 NBlock = nt;
355 Bmin = new size_t[NBlock];
356 Bmax = new size_t[NBlock];
357 for (size_t i = 0; i < nt; i++)
358 {
359 Bmin[i] = i*ns;
360 Bmax[i] = (i+1)*ns;
361 }
362 if (nd > 0)
363 {
364 Bmin[nt] = nt*ns;
365 Bmax[nt] = nt*ns + nd;
366 }
367
368 // blocks with lda = 2*ns + nd
369 diag_pos = new size_t[matrix_size];
370 size_t IB;
371 for (IB = 0; IB < nt-1; IB++)
372 {
373 for (size_t i = 0; i < ns; i++)
374 {
375 diag_pos[IB*ns+i] = IB * ns*(2*ns+nd) + i*(2*ns+nd+1);
376 }
377 }
378 // last block with lda = ns + nd
379 IB = nt-1;
380 for (size_t i = 0; i < ns; i++)
381 {
382 diag_pos[IB*ns+i] = IB * ns*(2*ns+nd) + i*(ns+nd+1);
383 }
384 // dense block with lda = ns + nd
385 if (nd > 0)
386 {
387 IB = nt;
388 for (size_t i = 0; i < nd; i++)
389 {
390 diag_pos[nt*ns+i] = (nt-1) * ns*(2*ns+nd) + ns*(ns+nd) + i*(nd+1);
391 }
392 }
393
394 /*
395 printf("diag_pos: ");
396 for(int i=0; i<matrix_size; i++){
397 printf("%ld ", diag_pos[i]);
398 }
399 printf("\n");
400 */
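    // Worked example of the layout above: for ns = 2, nt = 3, nd = 1 we get matrix_size = 7 and
    //   diag_pos = { 0, 6, 10, 16, 20, 24, 26 }
    // i.e. supernodes 0..nt-2 occupy ns*(2*ns+nd) = 10 entries each (lda = 5),
    // the last ns-supernode occupies ns*(ns+nd) = 6 entries (lda = 3), and the
    // trailing nd x nd dense block starts at offset 26 (lda = nd = 1), so the
    // supernode storage spans (nt-1)*ns*(2*ns+nd) + ns*(ns+nd) + nd*nd = 27 entries in total.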
401
402 gpuErrchk(cudaEventCreate(&initBlock_dev_ev));
403 gpuErrchk(cudaEventCreate(&potrf_dev_ev));
404
405 magma_event_create(&potrf_dev_magma_ev);
406
407
408 // initialize cuda copyStream
409 gpuErrchk(cudaStreamCreate( &copyStream ));
410
411 magma_init();
412 magma_device_t device;
413 magma_getdevice(&device);
414
415 // in order to get the magma GEMMs in a non-null stream, the second argument of magma_queue_create needs to be a CUDA stream
416 // so make a cuda stream to use for this.
417 gpuErrchk(cudaStreamCreate ( &magma_cudaStream_1 ));
418 gpuErrchk(cudaStreamCreate ( &magma_cudaStream_2 ));
419
420 //magma_queue_create_from_cuda(device, NULL, NULL, NULL, &magma_queue);
421 magma_queue_create_from_cuda(device, magma_cudaStream_1, NULL, NULL, &magma_queue_1);
422 magma_queue_create_from_cuda(device, magma_cudaStream_2, NULL, NULL, &magma_queue_2);
423
424 // magma_queue created from specific streams
425 //stream_c = magma_queue_get_cuda_stream(magma_queue_1);
426 //cublas_handle = magma_queue_get_cublas_handle(magma_queue_1);
427
428#ifdef CUDA_POTRF
429 cudaMalloc((void**)&info_cuda, sizeof(int));
430
431 cudaMallocHost((void**)&cuda_buffer_flag_potrf, sizeof(int));
432 cuda_buffer_flag_potrf[0] = 0;
433
434 cudaMallocHost((void**)&cuda_buffer_flag_trtri, sizeof(int));
435 cuda_buffer_flag_trtri[0] = 0;
436
437 cudaMallocHost((void**)&handle,sizeof(cusolverDnHandle_t));
438 cudaMallocHost((void**)&params,sizeof(cusolverDnParams_t));
439
440 cudaMallocHost((void**)&dev_size,sizeof(size_t));
441 cudaMallocHost((void**)&host_size,sizeof(size_t));
442
443 cusolverStatus_t cuSolverError = cusolverDnCreate(handle);
444 if(cuSolverError != 0){
445 printf("cuSolverError not Zero in create handle! Error: %d\n", cuSolverError);
446 exit(1);
447 }
448
449 cuSolverError = cusolverDnCreateParams(params);
450 if(cuSolverError != 0){
451 printf("cuSolverError not Zero in create params!\n");
452 exit(1);
453 }
454
455#endif
456
457#ifdef MAGMA_EXPERT
458
459 magma_potrf_init_flag = 0;
460 // mode = MagmaHybrid;
461 mode = MagmaNative;
462 subN = 256; // nb = 64 and recnb = 32 for 4000
463 subSubN = 32; // nb = 512 and recnb = 128 for 20000
464
465 lwork_host = -1;
466 lwork_device = -1;
467
468 magma_event_create(&magma_events[0]);
469 magma_event_create(&magma_events[1]);
470
471 // reuse the two magma queues created above for the expert interface
472 magma_queues[0] = magma_queue_1;
473 magma_queues[1] = magma_queue_2;
474
475 //magma_queue_create(device, &queues[0]);
476 //magma_queue_create(device, &queues[1]);
477
478#endif
479
480}
481
482/************************************************************************************************/
483
484template <class T>
485BTA<T>::~BTA()
486{
487#ifdef PRINT_MSG
488 std::cout << "In BTA destructor. MF_allocated : " << MF_allocated << std::endl;
489#endif
490
491 // if only factorize_noCopyHost was called, MF was never allocated ...
492 if (MF_allocated)
493 {
494 //delete[] MF;
495 // if memory PINNED
496#ifdef PRINT_MSG
497 std::cout << "cudaFreeHost(MF) gets called." << std::endl;
498#endif
499 cudaFreeHost(MF);
500 }
501
502 // if we require entire diagonal blocks
503 if(invBlks_allocated){
504#ifdef PRINT_MSG
505 std::cout << "cudaFreeHost(invBlks) gets called." << std::endl;
506#endif
507 cudaFreeHost(invBlks);
508 }
509
510 if (MF_dev_allocated)
511 {
512 size_t max_supernode_nnz_dense = matrix_nt > 1 ? matrix_ns*(2*matrix_ns+matrix_nd) : matrix_ns*(matrix_ns+matrix_nd);
513 size_t final_supernode_nnz_dense = matrix_nd > 0 ? matrix_nd*matrix_nd : 0;
514 deallocate_data_on_dev(blockR_dev,max_supernode_nnz_dense*sizeof(T));
515 deallocate_data_on_dev(blockM_dev,max_supernode_nnz_dense*sizeof(T));
516 deallocate_data_on_dev(blockDense_dev,final_supernode_nnz_dense*sizeof(T));
517 }
518
519 magma_queue_destroy(magma_queue_1);
520 magma_queue_destroy(magma_queue_2);
521 magma_finalize();
522
523 cudaStreamDestroy(copyStream);
524 cudaStreamDestroy(magma_cudaStream_1);
525 cudaStreamDestroy(magma_cudaStream_2);
526
527 delete[] Bmin;
528 delete[] Bmax;
529 delete[] diag_pos;
530
531
532#ifdef CUDA_POTRF
533 cudaFreeHost(info_cuda);
534 cudaFreeHost(cuda_buffer_flag_potrf);
535 cudaFreeHost(cuda_buffer_flag_trtri);
536
537 cudaFreeHost(host_size);
538 cudaFreeHost(dev_size);
539 cudaFreeHost(mem_cuda_host);
540 cudaFree(mem_cuda_dev);
541#endif
542
543#ifdef MAGMA_EXPERT
544 cudaFreeHost(host_work);
545 cudaFree(device_work);
546#endif
547
548}
549
550/************************************************************************************************/
551
552template <class T>
553inline size_t BTA<T>::mf_block_index(size_t r, size_t c)
554{
555 //printf("r = %ld, c = %ld, c*matrix_ns = %ld, (r-c)*matrix_ns = %ld, diag_pos[c*matrix_ns]+(r-c)*matrix_ns = %ld\n", r, c, c*matrix_ns, (r-c)*matrix_ns, diag_pos[c*matrix_ns]+(r-c)*matrix_ns);
556 return diag_pos[c*matrix_ns] + (r-c)*matrix_ns;
557}
558
559/************************************************************************************************/
560
561template <class T>
562inline size_t BTA<T>::mf_block_lda(size_t r, size_t c)
563{
564 //return matrix->index_i[c*b_size];
565 // two blocks
566 if (c < matrix_nt-1)
567 return 2*matrix_ns + matrix_nd;
568 // one block
569 if (c < matrix_nt)
570 return matrix_ns + matrix_nd;
571 // dense block
572 else
573 return matrix_nd;
574}
575
576/************************************************************************************************/
577
578template <class T>
579inline size_t BTA<T>::mf_dense_block_index(size_t i)
580{
581 if (i < matrix_nt-1)
582 return diag_pos[i*matrix_ns] + 2*matrix_ns;
583 else if (i == matrix_nt-1)
584 return diag_pos[i*matrix_ns] + matrix_ns;
585 else
586 return diag_pos[i*matrix_ns];
587}
588
589/************************************************************************************************/
590
591template <class T>
592inline size_t BTA<T>::mf_dense_block_offset(size_t i)
593{
594 if (i < matrix_nt-1)
595 return 2*matrix_ns;
596 else if (i == matrix_nt-1)
597 return matrix_ns;
598 else
599 return 0;
600}
601
602/************************************************************************************************/
603
604template <class T>
605inline size_t BTA<T>::mf_dense_block_lda(size_t i)
606{
607 return mf_block_lda(i, i);
608}
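// Worked example for the helpers above (ns = 2, nt = 3, nd = 1, so
// diag_pos = {0, 6, 10, 16, 20, 24, 26}):
//   mf_block_index(0,0) = 0,  mf_block_index(1,0) = 2,  mf_block_index(1,1) = 10
//   mf_block_lda(.,0) = mf_block_lda(.,1) = 5,  mf_block_lda(.,2) = 3,  arrowhead column lda = nd = 1
//   mf_dense_block_index(0) = 4, (1) = 14, (2) = 22;  mf_dense_block_offset = 4, 4, 2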
609
610/************************************************************************************************/
611// assuming that invBlks stores :
612// Sigma_{1,1}, Sigma_{nt+1,1}, Sigma_{2,2}, Sigma_{nt+1,2}, ... , Sigma_{nt,nt}, Sigma_{nt+1,nt}, Sigma_{nt+1,nt+1}
613// total size (ns+nd)*ns*nt + nd^2
614
615template <class T>
616inline size_t BTA<T>::invblks_diag_block_index(size_t i)
617{
618 // out-of-bounds check for i which represents time step
619 if(i > matrix_nt){
620 printf("in invblks_diag_block_index(). block index i = %ld out of bounds!\n", i);
621 }
622 // (ns+nd)*ns*i
623 return (matrix_ns+matrix_nd)*matrix_ns*i;
624
625}
626
627
628template <class T>
629inline size_t BTA<T>::invblks_dense_block_index(size_t i)
630{
631 // out-of-bounds check for i which represents time step
632 if(i > matrix_nt){
633 printf("in invblks_diag_block_index(). block index i = %ld out of bounds!\n", i);
634 }
635 // ns^2*(i+1) + ns*nb*i
636 return matrix_ns*matrix_ns*(i+1) + matrix_ns*matrix_nd*i;
637}
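// Worked example for the invBlks layout (ns = 2, nt = 3, nd = 1):
//   invblks_diag_block_index(i)  = 6*i          -> 0, 6, 12    (ns x ns diagonal blocks)
//   invblks_dense_block_index(i) = 4*(i+1)+2*i  -> 4, 10, 16   (nd x ns dense rows)
// and the final nd x nd block Sigma_{nt+1,nt+1} starts at (ns+nd)*ns*nt = 18,
// for a total of (ns+nd)*ns*nt + nd^2 = 19 entries.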
638
639/************************************************************************************************/
640
641
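// Sketch of the block Cholesky recursion implemented below (1-based block indices;
// A_ij are the ns x ns blocks of Q, F_i the trailing nd x ns dense rows, D the nd x nd corner):
//   L_11 = chol(A_11);  L_{2,1} = A_{2,1} L_11^{-T};  L_{nt+1,1} = F_1 L_11^{-T}
//   for i = 2..nt:
//     A_ii <- A_ii - L_{i,i-1} L_{i,i-1}^T               (tgemm)
//     F_i  <- F_i  - L_{nt+1,i-1} L_{i,i-1}^T            (tgemm, if nd > 0)
//     D    <- D    - L_{nt+1,i-1} L_{nt+1,i-1}^T         (tgemm, if nd > 0)
//     L_ii       = chol(A_ii)                            (potrf)
//     L_{i+1,i}  = A_{i+1,i} L_ii^{-T}                   (trsm, if i < nt)
//     L_{nt+1,i} = F_i L_ii^{-T}                         (trsm, if nd > 0)
//   D <- D - L_{nt+1,nt} L_{nt+1,nt}^T;  L_{nt+1,nt+1} = chol(D)   (final dense block, if nd > 0)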
642template <class T>
643double BTA<T>::FirstStageFactor()
644{
645
646#ifdef PRINT_MSG
647 std::cout << "In FirstStageFactor(), omp get thread num = " << omp_get_thread_num() << std::endl;
648#endif
649
650 double t_potrf = 0;
651 double t_dgemm = 0;
652 double t_copy_DtH = 0;
653 double t_temp;
654
655 double t_init_supernode = get_time(0.0);
656
657 int info;
658 size_t IB;
659 size_t NR,NM;
660 T ONE = f_one();
661
662 // count floating point operations
663 double flop_count = 0;
664
665//tpotrf_dev('L', NR, M1_dev ,NR ,&info)
666//rj dpotrf MF[0,0]
667//rj dtrsm RLTN MF[0,0] MF[1,0]
668
669 IB = 0;
670 NR = Bmax[0]-Bmin[0];
671
672#ifdef PRINT_MSG
673 std::cout << "IB = " << IB << std::endl;
674#endif
675
676 /*
677 int GPU_CurrRank;
678 cudaGetDevice(&GPU_CurrRank);
679 printf("in firstStageFactor. cudaGetDevice : %ld, supposed GPU rank : %ld\n", GPU_CurrRank, GPU_rank);
680 */
681
682 init_supernode(blockR_dev, IB, magma_cudaStream_1);
683 if (matrix_nd > 0)
684 {
685 init_supernode(blockDense_dev, matrix_nt, magma_cudaStream_1);
686 }
687
688 /*cudaDeviceSynchronize();
689 copy_supernode_to_host_write(blockR_dev, IB);
690 exit(1);*/
691
692 // put it in cudaEvent
693 //gpuErrchk(cudaEventRecord(copy_gpu, magma_cudaStream_1)); // want compute_stream to wait
694 //gpuErrchk(cudaStreamWaitEvent(compute_stream, copy_gpu, 0));
695
696#ifdef PRINT_MSG
697 std::cout << "calling first Cholesky factorization now" << std::endl;
698#endif
699
700 t_init_supernode = get_time(t_init_supernode);
701 //printf("time init supernode in factorize: %f\n", t_init_supernode);
702
703 // start timer for counting flops
704 double t_fact = get_time(0.0);
705
706 t_temp = get_time(0.0);
707 // CHOLESKY FACTORISATION FIRST DIAGONAL BLOCK
708#ifdef CUDA_POTRF
709 // factorization first block -> set buffer flag to zero!
710 cuda_buffer_flag_potrf[0] = 0;
711 tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
712 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
713#elif defined(MAGMA_EXPERT)
714 magma_potrf_init_flag = 0;
715 magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
716 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
717 //copy_supernode_to_host_write(blockR_dev, IB);
718 //exit(1);
719#else
720 // magma_queue currently not supported
721 cudaDeviceSynchronize();
722 tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
723 cudaDeviceSynchronize();
724
725 //copy_supernode_to_host_write(blockR_dev, IB);
726 //exit(1);
727#endif
728 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(0, 0), mf_block_lda(0, 0));
729 t_temp = get_time(t_temp);
730 t_potrf += t_temp;
731 //printf("IB = %d, t_potrf = %f\n", IB, t_temp);
732 //flop_count += 1.0/3.0 * NR * NR * NR;
733 flop_count += 1.0/3.0 * NR * NR * NR + 0.5 * NR * NR + 1.0/6.0 * NR;
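    // (1/3 n^3 + 1/2 n^2 + 1/6 n is the exact flop count of an n x n Cholesky factorization)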
734
735 // the trsm shouldn't be launched until potrf has finished
736 //cudaDeviceSynchronize();
737 gpuErrchk(cudaEventRecord(potrf_dev_ev, magma_cudaStream_1));
738 gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, potrf_dev_ev, 0));
739
740 // TODO: if nt = 0 but nd != 0 => problem !!
741 //rj dtrsm RLTN MF[IB,IB] MF[IB+1,IB]
742 if (matrix_nt > 1)
743 {
744 t_temp = get_time(0.0);
745 // UPDATE COLUMNS ACCORDING TO CHOLESKY FACTORISATION
746 // separate the two operations
747 //ttrsm_dev('R', 'L', 'T', 'N', NR+matrix_nd, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[NR], mf_block_lda(1, 0), magma_queue_1);
748 ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[NR], mf_block_lda(1, 0), magma_queue_1);
749
750 t_temp = get_time(t_temp);
751 t_dgemm += t_temp;
752 flop_count += NR * NR * NR;
753 }
754
755 if (matrix_nd > 0){
756 //printf("mf_dense_block_offset(IB) : %ld\n", mf_dense_block_offset(IB));
757 //printf("mf_dense_block_lda(IB) : %ld\n", mf_dense_block_lda(IB));
758 ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
759 flop_count += matrix_nd * NR * NR;
760 }
761
762 t_temp = get_time(0.0);
763 cudaDeviceSynchronize();
764 // if copy supernode device to host happens while new supernode is being initialized ... memory issues?
765 copy_supernode_to_host(blockR_dev, IB, copyStream);
766 //copy_supernode_to_host_write(blockR_dev, IB);
767
768 t_temp = get_time(t_temp);
769 t_copy_DtH += t_temp;
770
771#ifdef PRINT_MSG
772 std::cout << "entering block factorization loop now" << std::endl;
773#endif
774
775 //rj IB = 1; IB < NBlock; IB++
776 for (IB = 1; IB < matrix_nt-1; IB++)
777 {
778
779#ifdef PRINT_MSG
780 std::cout << "IB = " << IB << std::endl;
781#endif
782
783 NR = Bmax[IB]-Bmin[IB];
784 NM = Bmax[IB-1]-Bmin[IB-1];
785
786 double flop_1iter = 0.0;
787
788 //
789 swap_pointers(&blockR_dev, &blockM_dev);
790
791 nvtxRangeId_t id_initSN = nvtxRangeStartA("initSuperNode_inLoop");
792 init_supernode(blockR_dev, IB, magma_cudaStream_1);
793 nvtxRangeEnd(id_initSN);
794
795 // add cuda_event such that magma_queue_2/magma_cudaStream_2 wait for magma_cudaStream_1 to get here
796 // maybe better to use magma version?
797 gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
798 gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
799
800 // UPDATE NEXT DIAGONAL BLOCK
801 nvtxRangeId_t id_dgemm = nvtxRangeStartA("BigGemm_inLoop");
802 t_temp = get_time(0.0);
803 tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
804 t_temp = get_time(t_temp);
805 nvtxRangeEnd(id_dgemm);
806 t_dgemm += t_temp;
807
808 flop_count += 2.0 * NR * NR * NM;
809
810 // direct into separate stream -> becomes relevant when many fixed effects ...
811 if (matrix_nd > 0)
812 {
813 // Update dense rows of next super node IB
814 id_dgemm = nvtxRangeStartA("DenseLowerGemm_inLoop");
815 t_temp = get_time(0.0);
816 tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
817 t_dgemm += get_time(t_temp);
818 nvtxRangeEnd(id_dgemm);
819 // NM = NR
820 flop_count += 2.0*matrix_nd * NR * NM;
821
822 // update last diagonal block
823 id_dgemm = nvtxRangeStartA("LastBlockLowerGemm_inLoop");
824 tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
825 nvtxRangeEnd(id_dgemm);
826 flop_count += 2.0* matrix_nd *matrix_nd *NM;
827
828 }
829
830 //printf("RJ: tgemm NR: %d, NM: %d, a: %d, lda: %d, b: %d, ldb: %d, c: %d, ldc: %d\n", NR, NM, mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB), mf_block_lda(IB, IB));
831 //rj dpotrf MF[IB,IB]
832 t_temp = get_time(0.0);
833 nvtxRangeId_t id_dpotrf = nvtxRangeStartA("Potrf_inLoop");
834
835#ifdef CUDA_POTRF
836 if(NR != NM){
837 //printf("Different block size! Re-initialize cudaBufferSize. NR = %ld, NM = %ld\n", NR, NM);
838 cuda_buffer_flag_potrf[0] = 0;
839 }
840 tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
841 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
842#elif defined(MAGMA_EXPERT)
843 if(NR != NM){
844 //printf("Different block size! Re-initialize cudaBufferSize. NR = %ld, NM = %ld\n", NR, NM);
845 magma_potrf_init_flag = 0;
846 }
847 copy_supernode_to_host_write(blockR_dev, IB);
848 //magma_potrf_init_flag = 0;
849 magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
850 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
851 cudaDeviceSynchronize();
852 copy_supernode_to_host_write(blockR_dev, IB);
853#else
854 // how can I get the default stream to wait but make an exception for copyStream ?
855 cudaDeviceSynchronize(); // TO BE REPLACED
856 tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
857 cudaDeviceSynchronize();
858
859 //copy_supernode_to_host_write(blockR_dev, IB);
860 //exit(1);
861#endif
862
863 nvtxRangeEnd(id_dpotrf);
864 t_temp = get_time(t_temp);
865 t_potrf += t_temp;
866 //printf("IB = %d, t_potrf = %f\n", IB, t_temp);
867 flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
868
869 //gpuErrchk(cudaEventRecord(potrf_dev_ev, magma_cudaStream_1));
870 //gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, potrf_dev_ev, 0));
871
872 magma_event_record(potrf_dev_magma_ev, magma_queue_1);
873 magma_queue_wait_event(magma_queue_2, potrf_dev_magma_ev);
874
875 // triangular solve, all columns -> potentially split this ... when nd large ... two separate streams.
876 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(IB, IB), mf_block_lda(IB, IB));
877 //rj dtrsm RLTN MF[IB,IB] MF[IB+1,IB]
878 nvtxRangeId_t id_ttrsm = nvtxRangeStartA("ttrsm_inLoop");
879 //ttrsm_dev('R', 'L', 'T', 'N', NR+matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[NR], mf_block_lda(IB+1, IB), magma_queue_1);
880 ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[NR], mf_block_lda(IB+1, IB), magma_queue_1);
881
882 if(matrix_nd > 0){
883 ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
884 }
885
886 nvtxRangeEnd(id_ttrsm);
887
888 flop_count += (NR + matrix_nd) * NR * NR;
889 //printf("RJ: ttrsm NR: %d, a: %d, lda: %d, b: %d, ldb: %d\n", NR, mf_block_index(IB-1, IB-1), mf_block_lda(IB-1, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1));
890
891 t_temp = get_time(0.0);
892
893 nvtxRangeId_t id_Cpy_SN_host = nvtxRangeStartA("copy_SN_toHost_inLoop");
894 cudaDeviceSynchronize();
895 copy_supernode_to_host(blockR_dev, IB, copyStream);
896 //copy_supernode_to_host(blockR_dev, IB);
897 nvtxRangeEnd(id_Cpy_SN_host);
898 t_copy_DtH += get_time(t_temp);
899
900 } // end for loop IB
901
902 if (matrix_nt > 1)
903 {
904 IB = matrix_nt-1;
905 NR = Bmax[IB]-Bmin[IB];
906 NM = Bmax[IB-1]-Bmin[IB-1];
907
908#ifdef PRINT_MSG
909 std::cout << "IB = " << IB << std::endl;
910#endif
911
912 swap_pointers(&blockR_dev, &blockM_dev);
913 nvtxRangeId_t id_initSN = nvtxRangeStartA("initLastSuperNode");
914 init_supernode(blockR_dev, IB, magma_cudaStream_1);
915 //init_supernode(blockR_dev, IB);
916 nvtxRangeEnd(id_initSN);
917
918 gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
919 gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
920
921 //rj dgemm NT M[IB,IB-1] M[IB,IB-1]
922 // todo rj: 3-last parameter ZERO in PARDISO
923 nvtxRangeId_t id_dgemm = nvtxRangeStartA("LastLargeGemm");
924 tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
925 nvtxRangeEnd(id_dgemm);
926 flop_count += 2.0* NR * NR * NM;
927
928 if (matrix_nd > 0)
929 {
930 tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
931 flop_count += 2.0*matrix_nd * NR * NM;
932
933 tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
934 flop_count += 2.0*matrix_nd *matrix_nd *NM;
935 }
936
937 //printf("RJ: tgemm NR: %d, NM: %d, a: %d, lda: %d, b: %d, ldb: %d, c: %d, ldc: %d\n", NR, NM, mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB), mf_block_lda(IB, IB));
938 //rj dpotrf MF[IB,IB]
939 nvtxRangeId_t id_dpotrf = nvtxRangeStartA("LastPotrf");
940#ifdef CUDA_POTRF
941 if(NR != NM){
942 //printf("Different block size! Re-initialize cudaBufferSize. NR = %ld, NM = %ld\n", NR, NM);
943 cuda_buffer_flag_potrf[0] = 0;
944 }
945 tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
946 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
947#elif defined(MAGMA_EXPERT)
948 magma_potrf_init_flag = 0;
949
950 magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
951 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
952 //copy_supernode_to_host_write(blockR_dev, IB);
953 //exit(1);
954#else
955 cudaDeviceSynchronize(); // TO BE REPLACED
956 tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
957 cudaDeviceSynchronize();
958#endif
959 nvtxRangeEnd(id_dpotrf);
960 flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
961
962 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(IB, IB), mf_block_lda(IB, IB));
963 //rj dtrsm RLTN MF[IB,IB] MF[IB+1,IB]
964 if (matrix_nd > 0)
965 {
966 nvtxRangeId_t id_ttrsm = nvtxRangeStartA("LastTtrsm");
967 ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_1);
968 nvtxRangeEnd(id_ttrsm);
969 flop_count += matrix_nd * NR * NR;
970
971 }
972 //printf("RJ: ttrsm NR: %d, a: %d, lda: %d, b: %d, ldb: %d\n", NR, mf_block_index(IB-1, IB-1), mf_block_lda(IB-1, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1));
973 t_temp = get_time(0.0);
974
975 cudaDeviceSynchronize();
976 nvtxRangeId_t id_Cpy_SN_host = nvtxRangeStartA("copy_LastLarge_SN_toHost");
977 copy_supernode_to_host(blockR_dev, IB, copyStream);
978 //copy_supernode_to_host_write(blockR_dev, IB);
979 //copy_supernode_to_host(blockR_dev, IB);
980 nvtxRangeEnd(id_Cpy_SN_host);
981
982 t_copy_DtH += get_time(t_temp);
983
984 }
985
986 if (matrix_nd > 0)
987 {
988 IB = NBlock-1;
989 NR = Bmax[IB]-Bmin[IB];
990 NM = Bmax[IB-1]-Bmin[IB-1];
991
992#ifdef PRINT_MSG
993 std::cout << "IB = " << IB << std::endl;
994#endif
995
996 //rj dgemm NT M[IB,IB-1] M[IB,IB-1]
997 //rj NM is the same for all blocks 0..nt-1
998 tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_1);
999 flop_count += 2.0 * matrix_nd *matrix_nd *NM;
1000
1001 //printf("RJ: tgemm NR: %d, NM: %d, a: %d, lda: %d, b: %d, ldb: %d, c: %d, ldc: %d\n", NR, NM, mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB), mf_block_lda(IB, IB));
1002 //rj dpotrf MF[IB,IB]
1003 //tpotrf_dev('L', NR, blockDense_dev, mf_block_lda(IB, IB), &info);
1004#ifdef CUDA_POTRF
1005 if(NR != NM){
1006 //printf("Different block size! Re-initialize cudaBufferSize. NR = %ld, NM = %ld\n", NR, NM);
1007 cuda_buffer_flag_potrf[0] = 0;
1008 }
1009 tpotrf_dev_cuda('L', NR, blockDense_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
1010 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
1011#elif defined(MAGMA_EXPERT)
1012 magma_potrf_init_flag = 0;
1013
1014 magma_tpotrf_expert_wrapper('L', NR, blockDense_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
1015 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
1016 //copy_supernode_to_host_write(blockR_dev, IB);
1017 //exit(1);
1018#else
1019 tpotrf_dev('L', NR, blockDense_dev, mf_block_lda(IB, IB), &info);
1020#endif
1021
1022 flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
1023 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(IB, IB), mf_block_lda(IB, IB));
1024
1025#ifdef PRINT_MSG
1026 std::cout << "after complete Cholesky factorization" << std::endl;
1027#endif
1028 t_temp = get_time(0.0);
1029
1030 cudaDeviceSynchronize();
1031 copy_supernode_to_host(blockDense_dev, IB, copyStream);
1032 //copy_supernode_to_host_write(blockDense_dev, IB);
1033
1034 t_copy_DtH += get_time(t_temp);
1035
1036 }
1037
1038 cudaDeviceSynchronize(); // TO BE REPLACED
1039
1040 t_fact = get_time(t_fact);
1041 //std::cout << "gflop count : " << flop_count / 1e9 << std::endl;
1042 //std::cout << "time factorize : " << t_fact << ", sum time copy DtH : " << t_copy_DtH << ", sum t potrf : " << t_potrf << ", sum t dgemm : " << t_dgemm << std::endl;
1043 double gflops = flop_count / (1e9*t_fact);
1044 //printf("GFLOPS factorize : %f\n", gflops);
1045
1046 //std::cout << "time factorization Qxy : " << t_fact << std::endl;
1047 //std::cout << "gflop count Qxy : " << flop_count / 1e9 << std::endl;
1048 return gflops;
1049}
1050
1051/************************************************************************************************/
1052
1053// combine first & second Stage factor to do forward solve directly on GPU, i.e. L L^T x = b, solve L y = b
1054// directly when computing L recursively
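// In block form (sketch, 1-based indices), the interleaved forward substitution is:
//   y_1 = L_11^{-1} b_1
//   for i = 2..nt:
//     b_i      <- b_i      - L_{i,i-1} y_{i-1}
//     b_{nt+1} <- b_{nt+1} - L_{nt+1,i-1} y_{i-1}        (if nd > 0)
//     y_i = L_ii^{-1} b_i
//   b_{nt+1} <- b_{nt+1} - L_{nt+1,nt} y_nt;  y_{nt+1} = L_{nt+1,nt+1}^{-1} b_{nt+1}   (if nd > 0)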
1055template <class T>
1056double BTA<T>::FirstSecondStageFactor(size_t nrhs)
1057{
1058
1059#ifdef PRINT_MSG
1060 std::cout << "In FirstSecondStageFactor(), omp get thread num = " << omp_get_thread_num() << std::endl;
1061#endif
1062
1063 double t_potrf = 0;
1064 double t_dgemm = 0;
1065 double t_copy_DtH = 0;
1066 double t_temp;
1067
1068 double t_init_supernode = get_time(0.0);
1069
1070 int info;
1071 size_t IB;
1072 size_t NR,NM;
1073 T ONE = f_one();
1074
1075 // count floating point operations
1076 double flop_count = 0;
1077
1078 IB = 0;
1079 NR = Bmax[0]-Bmin[0];
1080
1081#ifdef PRINT_MSG
1082 std::cout << "IB = " << IB << std::endl;
1083#endif
1084
1085 /*int GPU_CurrRank;
1086 cudaGetDevice(&GPU_CurrRank);
1087 printf("in firstStageFactor. cudaGetDevice : %ld, supposed GPU rank : %ld\n", GPU_CurrRank, GPU_rank);*/
1088
1089 init_supernode(blockR_dev, IB, magma_cudaStream_1);
1090 if (matrix_nd > 0)
1091 {
1092 init_supernode(blockDense_dev, matrix_nt, magma_cudaStream_1);
1093 }
1094
1095#ifdef PRINT_MSG
1096 std::cout << "calling first Cholesky factorization now" << std::endl;
1097#endif
1098
1099 t_init_supernode = get_time(t_init_supernode);
1100 //printf("time init supernode in factorize: %f\n", t_init_supernode);
1101
1102 // start timer for counting flops
1103 double t_fact = get_time(0.0);
1104
1105 t_temp = get_time(0.0);
1106 // CHOLESKY FACTORISATION FIRST DIAGONAL BLOCK
1107#ifdef CUDA_POTRF
1108 // factorization first block -> set buffer flag to zero!
1109 cuda_buffer_flag_potrf[0] = 0;
1110 tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
1111 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
1112#elif defined(MAGMA_EXPERT)
1113 magma_potrf_init_flag = 0;
1114 magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
1115 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
1116 //copy_supernode_to_host_write(blockR_dev, IB);
1117 //exit(1);
1118#else
1119 // magma_queue currently not supported
1120 cudaDeviceSynchronize();
1121 tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
1122 cudaDeviceSynchronize();
1123#endif
1124 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(0, 0), mf_block_lda(0, 0));
1125 t_temp = get_time(t_temp);
1126 t_potrf += t_temp;
1127 //printf("IB = %d, t_potrf = %f\n", IB, t_temp);
1128 //flop_count += 1.0/3.0 * NR * NR * NR;
1129 flop_count += 1.0/3.0 * NR * NR * NR + 0.5 * NR * NR + 1.0/6.0 * NR;
1130
1131 // the trsm shouldn't be launched until potrf has finished
1132 //cudaDeviceSynchronize();
1133 gpuErrchk(cudaEventRecord(potrf_dev_ev, magma_cudaStream_1));
1134 gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, potrf_dev_ev, 0));
1135
1136 cudaDeviceSynchronize();
1137 // FORWARD SOLVE: when Cholesky factor for L11 computed solve L11 * y1 = b1
1138 //c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(0, 0)], mf_block_lda(0, 0), &rhs[Bmin[0]], matrix_size);
1139 // TODO: update magma queue?
1140 ttrsm_dev('L', 'L', 'N', 'N', NR, nrhs, ONE, blockR_dev, mf_block_lda(0,0), &rhs_dev[Bmin[0]], matrix_size, magma_queue_1);
1141 // flop_count += 2.0 * NR * NR * nrhs;
1142
1143 // TODO: if nt = 0 but nd != 0 => problem !!
1144 //rj dtrsm RLTN MF[IB,IB] MF[IB+1,IB]
1145 if (matrix_nt > 1)
1146 {
1147 t_temp = get_time(0.0);
1148 // UPDATE COLUMNS ACCORDING TO CHOLESKY FACTORISATION
1149 // separate the two operations
1150 //ttrsm_dev('R', 'L', 'T', 'N', NR+matrix_nd, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[NR], mf_block_lda(1, 0), magma_queue_1);
1151 ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[NR], mf_block_lda(1, 0), magma_queue_1);
1152
1153 t_temp = get_time(t_temp);
1154 t_dgemm += t_temp;
1155 flop_count += NR * NR * NR;
1156 }
1157
1158 if (matrix_nd > 0){
1159 //printf("mf_dense_block_offset(IB) : %ld\n", mf_dense_block_offset(IB));
1160 //printf("mf_dense_block_lda(IB) : %ld\n", mf_dense_block_lda(IB));
1161 ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
1162 flop_count += matrix_nd * NR * NR;
1163 }
1164
1165 t_temp = get_time(0.0);
1166 cudaDeviceSynchronize();
1167 // if copy supernode device to host happens while new supernode is being initialized ... memory issues?
1168 copy_supernode_to_host(blockR_dev, IB, copyStream);
1169 //copy_supernode_to_host_write(blockR_dev, IB);
1170 //exit(1);
1171
1172 t_temp = get_time(t_temp);
1173 t_copy_DtH += t_temp;
1174
1175#ifdef PRINT_MSG
1176 std::cout << "entering block factorization loop now" << std::endl;
1177#endif
1178
1179 //rj IB = 1; IB < NBlock; IB++
1180 for (IB = 1; IB < matrix_nt-1; IB++)
1181 {
1182
1183#ifdef PRINT_MSG
1184 std::cout << "IB = " << IB << std::endl;
1185#endif
1186
1187 NR = Bmax[IB]-Bmin[IB];
1188 NM = Bmax[IB-1]-Bmin[IB-1];
1189
1190 double flop_1iter = 0.0;
1191
1192 swap_pointers(&blockR_dev, &blockM_dev);
1193
1194 nvtxRangeId_t id_initSN = nvtxRangeStartA("initSuperNode_inLoop");
1195 init_supernode(blockR_dev, IB, magma_cudaStream_1);
1196 //init_supernode(blockR_dev, IB);
1197 nvtxRangeEnd(id_initSN);
1198
1199 // magma_queue_1 is on top of magma_cudaStream_1
1200 // add cuda_event such that magma_queue_2/magma_cudaStream_2 wait for magma_cudaStream_1 to get here
1201 // maybe better to use magma version?
1202 gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
1203 gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
1204
1205 nvtxRangeId_t id_dgemm = nvtxRangeStartA("BigGemm_inLoop");
1206 t_temp = get_time(0.0);
1207 tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
1208 t_temp = get_time(t_temp);
1209 nvtxRangeEnd(id_dgemm);
1210 t_dgemm += t_temp;
1211
1212 // dim dgemm: m = , n = , k =
1213 flop_count += 2.0 * NR * NR * NM;
1214
1215 // copy back y1
1216 /*T* y_host = new T[NR*NR];
1217 cudaMemcpy(y_host, &rhs_dev[Bmin[0]], NR*NR*sizeof(T), cudaMemcpyDeviceToHost );
1218 printf("rhs host(%ld) before gemm = ", IB-1);
1219 for(int i = 0; i<NR*NR; i++){
1220 printf(" %f ", y_host[i]);
1221 }
1222 printf("\n");
1223
1224 delete [] y_host;*/
1225
1226 // FORWARD SOLVE
1227 // compute b_i = b_i - L_ii-1 * x_i-1
1228 //c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs[Bmin[IB-1]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
1229 tgemm_dev('N', 'N', NR, nrhs, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &rhs_dev[Bmin[IB-1]], matrix_size, ONE, &rhs_dev[Bmin[IB]], matrix_size, magma_queue_1);
1230 //flop_count += 2.0 * NR * nrhs * NM;
1231
1232 // copy back blockR_dev
1233 /*size_t size_SN = 2*NR*NR+matrix_nd*NR;
1234 T* blockR_host = new T[size_SN];
1235 cudaMemcpy(blockR_host, blockM_dev, size_SN*sizeof(T), cudaMemcpyDeviceToHost );
1236 printf("\nChol host(%ld) = ", IB);
1237 for(int i = 0; i<size_SN; i++){
1238 printf(" %f ", blockR_host[i]);
1239 }
1240 printf("\n\n");*/
1241
1242 // direct into separate stream -> becomes relevant when many fixed effects ...
1243 if (matrix_nd > 0)
1244 {
1245 // Update dense rows of next super node IB
1246 id_dgemm = nvtxRangeStartA("DenseLowerGemm_inLoop");
1247 t_temp = get_time(0.0);
1248 tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
1249 t_dgemm += get_time(t_temp);
1250 nvtxRangeEnd(id_dgemm);
1251 // NM = NR
1252 flop_count += 2.0*matrix_nd * NR * NM;
1253
1254 // update last diagonal block
1255 id_dgemm = nvtxRangeStartA("LastBlockLowerGemm_inLoop");
1256 tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
1257 nvtxRangeEnd(id_dgemm);
1258 flop_count += 2.0* matrix_nd *matrix_nd *NM;
1259
1260 // FORWARD SOLVE
1261 // update b_n+1 = b_n+1 - L_ni * y_i
1262 //c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[i]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
1263 //flop_count += 2.0*NR * nrhs * NM;
1264 //printf("NBlock: %ld, Bmin[NBlock-1]: %ld\n", NBlock, Bmin[NBlock-1]);
1265 tgemm_dev('N', 'N', NR, nrhs, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &rhs_dev[Bmin[IB-1]], matrix_size, ONE, &rhs_dev[Bmin[NBlock-1]], matrix_size, magma_queue_2);
1266
1267 //
1268 }
1269
1270 //printf("RJ: tgemm NR: %d, NM: %d, a: %d, lda: %d, b: %d, ldb: %d, c: %d, ldc: %d\n", NR, NM, mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB), mf_block_lda(IB, IB));
1271 //rj dpotrf MF[IB,IB]
1272 t_temp = get_time(0.0);
1273 nvtxRangeId_t id_dpotrf = nvtxRangeStartA("Potrf_inLoop");
1274
1275#ifdef CUDA_POTRF
1276 if(NR != NM){
1277 //printf("Different block size! Re-initialize cudaBufferSize. NR = %ld, NM = %ld\n", NR, NM);
1278 cuda_buffer_flag_potrf[0] = 0;
1279 }
1280 tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
1281 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
1282#elif defined(MAGMA_EXPERT)
1283 if(NR != NM){
1284 //printf("Different block size! Re-initialize BufferSize. NR = %ld, NM = %ld\n", NR, NM);
1285 magma_potrf_init_flag = 0;
1286 }
1287 magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
1288 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
1289 //copy_supernode_to_host_write(blockR_dev, IB);
1290 //exit(1);
1291#else
1292 // how can I get the default stream to wait but make an exception for copyStream ?
1293 cudaDeviceSynchronize(); // TO BE REPLACED
1294 tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
1295 cudaDeviceSynchronize();
1296#endif
1297
1298 nvtxRangeEnd(id_dpotrf);
1299 t_temp = get_time(t_temp);
1300 t_potrf += t_temp;
1301 //printf("IB = %d, t_potrf = %f\n", IB, t_temp);
1302 flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
1303
1304 // FORWARD SOLVE: with newly computed Cholesky factor
1305 // solve for y_i = L^-1_i*b_i (which was updated before)
1306 //c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
1307 ttrsm_dev('L', 'L', 'N', 'N', NR, nrhs, ONE, blockR_dev, mf_block_lda(IB,IB), &rhs_dev[Bmin[IB]], matrix_size, magma_queue_1);
1308 //flop_count += 2.0*NR * NR * nrhs;
1309
1310 // rhs_dev[Bmin[IB]]
1311 /*T* rhs_host = new T[NR*NR];
1312 cudaMemcpy(rhs_host, &rhs_dev[Bmin[IB-1]], NR*NR*sizeof(T), cudaMemcpyDeviceToHost );
1313 printf("\nrhs host(%ld) after ttrsm = ", IB);
1314 for(int i = 0; i<NR*NR; i++){
1315 printf(" %f ", rhs_host[i]);
1316 }
1317 printf("\n\n");
1318
1319 delete [] rhs_host;*/
1320
1321 magma_event_record(potrf_dev_magma_ev, magma_queue_1);
1322 magma_queue_wait_event(magma_queue_2, potrf_dev_magma_ev);
1323
1324 // triangular solve, all columns -> potentially split this ... when nd large ... two separate streams.
1325 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(IB, IB), mf_block_lda(IB, IB));
1326 //rj dtrsm RLTN MF[IB,IB] MF[IB+1,IB]
1327 nvtxRangeId_t id_ttrsm = nvtxRangeStartA("ttrsm_inLoop");
1328 //ttrsm_dev('R', 'L', 'T', 'N', NR+matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[NR], mf_block_lda(IB+1, IB), magma_queue_1);
1329 ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[NR], mf_block_lda(IB+1, IB), magma_queue_1);
1330
1331 if(matrix_nd > 0){
1332 ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
1333 }
1334
1335 nvtxRangeEnd(id_ttrsm);
1336
1337 flop_count += (NR + matrix_nd) * NR * NR;
1338 //printf("RJ: ttrsm NR: %d, a: %d, lda: %d, b: %d, ldb: %d\n", NR, mf_block_index(IB-1, IB-1), mf_block_lda(IB-1, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1));
1339
1340 t_temp = get_time(0.0);
1341
1342 nvtxRangeId_t id_Cpy_SN_host = nvtxRangeStartA("copy_SN_toHost_inLoop");
1343 cudaDeviceSynchronize();
1344 copy_supernode_to_host(blockR_dev, IB, copyStream);
1345 //copy_supernode_to_host(blockR_dev, IB);
1346 nvtxRangeEnd(id_Cpy_SN_host);
1347 t_copy_DtH += get_time(t_temp);
1348
1349 } // end for loop IB
1350
1351 if (matrix_nt > 1)
1352 {
1353 IB = matrix_nt-1;
1354 NR = Bmax[IB]-Bmin[IB];
1355 NM = Bmax[IB-1]-Bmin[IB-1];
1356
1357#ifdef PRINT_MSG
1358 std::cout << "IB = " << IB << std::endl;
1359#endif
1360
1361 swap_pointers(&blockR_dev, &blockM_dev);
1362 nvtxRangeId_t id_initSN = nvtxRangeStartA("initLastSuperNode");
1363 init_supernode(blockR_dev, IB, magma_cudaStream_1);
1364 //init_supernode(blockR_dev, IB);
1365 nvtxRangeEnd(id_initSN);
1366
1367 gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
1368 gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
1369
1370 //rj dgemm NT M[IB,IB-1] M[IB,IB-1]
1371 // todo rj: 3-last parameter ZERO in PARDISO
1372 nvtxRangeId_t id_dgemm = nvtxRangeStartA("LastLargeGemm");
1373 tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
1374 nvtxRangeEnd(id_dgemm);
1375 flop_count += 2.0* NR * NR * NM;
1376
1377 // FORWARD SOLVE
1378 // compute b_i = b_i - L_ii-1 * y_i-1
1379 //c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs[Bmin[IB-1]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
1380 tgemm_dev('N', 'N', NR, nrhs, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &rhs_dev[Bmin[IB-1]], matrix_size, ONE, &rhs_dev[Bmin[IB]], matrix_size, magma_queue_1);
1381 //flop_count += 2.0 * NR * nrhs * NM;
1382
1383 if (matrix_nd > 0)
1384 {
1385 tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
1386 flop_count += 2.0*matrix_nd * NR * NM;
1387
1388 tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
1389 flop_count += 2.0*matrix_nd *matrix_nd *NM;
1390
1391 // FORWARD SOLVE
1392 // update b_n+1 = b_n+1 - L_ni * y_i
1393 //c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[i]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
1394 tgemm_dev('N', 'N', NR, nrhs, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &rhs_dev[Bmin[IB-1]], matrix_size, ONE, &rhs_dev[Bmin[NBlock-1]], matrix_size, magma_queue_2);
1395 //flop_count += 2.0*NR * nrhs * NM;
1396 }
1397
1398 //printf("RJ: tgemm NR: %d, NM: %d, a: %d, lda: %d, b: %d, ldb: %d, c: %d, ldc: %d\n", NR, NM, mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB), mf_block_lda(IB, IB));
1399 //rj dpotrf MF[IB,IB]
1400 nvtxRangeId_t id_dpotrf = nvtxRangeStartA("LastPotrf");
1401#ifdef CUDA_POTRF
1402 if(NR != NM){
1403 //printf("Different block size! Re-initialize cudaBufferSize. NR = %ld, NM = %ld\n", NR, NM);
1404 cuda_buffer_flag_potrf[0] = 0;
1405 }
1406 tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
1407 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
1408#elif defined(MAGMA_EXPERT)
1409 if(NR != NM){
1410 //printf("Different block size! Re-initialize BufferSize. NR = %ld, NM = %ld\n", NR, NM);
1411 magma_potrf_init_flag = 0;
1412 }
1413
1414 magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
1415 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
1416 //copy_supernode_to_host_write(blockR_dev, IB);
1417 //exit(1);
1418#else
1419 cudaDeviceSynchronize(); // TO BE REPLACED
1420 tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
1421 cudaDeviceSynchronize();
1422#endif
1423 nvtxRangeEnd(id_dpotrf);
1424 flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
1425
1426 // FORWARD SOLVE: with newly computed Cholesky factor
1427 // solve for y_i = L^-1_i*b_i (which was updated before)
1428 //c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
1429 ttrsm_dev('L', 'L', 'N', 'N', NR, nrhs, ONE, blockR_dev, mf_block_lda(IB,IB), &rhs_dev[Bmin[IB]], matrix_size, magma_queue_1);
1430 //flop_count += 2.0*NR * NR * nrhs;
1431
1432 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(IB, IB), mf_block_lda(IB, IB));
1433 //rj dtrsm RLTN MF[IB,IB] MF[IB+1,IB]
1434 if (matrix_nd > 0)
1435 {
1436 nvtxRangeId_t id_ttrsm = nvtxRangeStartA("LastTtrsm");
1437 ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_1);
1438 nvtxRangeEnd(id_ttrsm);
1439 flop_count += matrix_nd * NR * NR;
1440
1441 }
1442 //printf("RJ: ttrsm NR: %d, a: %d, lda: %d, b: %d, ldb: %d\n", NR, mf_block_index(IB-1, IB-1), mf_block_lda(IB-1, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1));
1443 t_temp = get_time(0.0);
1444
1445 cudaDeviceSynchronize();
1446 nvtxRangeId_t id_Cpy_SN_host = nvtxRangeStartA("copy_LastLarge_SN_toHost");
1447 copy_supernode_to_host(blockR_dev, IB, copyStream);
1448 //copy_supernode_to_host_write(blockR_dev, IB);
1449 //copy_supernode_to_host(blockR_dev, IB);
1450 nvtxRangeEnd(id_Cpy_SN_host);
1451
1452 t_copy_DtH += get_time(t_temp);
1453
1454 }
1455
1456 if (matrix_nd > 0)
1457 {
1458 IB = NBlock-1;
1459 NR = Bmax[IB]-Bmin[IB];
1460 NM = Bmax[IB-1]-Bmin[IB-1];
1461
1462#ifdef PRINT_MSG
1463 std::cout << "IB = " << IB << std::endl;
1464#endif
1465
1466 //rj dgemm NT M[IB,IB-1] M[IB,IB-1]
1467 //rj NM is the same for all blocks 0..nt-1
1468 tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_1);
1469 flop_count += 2.0 * matrix_nd *matrix_nd *NM;
1470
1471 // FORWARD SOLVE
1472 // update b_n+1 = b_n+1 - L_ni * y_i
1473 //c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[i]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
1474 tgemm_dev('N', 'N', NR, nrhs, NM, -ONE, &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &rhs_dev[Bmin[IB-1]], matrix_size, ONE, &rhs_dev[Bmin[NBlock-1]], matrix_size, magma_queue_1);
1475 //flop_count += 2.0*NR * nrhs * NM;
1476
1477 //printf("RJ: tgemm NR: %d, NM: %d, a: %d, lda: %d, b: %d, ldb: %d, c: %d, ldc: %d\n", NR, NM, mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB), mf_block_lda(IB, IB));
1478 //rj dpotrf MF[IB,IB]
1479 //tpotrf_dev('L', NR, blockDense_dev, mf_block_lda(IB, IB), &info);
1480#ifdef CUDA_POTRF
1481 if(NR != NM){
1482 //printf("Different block size! Re-initialize cudaBufferSize. NR = %ld, NM = %ld\n", NR, NM);
1483 cuda_buffer_flag_potrf[0] = 0;
1484 }
1485 tpotrf_dev_cuda('L', NR, blockDense_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
1486 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
1487#elif defined(MAGMA_EXPERT)
1488 if(NR != NM){
1489 //printf("Different block size! Re-initialize BufferSize. NR = %ld, NM = %ld\n", NR, NM);
1490 magma_potrf_init_flag = 0;
1491 }
1492 magma_tpotrf_expert_wrapper('L', NR, blockDense_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
1493 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
1494 //copy_supernode_to_host_write(blockR_dev, IB);
1495 //exit(1);
1496#else
1497 tpotrf_dev('L', NR, blockDense_dev, mf_block_lda(IB, IB), &info);
1498#endif
1499
1500 flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
1501 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(IB, IB), mf_block_lda(IB, IB));
1502
1503 // FORWARD SOLVE -- last block
1504 ttrsm_dev('L', 'L', 'N', 'N', NR, nrhs, ONE, blockDense_dev, mf_block_lda(IB, IB), &rhs_dev[Bmin[IB]], matrix_size, magma_queue_1);
1505 // flop_count += 2.0*NR * NR * nrhs;
1506
1507#ifdef PRINT_MSG
1508 std::cout << "after complete Cholesky factorization + forward solve" << std::endl;
1509#endif
1510 t_temp = get_time(0.0);
1511
1512 cudaDeviceSynchronize();
1513 copy_supernode_to_host(blockDense_dev, IB, copyStream);
1514 //copy_supernode_to_host_write(blockDense_dev, IB);
1515
1516 t_copy_DtH += get_time(t_temp);
1517
1518 }
1519
1520 cudaDeviceSynchronize(); // TO BE REPLACED
1521
1522 t_fact = get_time(t_fact);
1523 //std::cout << "gflop count : " << flop_count / 1e9 << std::endl;
1524 //std::cout << "time factorize : " << t_fact << ", sum time copy DtH : " << t_copy_DtH << ", sum t potrf : " << t_potrf << ", sum t dgemm : " << t_dgemm << std::endl;
1525 double gflops = flop_count / (1e9*t_fact);
1526 //printf("GFLOPS factorize : %f\n", gflops);
1527
1528 //std::cout << "time factorization Qxy : " << t_fact << std::endl;
1529 //std::cout << "gflop count Qxy : " << flop_count / 1e9 << std::endl;
1530 return gflops;
1531}
1532
1533/************************************************************************************************/
1534// dummy function to check calls
1535
1536template <class T>
1537double BTA<T>::FirstStageFactor_noCopyHost_testV(double &logDet){
1538
1539 printf("In Factorize noCopyHost test version.\n");
1540
1541 int info;
1542 size_t IB;
1543 size_t NR,NM;
1544 T ONE = f_one();
1545
1546 IB = 0;
1547 NR = Bmax[0]-Bmin[0];
1548
1549 nvtxRangeId_t id_initSN = nvtxRangeStartA("initSuperNodeDev");
1550 init_supernode(blockR_dev, IB, magma_cudaStream_1);
1551 nvtxRangeEnd(id_initSN);
1552 cudaDeviceSynchronize();
1553 copy_supernode_to_host_write(blockR_dev, IB);
1554
1555
1556 T* diag;
1557 diag = new T[NR];
1558
1559#ifdef MAGMA_EXPERT
1560 // loop where I copy from blockR_dev to blockM_dev & keep factorizing
1561 magma_potrf_init_flag = 0;
1562#endif
1563
1564 for(int i=0; i<2; i++){
1565#ifdef PRINT_MSG
1566 std::cout << "Copy to blockM_dev, i = " << i << ", mf_block_lda(IB, IB) = " << mf_block_lda(IB, IB) << std::endl;
1567#endif
1568 nvtxRangeId_t id_cpyDevDev = nvtxRangeStartA("CopySNtoblockMdev");
1569 tlacpy_dev('N', 2*NR, NR, blockR_dev, mf_block_lda(IB, IB), blockM_dev, mf_block_lda(IB, IB), magma_queue_1);
1570 nvtxRangeEnd(id_cpyDevDev);
1571
1572 //sleep(2);
1573 copy_supernode_to_host_write(blockM_dev, IB);
1574
1575#ifdef PRINT_MSG
1576 std::cout << "IB = " << IB << std::endl;
1577#endif
1578
1579 double t_fact = get_time(0.0);
1580
1581 // CHOLESKY FACTORISATION FIRST DIAGONAL BLOCK
1582 nvtxRangeId_t id_dpotrf = nvtxRangeStartA("firstPotrf");
1583#ifdef CUDA_POTRF
1584 tpotrf_dev_cuda('L', NR, blockM_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
1585 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
1586
1587#elif defined(MAGMA_EXPERT)
1588 copy_supernode_to_host_write(blockM_dev, IB);
1589 //sleep(2);
1590 magma_tpotrf_expert_wrapper('L', NR, blockM_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
1591 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
1592 copy_supernode_to_host_write(blockM_dev, IB);
1593#else
1594 cudaDeviceSynchronize();
1595 tpotrf_dev('L', NR, blockM_dev, mf_block_lda(IB, IB), &info);
1596#endif
1597 nvtxRangeEnd(id_dpotrf);
1598
1599 //sleep(2);
1600 copy_supernode_to_host_write(blockM_dev, IB);
1601
1602 id_cpyDevDev = nvtxRangeStartA("CopyDiagDevDev");
1603 copy_supernode_diag(blockM_dev, IB);
1604 nvtxRangeEnd(id_cpyDevDev);
1605
1606 id_cpyDevDev = nvtxRangeStartA("CopyDiagtoHost");
1607 cudaMemcpy(diag, diag_dev, NR*sizeof(T), cudaMemcpyDeviceToHost );
1608 nvtxRangeEnd(id_cpyDevDev);
1609
1610 id_cpyDevDev = nvtxRangeStartA("ComLogDetHost");
1611 // compute log sum
1612 log_sum(diag, NR, &logDet);
1613 logDet = 2*logDet;
1614
1615 printf("log Det : %f\n", logDet);
1616 nvtxRangeEnd(id_cpyDevDev);
1617 }
1618
1619 delete[] diag;
1620
1621 return 1.0;
1622
1623}
1624
1625
1626
1627
1628/************************************************************************************************/
1629
1630template <class T>
1631double BTA<T>::FirstStageFactor_noCopyHost(T &logDet)
1632{
1633
1634 //std::cout << "In FirstStageFactor_noCopyHost(), omp get thread num = " << omp_get_thread_num() << std::endl;
1635
1636 int info;
1637 size_t IB;
1638 size_t NR,NM;
1639 T ONE = f_one();
1640
1641 // count floating point operations
1642 double flop_count = 0;
1643
1644 double t_temp;
1645 double t_dgemm = 0;
1646
1647 IB = 0;
1648 NR = Bmax[0]-Bmin[0];
1649
1650 init_supernode(blockR_dev, IB, magma_cudaStream_1);
1651 if (matrix_nd > 0)
1652 {
1653 init_supernode(blockDense_dev, matrix_nt, magma_cudaStream_1);
1654 }
1655
1656#ifdef PRINT_MSG
1657 std::cout << "calling first Cholesky factorization now" << std::endl;
1658 std::cout << "IB = " << IB << std::endl;
1659#endif
1660
1661 double t_fact = get_time(0.0);
1662
1663 // CHOLESKY FACTORISATION FIRST DIAGONAL BLOCK
1664 nvtxRangeId_t id_dpotrf = nvtxRangeStartA("firstPotrf");
1665 //tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
1666
1667#ifdef CUDA_POTRF
1668 // factorization first block -> set buffer flag to zero!
1669 cuda_buffer_flag_potrf[0] = 0;
1670 tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
1671 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
1672#elif defined(MAGMA_EXPERT)
1673 magma_potrf_init_flag = 0;
1674 magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
1675 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
1676 //copy_supernode_to_host_write(blockR_dev, IB);
1677 //exit(1);
1678#else
1679 cudaDeviceSynchronize();
1680 tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
1681 cudaDeviceSynchronize();
1682#endif
1683
1684 nvtxRangeEnd(id_dpotrf);
1685 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(0, 0), mf_block_lda(0, 0));
1686 flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
1687
1688 gpuErrchk(cudaEventRecord(potrf_dev_ev, magma_cudaStream_1));
1689 gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, potrf_dev_ev, 0));
1690
1691 // TODO: if nt = 0 but nd != 0 => problem !!
1692 if (matrix_nt > 1)
1693 {
1694 t_temp = get_time(0.0);
1695 // UPDATE COLUMNS ACCORDING TO CHOLESKY FACTORISATION
1696 // separate the two operations
1697 //ttrsm_dev('R', 'L', 'T', 'N', NR+matrix_nd, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[NR], mf_block_lda(1, 0), magma_queue_1);
1698 ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[NR], mf_block_lda(1, 0), magma_queue_1);
1699
1700 t_temp = get_time(t_temp);
1701 t_dgemm += t_temp;
1702 flop_count += NR * NR * NR;
1703 }
1704
1705 if (matrix_nd > 0){
1706 //printf("mf_dense_block_offset(IB) : %ld\n", mf_dense_block_offset(IB));
1707 //printf("mf_dense_block_lda(IB) : %ld\n", mf_dense_block_lda(IB));
1708 ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
1709 flop_count += matrix_nd * NR * NR;
1710 }
1711
1712 cudaDeviceSynchronize();
1713 // replace all copy supernode to host by computing sum of current diagonal block
1714 //copy_supernode_to_host(blockR_dev, IB);
1715 copy_supernode_diag(blockR_dev, IB);
1716
1717#ifdef PRINT_MSG
1718 std::cout << "entering block factorization loop now" << std::endl;
1719#endif
1720
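 // Each iteration of the loop below processes one time block IB of the block-tridiagonal
 // (arrowhead) factor:
 //   1. swap blockR_dev/blockM_dev and load supernode IB into blockR_dev,
 //   2. Schur-complement update of the diagonal block (and, if nd > 0, of the dense rows
 //      and the final dense block) via tgemm on the two magma queues,
 //   3. Cholesky factorization of the updated diagonal block (tpotrf),
 //   4. triangular solves (ttrsm) for the sub-diagonal block and the dense rows,
 //   5. copy of the factor's diagonal (copy_supernode_diag), used later for the log-determinant.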
1721 //rj IB = 1; IB < NBlock; IB++
1722 for (IB = 1; IB < matrix_nt-1; IB++)
1723 {
1724
1725#ifdef PRINT_MSG
1726 std::cout << "IB = " << IB << std::endl;
1727#endif
1728
1729 NR = Bmax[IB]-Bmin[IB];
1730 NM = Bmax[IB-1]-Bmin[IB-1];
1731
1732 //printf("IB = %d. before copy supernode to device in loop.\n", IB);
1733 cudaDeviceSynchronize();
1734 swap_pointers(&blockR_dev, &blockM_dev);
1735 cudaDeviceSynchronize();
1736 init_supernode(blockR_dev, IB, magma_cudaStream_1);
1737 //printf("IB = %d. after copy supernode to device in loop.\n", IB);
1738
1739 //cudaDeviceSynchronize();
1740 gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
1741 gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
1742
1743 // UPDATE NEXT DIAGONAL BLOCK
1744 //rj dgemm NT M[IB,IB-1] M[IB,IB-1]
1745 // todo rj: 3-last parameter ZERO in PARDISO
1746 nvtxRangeId_t id_dgemm = nvtxRangeStartA("BigGemm_inLoop");
1747 tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
1748 nvtxRangeEnd(id_dgemm);
1749 flop_count += 2.0 * NR * NR * NR;
1750
1751 // these dgemms are independent of the previous one; put them in a separate stream, they only require the supernode to be loaded.
1752 if (matrix_nd > 0)
1753 {
1754 // Update dense rows of super node IB
1755 tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
1756 // NM = NR
1757 flop_count += 2.0*matrix_nd * NR * NM; // matrix_nd *NR *(NM+2) + matrix_nd *NR *NM
1758
1759 // update last diagonal block
1760 tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
1761 flop_count += 2.0 * matrix_nd * matrix_nd * NM;
1762 }
1763
1764 //printf("RJ: tgemm NR: %d, NM: %d, a: %d, lda: %d, b: %d, ldb: %d, c: %d, ldc: %d\n", NR, NM, mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB), mf_block_lda(IB, IB));
1765 //rj dpotrf MF[IB,IB]
1766 nvtxRangeId_t id_dpotrf = nvtxRangeStartA("Potrf_inLoopTest");
1767 //tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
1768#ifdef CUDA_POTRF
1769 if(NR != NM){
1770 //printf("Different block size! Re-initialize cudaBufferSize. NR = %ld, NM = %ld\n", NR, NM);
1771 cuda_buffer_flag_potrf[0] = 0;
1772 }
1773 tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
1774 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
1775#elif defined(MAGMA_EXPERT)
1776 if(NR != NM){
1777 printf("Different block size! Re-initialize BufferSize. NR = %ld, NM = %ld\n", NR, NM);
1778 magma_potrf_init_flag = 0;
1779 }
1780 magma_potrf_init_flag = 0;
1781 magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
1782 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
1783 //copy_supernode_to_host_write(blockR_dev, IB);
1784 //exit(1);
1785#else
1786 cudaDeviceSynchronize();
1787 tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
1788 cudaDeviceSynchronize();
1789#endif
1790 nvtxRangeEnd(id_dpotrf);
1791 flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
1792
1793 magma_event_record(potrf_dev_magma_ev, magma_queue_1);
1794 magma_queue_wait_event(magma_queue_2, potrf_dev_magma_ev);
1795
1796 // triangular solve, all columns -> potentially split this ... when nd large ... two separate streams.
1797 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(IB, IB), mf_block_lda(IB, IB));
1798 //rj dtrsm RLTN MF[IB,IB] MF[IB+1,IB]
1799 nvtxRangeId_t id_ttrsm = nvtxRangeStartA("ttrsm_inLoop");
1800 //ttrsm_dev('R', 'L', 'T', 'N', NR+matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[NR], mf_block_lda(IB+1, IB), magma_queue_1);
1801 ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[NR], mf_block_lda(IB+1, IB), magma_queue_1);
1802
1803 if(matrix_nd > 0){
1804 ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
1805 }
1806
1807 nvtxRangeEnd(id_ttrsm);
1808 //printf("RJ: ttrsm NR: %d, a: %d, lda: %d, b: %d, ldb: %d\n", NR, mf_block_index(IB-1, IB-1), mf_block_lda(IB-1, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1));
1809
1810 flop_count += (NR + matrix_nd) * NR * NR;
1811
1812 cudaDeviceSynchronize();
1813 //copy_supernode_to_host(blockR_dev, IB);
1814 copy_supernode_diag(blockR_dev, IB);
1815
1816 } // end loop IB
1817
1818 if (matrix_nt > 1)
1819 {
1820 IB = matrix_nt-1;
1821 NR = Bmax[IB]-Bmin[IB];
1822 NM = Bmax[IB-1]-Bmin[IB-1];
1823
1824 swap_pointers(&blockR_dev, &blockM_dev);
1825 init_supernode(blockR_dev, IB, magma_cudaStream_1);
1826
1827 gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
1828 gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
1829
1830 //rj dgemm NT M[IB,IB-1] M[IB,IB-1]
1831 // todo rj: 3-last parameter ZERO in PARDISO
1832 tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
1833 flop_count += 2.0* NR * NR * NR;
1834
1835 if (matrix_nd > 0)
1836 {
1837 tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
1838 flop_count += 2.0*matrix_nd * NR * NM;
1839
1840 tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
1841 flop_count += 2.0* matrix_nd * matrix_nd * NM;
1842 }
1843
1844 //printf("RJ: tgemm NR: %d, NM: %d, a: %d, lda: %d, b: %d, ldb: %d, c: %d, ldc: %d\n", NR, NM, mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB), mf_block_lda(IB, IB));
1845 //rj dpotrf MF[IB,IB]
1846 //tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
1847#ifdef CUDA_POTRF
1848 if(NR != NM){
1849 //printf("Different block size! Re-initialize cudaBufferSize. NR = %ld, NM = %ld\n", NR, NM);
1850 cuda_buffer_flag_potrf[0] = 0;
1851 }
1852 tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
1853 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
1854#elif defined(MAGMA_EXPERT)
1855 if(NR != NM){
1856 //printf("Different block size! Re-initialize BufferSize. NR = %ld, NM = %ld\n", NR, NM);
1857 magma_potrf_init_flag = 0;
1858 }
1859 magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
1860 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
1861 //copy_supernode_to_host_write(blockR_dev, IB);
1862 //exit(1);
1863#else
1864 cudaDeviceSynchronize();
1865 tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
1866 cudaDeviceSynchronize();
1867#endif
1868 flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
1869
1870
1871 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(IB, IB), mf_block_lda(IB, IB));
1872 //rj dtrsm RLTN MF[IB,IB] MF[IB+1,IB]
1873 if (matrix_nd > 0)
1874 {
1875 ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_1);
1876 flop_count += matrix_nd * NR * NR;
1877
1878 }
1879 //printf("RJ: ttrsm NR: %d, a: %d, lda: %d, b: %d, ldb: %d\n", NR, mf_block_index(IB-1, IB-1), mf_block_lda(IB-1, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1));
1880
1881 //copy_supernode_to_host(blockR_dev, IB);
1882 copy_supernode_diag(blockR_dev, IB);
1883
1884 }
1885
1886 if (matrix_nd > 0)
1887 {
1888 IB = NBlock-1;
1889 NR = Bmax[IB]-Bmin[IB];
1890 NM = Bmax[IB-1]-Bmin[IB-1];
1891
1892 //rj dgemm NT M[IB,IB-1] M[IB,IB-1]
1893 //rj NM is the same for all blocks 0..nt-1
1894 tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_1);
1895 flop_count += 2.0* matrix_nd * matrix_nd * NM;
1896
1897 //printf("RJ: tgemm NR: %d, NM: %d, a: %d, lda: %d, b: %d, ldb: %d, c: %d, ldc: %d\n", NR, NM, mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB-1), mf_block_lda(IB, IB-1), mf_block_index(IB, IB), mf_block_lda(IB, IB));
1898 //rj dpotrf MF[IB,IB]
1899 //tpotrf_dev('L', NR, blockDense_dev, mf_block_lda(IB, IB), &info);
1900#ifdef CUDA_POTRF
1901 if(NR != NM){
1902 //printf("Different block size! Re-initialize cudaBufferSize. NR = %ld, NM = %ld\n", NR, NM);
1903 cuda_buffer_flag_potrf[0] = 0;
1904 }
1905 tpotrf_dev_cuda('L', NR, blockDense_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf,
1906 handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
1907#elif defined(MAGMA_EXPERT)
1908 if(NR != NM){
1909 //printf("Different block size! Re-initialize BufferSize. NR = %ld, NM = %ld\n", NR, NM);
1910 magma_potrf_init_flag = 0;
1911 }
1912 magma_tpotrf_expert_wrapper('L', NR, blockDense_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host,
1913 device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
1914 //copy_supernode_to_host_write(blockR_dev, IB);
1915 //exit(1);
1916#else
1917 cudaDeviceSynchronize();
1918 tpotrf_dev('L', NR, blockDense_dev, mf_block_lda(IB, IB), &info);
1919 cudaDeviceSynchronize();
1920#endif
1921 flop_count += 1.0/3.0 * matrix_nd * matrix_nd * matrix_nd + 0.5*matrix_nd * matrix_nd + 1.0/6.0 * matrix_nd;
1922
1923 //printf("RJ: tpotrf NR: %d, a: %d, lda: %d\n\n", NR, mf_block_index(IB, IB), mf_block_lda(IB, IB));
1924
1925#ifdef PRINT_MSG
1926 std::cout << "after complete Cholesky factorization" << std::endl;
1927#endif
1928
1929 //copy_supernode_to_host(blockDense_dev, IB);
1930 copy_supernode_diag(blockDense_dev, IB);
1931
1932 }
1933
1934 t_fact = get_time(t_fact);
1935 //std::cout << "gflop count : " << flop_count / 1e9 << std::endl;
1936 //std::cout << "time factorize : " << t_fact << std::endl;
1937 double gflops = flop_count / (1e9*t_fact);
1938 //printf("GFLOPS factorize Qu : %f\n", gflops);
1939
1940 //std::cout << "time factorization Qu : " << t_fact << std::endl;
1941 //std::cout << "gflop count Qu : " << flop_count / 1e9 << std::endl;
1942
1943 double t_copy = -omp_get_wtime();
1944 // copy result from the device to the host:
1945 T* diag;
1946 diag = new T[matrix_size];
1947 cudaMemcpy(diag, diag_dev, matrix_size*sizeof(T), cudaMemcpyDeviceToHost );
1948
1949 // compute log sum
1950 log_sum(diag, matrix_size, &logDet);
1951 logDet = 2*logDet;
1952
1953 t_copy += omp_get_wtime();
1954
1955#ifdef PRINT_TIMES
1956 std::cout << "Log Det Copy & Compute time : " << t_copy << std::endl;
1957
1958#endif
1959 //printf("\nnew logDet = %f\n", logDet);
1960
1961 return gflops;
1962}
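// The log-determinant follows from the Cholesky factorization Q = L*L^T:
//   log det(Q) = 2 * sum_i log(L_ii),
// which is what log_sum() over the copied diagonal entries, followed by the doubling above,
// computes (assuming log_sum accumulates the logarithm of each diagonal entry).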
1963
1964/************************************************************************************************/
1965// divide SecondStageSolve into ForwardPass & BackwardPass to make functions usable separately
1966
1967template <class T>
1968double BTA<T>::ForwardPassSolve(size_t nrhs){
1969
1970 int info;
1971 size_t IB;
1972 size_t NR,NM,NP;
1973 T ONE = f_one();
1974
1975 NR = Bmax[0]-Bmin[0];
1976
1977 // counting FLOPS
1978 double flop_count = 0;
1979
1980#ifdef PRINT_MSG
1981 std::cout <<"calling first block solver in second stage factor now" << std::endl;
1982#endif
1983
1984 nvtxRangeId_t id_solve = nvtxRangeStartA("ForwardPassSolve");
1985
1986 double t_solve = get_time(0.0);
1987
1988 //rj dtrsm LLNN MF[0,0] rhs[0]
1989 // solve for first diagonal block (using solve for tridiagonal matrix), ie. x_0 = D^-0*b_0
1990 // solve for first diagonal block (triangular solve), i.e. x_0 = D^-1_0*b_0
1991 flop_count += 2.0 * NR * NR * nrhs;
1992
1993 //rj IB = 1; IB < NBlock; IB++
1994 for (IB = 1; IB < matrix_nt; IB++)
1995 {
1996 NR = Bmax[IB]-Bmin[IB];
1997 NM = Bmax[IB-1]-Bmin[IB-1];
1998
1999 // compute b_i = b_i - E_i-1 * x_i-1
2000 //rj dgemm NN M[IB,IB-1] rhs[IB-1] rhs[IB]
2001 c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs[Bmin[IB-1]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
2002 flop_count += 2.0 * NR * nrhs * NM;
2003 // solve for x_i = D^-1_i*b_i (which was updated before)
2004 //rj dtrsm LLNN MF[IB,IB] rhs[IB]
2005 c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
2006 flop_count += 2.0*NR * NR * nrhs;
2007 }
2008
2009 // dense rows at the end
2010 if (matrix_nd > 0)
2011 {
2012 IB = NBlock-1;
2013 NR = Bmax[IB]-Bmin[IB];
2014 NM = Bmax[IB-1]-Bmin[IB-1];
2015
2016 // compute b_n = b_n - (F_1*x_1 + F_2*x_2 + ... F_n-1*x_n-1)
2017 //rj dgemm NN M[IB,IB-1] rhs[IB-1] rhs[IB]
2018 for (size_t i = 0; i < NBlock-1; i++)
2019 {
2020 c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[i]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
2021 flop_count += 2.0*NR * nrhs * NM;
2022 }
2023 //rj dtrsm LLNN MF[IB,IB] rhs[IB]
2024 c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
2025 flop_count += 2.0*NR * NR * nrhs;
2026 }
2027 nvtxRangeEnd(id_solve);
2028 return flop_count;
2029
2030}
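// Forward substitution with the block Cholesky factor L (diagonal blocks D_i, sub-diagonal
// blocks E_i, dense bottom rows F_i, all stored in MF), solving L*y = b:
//   y_0 = D_0^-1 * b_0
//   y_i = D_i^-1 * (b_i - E_{i-1} * y_{i-1}),        i = 1, ..., nt-1
//   y_n = D_n^-1 * (b_n - sum_i F_i * y_i)           (only if nd > 0)
// The intermediate result y overwrites rhs and is consumed by BackwardPassSolve below.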
2031
2032// assumes rhs to contain the result from forward pass
2033template <class T>
2034double BTA<T>::BackwardPassSolve(size_t nrhs){
2035
2036 int info;
2037 size_t IB;
2038 size_t NR,NM,NP;
2039 T ONE = f_one();
2040
2041 double flop_count = 0;
2042
2043 double t_solve = get_time(0.0);
2044
2045 if (matrix_nd > 0)
2046 {
2047 IB = NBlock-1;
2048 NR = Bmax[IB]-Bmin[IB];
2049 NM = Bmax[IB-1]-Bmin[IB-1];
2050
2051 // solve D^T_n*x_n = y_n for x_n
2052 //rj dtrsm LLTN MF[NBlock-1,NBlock-1] rhs[NBlock-1]
2053 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
2054 flop_count += NR * NR * nrhs;
2055 // now update y everywhere : y_i = y_i - F^T_i * x_n
2056 //rj dgemm TN M[IB,IB-1] rhs[IB] rhs[IB-1]
2057 for (size_t i = 0; i < NBlock-1; i++)
2058 {
2059 c_tgemm('T', 'N', NM, nrhs, NR, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[IB]], matrix_size, ONE, &rhs[Bmin[i]], matrix_size);
2060 flop_count += 2.0*NM * nrhs * NR;
2061 }
2062 }
2063
2064 // loop for tridiagonal block structure (right-hand side y already includes updates from dense rows/columns)
2065 for (IB = matrix_nt-1; IB > 0; IB--)
2066 {
2067 NR = Bmax[IB]-Bmin[IB];
2068 NM = Bmax[IB-1]-Bmin[IB-1];
2069
2070 // compute D^T_i x_i = y_i
2071 //rj dtrsm LLTN MF[IB,IB] rhs[IB]
2072 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
2073 flop_count += NR * NR * nrhs;
2074 // compute y_i = y_i - E_i^T * x_i+1
2075 //rj dgemm TN M[IB,IB-1] rhs[IB] rhs[IB-1]
2076 c_tgemm('T', 'N', NM, nrhs, NR, -ONE, &MF[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs[Bmin[IB]], matrix_size, ONE, &rhs[Bmin[IB-1]], matrix_size);
2077 flop_count += 2.0 * NM * nrhs * NR;
2078 }
2079
2080 IB = 0;
2081 NR = Bmax[IB]-Bmin[IB];
2082
2083 // compute D^T_0 x_0 = y_0
2084 //rj dtrsm LLTN MF[NBlock-1,NBlock-1] rhs[NBlock-1]
2085 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
2086
2087 flop_count += NR * NR * nrhs;
2088 t_solve = get_time(t_solve);
2089
2090 double gflops = flop_count / (1e9*t_solve);
2091 //printf("GFLOPS solve : %f\n", gflops);
2092
2093#ifdef PRINT_MSG
2094 std::cout << "after forward-backward solve in 2nd stage factor" << std::endl;
2095#endif
2096
2097 return gflops;
2098
2099}
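// Backward substitution L^T * x = y on the result of the forward pass (stored in rhs),
// traversing the blocks in reverse order:
//   x_n = D_n^-T * y_n, then y_i -= F_i^T * x_n for all i < n     (only if nd > 0)
//   for i = nt-1, ..., 1:   x_i = D_i^-T * y_i,   y_{i-1} -= E_{i-1}^T * x_i
//   x_0 = D_0^-T * y_0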
2100
2101template <class T>
2102double BTA<T>::SecondStageSolve(size_t nrhs, double& t_secondStageForwardPass, double& t_secondStageBackwardPass)
2103{
2104 double gflops = 0;
2105
2106 t_secondStageForwardPass = get_time(0.0);
2107 gflops += ForwardPassSolve(nrhs);
2108 t_secondStageForwardPass = get_time(t_secondStageForwardPass);
2109
2110 t_secondStageBackwardPass = get_time(0.0);
2111 gflops += BackwardPassSolve(nrhs);
2112 t_secondStageBackwardPass = get_time(t_secondStageBackwardPass);
2113
2114 return gflops;
2115}
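// SecondStageSolve splits the triangular solve into the two passes above so that the
// forward and backward sweeps can be timed separately; the elapsed times are returned
// through t_secondStageForwardPass and t_secondStageBackwardPass.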
2116
2117
2118#if 0
2119// After factorisation of A = L*L^T solve (L*L^T) x = b by : forward pass L*y = b, backward pass L^T x = y
2120template <class T>
2121double BTA<T>::SecondStageSolve(size_t nrhs)
2122{
2123 int info;
2124 size_t IB;
2125 size_t NR,NM,NP;
2126 T ONE = f_one();
2127
2128 //Forward pass
2129 NR = Bmax[0]-Bmin[0];
2130
2131 // counting FLOPS
2132 double flop_count = 0;
2133
2134#ifdef PRINT_MSG
2135 std::cout <<"calling first block solver in second stage factor now" << std::endl;
2136#endif
2137
2138 nvtxRangeId_t id_solve = nvtxRangeStartA("solve");
2139
2140 double t_solve = get_time(0.0);
2141
2142 //rj dtrsm LLNN MF[0,0] rhs[0]
2143 // solve for first diagonal block (using solve for tridiagonal matrix), ie. x_0 = D^-0*b_0
2144 // solve for first diagonal block (triangular solve), i.e. x_0 = D^-1_0*b_0
2145 flop_count += 2.0 * NR * NR * nrhs;
2146
2147 //rj IB = 1; IB < NBlock; IB++
2148 for (IB = 1; IB < matrix_nt; IB++)
2149 {
2150 NR = Bmax[IB]-Bmin[IB];
2151 NM = Bmax[IB-1]-Bmin[IB-1];
2152
2153 // compute b_i = b_i - E_i-1 * x_i-1
2154 //rj dgemm NN M[IB,IB-1] rhs[IB-1] rhs[IB]
2155 c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs[Bmin[IB-1]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
2156 flop_count += 2.0 * NR * nrhs * NM;
2157 // solve for x_i = D^-1_i*b_i (which was updated before)
2158 //rj dtrsm LLNN MF[IB,IB] rhs[IB]
2159 c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
2160 flop_count += 2.0*NR * NR * nrhs;
2161 }
2162
2163 // dense rows at the end
2164 if (matrix_nd > 0)
2165 {
2166 IB = NBlock-1;
2167 NR = Bmax[IB]-Bmin[IB];
2168 NM = Bmax[IB-1]-Bmin[IB-1];
2169
2170 // compute b_n = b_n - (F_1*x_1 + F_2*x_2 + ... F_n-1*x_n-1)
2171 //rj dgemm NN M[IB,IB-1] rhs[IB-1] rhs[IB]
2172 for (size_t i = 0; i < NBlock-1; i++)
2173 {
2174 c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[i]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
2175 flop_count += 2.0*NR * nrhs * NM;
2176 }
2177 //rj dtrsm LLNN MF[IB,IB] rhs[IB]
2178 c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
2179 flop_count += 2.0*NR * NR * nrhs;
2180 }
2181
2182 //Backward pass
2183 if (matrix_nd > 0)
2184 {
2185 IB = NBlock-1;
2186 NR = Bmax[IB]-Bmin[IB];
2187 NM = Bmax[IB-1]-Bmin[IB-1];
2188
2189 // solve D^T_n*x_n = y_n for x_n
2190 //rj dtrsm LLTN MF[NBlock-1,NBlock-1] rhs[NBlock-1]
2191 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
2192 flop_count += NR * NR * nrhs;
2193 // now update y everywhere : y_i = y_i - F^T_i * x_n
2194 //rj dgemm TN M[IB,IB-1] rhs[IB] rhs[IB-1]
2195 for (size_t i = 0; i < NBlock-1; i++)
2196 {
2197 c_tgemm('T', 'N', NM, nrhs, NR, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[IB]], matrix_size, ONE, &rhs[Bmin[i]], matrix_size);
2198 flop_count += 2.0*NM * nrhs * NR;
2199 }
2200 }
2201
2202 // loop for tridiagonal block structure (right-hand side y already includes updates from dense rows/columns)
2203 for (IB = matrix_nt-1; IB > 0; IB--)
2204 {
2205 NR = Bmax[IB]-Bmin[IB];
2206 NM = Bmax[IB-1]-Bmin[IB-1];
2207
2208 // compute D^T_i x_i = y_i
2209 //rj dtrsm LLTN MF[IB,IB] rhs[IB]
2210 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
2211 flop_count += NR * NR * nrhs;
2212 // compute y_i = y_i - E_i^T * x_i+1
2213 //rj dgemm TN M[IB,IB-1] rhs[IB] rhs[IB-1]
2214 c_tgemm('T', 'N', NM, nrhs, NR, -ONE, &MF[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs[Bmin[IB]], matrix_size, ONE, &rhs[Bmin[IB-1]], matrix_size);
2215 flop_count += 2.0 * NM * nrhs * NR;
2216 }
2217
2218 IB = 0;
2219 NR = Bmax[IB]-Bmin[IB];
2220
2221 // compute D^T_0 x_0 = y_0
2222 //rj dtrsm LLTN MF[NBlock-1,NBlock-1] rhs[NBlock-1]
2223 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
2224
2225 flop_count += NR * NR * nrhs;
2226 t_solve = get_time(t_solve);
2227
2228 nvtxRangeEnd(id_solve);
2229
2230 double gflops = flop_count / (1e9*t_solve);
2231 //printf("GFLOPS solve : %f\n", gflops);
2232
2233
2234#ifdef PRINT_MSG
2235 std::cout << "after forward-backward solve in 2nd stage factor" << std::endl;
2236#endif
2237
2238
2239 return gflops;
2240}
2241#endif
2242
2243/************************************************************************************************/
2244
2245// After factorisation of A = L*L^T solve (L*L^T) x = b by : forward pass L*y = b, backward pass L^T x = y
2246template <class T>
2247double BTA<T>::SecondStageSolve_d(size_t nrhs, double* rhs_d)
2248{
2249 printf("in SecondStageSolve_d()\n");
2250
2251 int info;
2252 size_t IB;
2253 size_t NR,NM,NP;
2254 double ONE_d = f_one();
2255
2256 // do CONVERSION OF MF!!
2257 // ========================================= //
2258 double* MF_d = new double[matrix_n_nonzeros];
2259
2260 double t_MF_conv = get_time(0.0);
2261
2262 for(int i = 0; i<matrix_n_nonzeros; i++){
2263 MF_d[i] = (double) MF[i];
2264 }
2265
2266 t_MF_conv = get_time(t_MF_conv);
2267 printf("time spent conversion to MF double prec: %f\n", t_MF_conv);
2268
2269 // ========================================= //
2270
2271 //Forward pass
2272 NR = Bmax[0]-Bmin[0];
2273
2274 // counting FLOPS
2275 double flop_count = 0;
2276
2277#ifdef PRINT_MSG
2278 std::cout <<"calling first block solver in second stage factor now" << std::endl;
2279#endif
2280
2281 nvtxRangeId_t id_solve = nvtxRangeStartA("solve");
2282
2283 double t_solve = get_time(0.0);
2284
2285 //rj dtrsm LLNN MF[0,0] rhs[0]
2286 // solve for first diagonal block (triangular solve), i.e. x_0 = D^-1_0*b_0
2287 c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(0, 0)], mf_block_lda(0, 0), &rhs_d[Bmin[0]], matrix_size);
2288 flop_count += 2.0 * NR * NR * nrhs;
2289
2290 //rj IB = 1; IB < NBlock; IB++
2291 for (IB = 1; IB < matrix_nt; IB++)
2292 {
2293 NR = Bmax[IB]-Bmin[IB];
2294 NM = Bmax[IB-1]-Bmin[IB-1];
2295
2296 // compute b_i = b_i - E_i-1 * x_i-1
2297 //rj dgemm NN M[IB,IB-1] rhs[IB-1] rhs[IB]
2298 c_tgemm('N', 'N', NR, nrhs, NM, -ONE_d, &MF_d[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs_d[Bmin[IB-1]], matrix_size, ONE_d, &rhs_d[Bmin[IB]], matrix_size);
2299 flop_count += 2.0 * NR * nrhs * NM;
2300 // solve for x_i = D^-1_i*b_i (which was updated before)
2301 //rj dtrsm LLNN MF[IB,IB] rhs[IB]
2302 c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_d[Bmin[IB]], matrix_size);
2303 flop_count += 2.0*NR * NR * nrhs;
2304 }
2305
2306 // dense rows at the end
2307 if (matrix_nd > 0)
2308 {
2309 IB = NBlock-1;
2310 NR = Bmax[IB]-Bmin[IB];
2311 NM = Bmax[IB-1]-Bmin[IB-1];
2312
2313 // compute b_n = b_n - (F_1*x_1 + F_2*x_2 + ... F_n-1*x_n-1)
2314 //rj dgemm NN M[IB,IB-1] rhs[IB-1] rhs[IB]
2315 for (size_t i = 0; i < NBlock-1; i++)
2316 {
2317 c_tgemm('N', 'N', NR, nrhs, NM, -ONE_d, &MF_d[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs_d[Bmin[i]], matrix_size, ONE_d, &rhs_d[Bmin[IB]], matrix_size);
2318 flop_count += 2.0*NR * nrhs * NM;
2319 }
2320 //rj dtrsm LLNN MF[IB,IB] rhs[IB]
2321 c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_d[Bmin[IB]], matrix_size);
2322 flop_count += 2.0*NR * NR * nrhs;
2323 }
2324
2325 //Backward pass
2326 if (matrix_nd > 0)
2327 {
2328 IB = NBlock-1;
2329 NR = Bmax[IB]-Bmin[IB];
2330 NM = Bmax[IB-1]-Bmin[IB-1];
2331
2332 // solve D^T_n*x_n = y_n for x_n
2333 //rj dtrsm LLTN MF[NBlock-1,NBlock-1] rhs[NBlock-1]
2334 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_d[Bmin[IB]], matrix_size);
2335 flop_count += NR * NR * nrhs;
2336 // now update y everywhere : y_i = y_i - F^T_i * x_n
2337 //rj dgemm TN M[IB,IB-1] rhs[IB] rhs[IB-1]
2338 for (size_t i = 0; i < NBlock-1; i++)
2339 {
2340 c_tgemm('T', 'N', NM, nrhs, NR, -ONE_d, &MF_d[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs_d[Bmin[IB]], matrix_size, ONE_d, &rhs_d[Bmin[i]], matrix_size);
2341 flop_count += 2.0*NM * nrhs * NR;
2342 }
2343 }
2344
2345 // loop for tridiagonal block structure (right-hand side y already includes updates from dense rows/columns)
2346 for (IB = matrix_nt-1; IB > 0; IB--)
2347 {
2348 NR = Bmax[IB]-Bmin[IB];
2349 NM = Bmax[IB-1]-Bmin[IB-1];
2350
2351 // compute D^T_i x_i = y_i
2352 //rj dtrsm LLTN MF[IB,IB] rhs[IB]
2353 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_d[Bmin[IB]], matrix_size);
2354 flop_count += NR * NR * nrhs;
2355 // compute y_i = y_i - E_i^T * x_i+1
2356 //rj dgemm TN M[IB,IB-1] rhs[IB] rhs[IB-1]
2357 c_tgemm('T', 'N', NM, nrhs, NR, -ONE_d, &MF_d[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs_d[Bmin[IB]], matrix_size, ONE_d, &rhs_d[Bmin[IB-1]], matrix_size);
2358 flop_count += 2.0 * NM * nrhs * NR;
2359 }
2360
2361 IB = 0;
2362 NR = Bmax[IB]-Bmin[IB];
2363
2364 // compute D^T_0 x_0 = y_0
2365 //rj dtrsm LLTN MF[NBlock-1,NBlock-1] rhs[NBlock-1]
2366 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_d[Bmin[IB]], matrix_size);
2367
2368 flop_count += NR * NR * nrhs;
2369 t_solve = get_time(t_solve);
2370
2371 nvtxRangeEnd(id_solve);
2372
2373 double gflops = flop_count / (1e9*t_solve);
2374 //printf("GFLOPS solve : %f\n", gflops);
2375
2376
2377#ifdef PRINT_MSG
2378 std::cout << "after forward-backward solve in 2nd stage factor" << std::endl;
2379#endif
2380
2381 delete[] MF_d;
2382 return gflops;
2383}
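// SecondStageSolve_d (and SecondStageSolve_s below) re-run the forward/backward triangular
// solves in a fixed precision regardless of the template type T: the factor MF is converted
// element-wise into a temporary double (resp. float) copy and the same passes are applied to
// the matching-precision right-hand side. The conversion cost is reported separately, before
// the solve timing starts.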
2384
2385/************************************************************************************************/
2386
2387/************************************************************************************************/
2388
2389// After factorisation of A = L*L^T solve (L*L^T) x = b by : forward pass L*y = b, backward pass L^T x = y
2390template <class T>
2391double BTA<T>::SecondStageSolve_s(size_t nrhs, float* rhs_s)
2392{
2393 printf("in SecondStageSolve_s()\n");
2394
2395 int info;
2396 size_t IB;
2397 size_t NR,NM,NP;
2398 float ONE_s = f_one();
2399
2400 // do CONVERSION OF MF!!
2401 // ========================================= //
2402 float* MF_s = new float[matrix_n_nonzeros];
2403
2404 double t_MF_conv = get_time(0.0);
2405
2406 for(int i = 0; i<matrix_n_nonzeros; i++){
2407 MF_s[i] = (float) MF[i];
2408 }
2409
2410 t_MF_conv = get_time(t_MF_conv);
2411 printf("time spent conversion to MF single prec: %f\n", t_MF_conv);
2412
2413 // ========================================= //
2414
2415 //Forward pass
2416 NR = Bmax[0]-Bmin[0];
2417
2418 // counting FLOPS
2419 double flop_count = 0;
2420
2421#ifdef PRINT_MSG
2422 std::cout <<"calling first block solver in second stage factor now" << std::endl;
2423#endif
2424
2425 nvtxRangeId_t id_solve = nvtxRangeStartA("solve");
2426
2427 double t_solve = get_time(0.0);
2428
2429 //rj dtrsm LLNN MF[0,0] rhs[0]
2430 // solve for first diagonal block (triangular solve), i.e. x_0 = D^-1_0*b_0
2431 c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(0, 0)], mf_block_lda(0, 0), &rhs_s[Bmin[0]], matrix_size);
2432 flop_count += 2.0 * NR * NR * nrhs;
2433
2434 //rj IB = 1; IB < NBlock; IB++
2435 for (IB = 1; IB < matrix_nt; IB++)
2436 {
2437 NR = Bmax[IB]-Bmin[IB];
2438 NM = Bmax[IB-1]-Bmin[IB-1];
2439
2440 // compute b_i = b_i - E_i-1 * x_i-1
2441 //rj dgemm NN M[IB,IB-1] rhs[IB-1] rhs[IB]
2442 c_tgemm('N', 'N', NR, nrhs, NM, -ONE_s, &MF_s[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs_s[Bmin[IB-1]], matrix_size, ONE_s, &rhs_s[Bmin[IB]], matrix_size);
2443 flop_count += 2.0 * NR * nrhs * NM;
2444 // solve for x_i = D^-1_i*b_i (which was updated before)
2445 //rj dtrsm LLNN MF[IB,IB] rhs[IB]
2446 c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_s[Bmin[IB]], matrix_size);
2447 flop_count += 2.0*NR * NR * nrhs;
2448 }
2449
2450 // dense rows at the end
2451 if (matrix_nd > 0)
2452 {
2453 IB = NBlock-1;
2454 NR = Bmax[IB]-Bmin[IB];
2455 NM = Bmax[IB-1]-Bmin[IB-1];
2456
2457 // compute b_n = b_n - (F_1*x_1 + F_2*x_2 + ... F_n-1*x_n-1)
2458 //rj dgemm NN M[IB,IB-1] rhs[IB-1] rhs[IB]
2459 for (size_t i = 0; i < NBlock-1; i++)
2460 {
2461 c_tgemm('N', 'N', NR, nrhs, NM, -ONE_s, &MF_s[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs_s[Bmin[i]], matrix_size, ONE_s, &rhs_s[Bmin[IB]], matrix_size);
2462 flop_count += 2.0*NR * nrhs * NM;
2463 }
2464 //rj dtrsm LLNN MF[IB,IB] rhs[IB]
2465 c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_s[Bmin[IB]], matrix_size);
2466 flop_count += 2.0*NR * NR * nrhs;
2467 }
2468
2469 //Backward pass
2470 if (matrix_nd > 0)
2471 {
2472 IB = NBlock-1;
2473 NR = Bmax[IB]-Bmin[IB];
2474 NM = Bmax[IB-1]-Bmin[IB-1];
2475
2476 // solve D^T_n*x_n = y_n for x_n
2477 //rj dtrsm LLTN MF[NBlock-1,NBlock-1] rhs[NBlock-1]
2478 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_s[Bmin[IB]], matrix_size);
2479 flop_count += NR * NR * nrhs;
2480 // now update y everywhere : y_i = y_i - F^T_i * x_n
2481 //rj dgemm TN M[IB,IB-1] rhs[IB] rhs[IB-1]
2482 for (size_t i = 0; i < NBlock-1; i++)
2483 {
2484 c_tgemm('T', 'N', NM, nrhs, NR, -ONE_s, &MF_s[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs_s[Bmin[IB]], matrix_size, ONE_s, &rhs_s[Bmin[i]], matrix_size);
2485 flop_count += 2.0*NM * nrhs * NR;
2486 }
2487 }
2488
2489 // loop for tridiagonal block structure (right-hand side y already includes updates from dense rows/columns)
2490 for (IB = matrix_nt-1; IB > 0; IB--)
2491 {
2492 NR = Bmax[IB]-Bmin[IB];
2493 NM = Bmax[IB-1]-Bmin[IB-1];
2494
2495 // compute D^T_i x_i = y_i
2496 //rj dtrsm LLTN MF[IB,IB] rhs[IB]
2497 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_s[Bmin[IB]], matrix_size);
2498 flop_count += NR * NR * nrhs;
2499 // compute y_i = y_i - E_i^T * x_i+1
2500 //rj dgemm TN M[IB,IB-1] rhs[IB] rhs[IB-1]
2501 c_tgemm('T', 'N', NM, nrhs, NR, -ONE_s, &MF_s[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs_s[Bmin[IB]], matrix_size, ONE_s, &rhs_s[Bmin[IB-1]], matrix_size);
2502 flop_count += 2.0 * NM * nrhs * NR;
2503 }
2504
2505 IB = 0;
2506 NR = Bmax[IB]-Bmin[IB];
2507
2508 // compute D^T_0 x_0 = y_0
2509 //rj dtrsm LLTN MF[NBlock-1,NBlock-1] rhs[NBlock-1]
2510 c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_s[Bmin[IB]], matrix_size);
2511
2512 flop_count += NR * NR * nrhs;
2513 t_solve = get_time(t_solve);
2514
2515 nvtxRangeEnd(id_solve);
2516
2517 double gflops = flop_count / (1e9*t_solve);
2518 //printf("GFLOPS solve : %f\n", gflops);
2519
2520
2521#ifdef PRINT_MSG
2522 std::cout << "after forward-backward solve in 2nd stage factor" << std::endl;
2523#endif
2524
2525 delete[] MF_s;
2526 return gflops;
2527}
2528
2529/************************************************************************************************/
2530
2531template <class T>
2532double BTA<T>::ThirdStageBTA(T *tmp1_dev,T *tmp2_dev, int cpy_indicator)
2533{
2534 int info;
2535 size_t IB;
2536 size_t NR,NM,NP,ND;
2537 T ONE = f_one();
2538 T ZERO = f_zero();
2539
2540 ND = matrix_nd;
2541
2542 // **************************** better spot? **************************** //
2543 // need identity matrix of size ns*ns for later
2544 T *eye_dev;
2545 mem_alloc_dev = allocate_data_on_device((void**)&eye_dev,matrix_ns*matrix_ns*sizeof(T));
2546 init_eye_on_dev(eye_dev,matrix_ns,0);
2547
2548 // extra buffer allocation
2549 T *G_LastDense_dev; // store Sigma_n+1n+1 block
2550 T *G_dense_i_dev; // store Sigma_n+1i block -> size ns*nb (i.e. b_size*nb)
2551 T *tmp3_dev; // extra buffer of size nb*ns (ie. nb*b_size)
2552 T *tmp4_dev; // extra buffer of size ns*ns
2553
2554 allocate_data_on_device((void**)&G_LastDense_dev,ND*ND*sizeof(T));
2555 allocate_data_on_device((void**)&G_dense_i_dev ,ND*matrix_ns*sizeof(T));
2556 allocate_data_on_device((void**)&tmp3_dev ,ND*matrix_ns*sizeof(T));
2557 mem_alloc_dev = allocate_data_on_device((void**)&tmp4_dev ,matrix_ns*matrix_ns*sizeof(T));
2558
2559#ifdef PRINT_MSG
2560 printf("in 3rd Stage Factor. Allocated GPU memory sel Inv: %ld, cpy ind %d\n", mem_alloc_dev, cpy_indicator);
2561#endif
2562
2563#ifdef PRINT_MSG
2564 // temporary buffer on host, just to check results ...
2565 T *tmp3; // size ns*nb
2566 T *tmp4; // size ns*ns
2567
2568 cudaMallocHost(&tmp3,ND*matrix_ns*sizeof(T));
2569 cudaMallocHost(&tmp4,matrix_ns*matrix_ns*sizeof(T));
2570#endif
2571
2572 //printf("\n*** new version selected inverse... ***\n\n");
2573
2574 double flop_count = 0;
2575 nvtxRangeId_t id_inv = nvtxRangeStartA("Inversion");
2576 double t_inv = get_time(0.0);
2577
2578 if (matrix_nd > 0)
2579 {
2580 //dense block
2581 IB = NBlock-1;
2582 NR = Bmax[IB]-Bmin[IB];
2583
2584 // *************** Gn+1n+1 ************ //
2585 // copies lower triangular part of blockDense_dev to G_LastDense_dev
2586 tlacpy_dev('L', NR, NR, blockDense_dev, mf_block_lda(IB, IB), G_LastDense_dev, NR, magma_queue_1);
2587
2588 cudaDeviceSynchronize();
2589
2590 // make matrix lower triangular ?!
2591 // redirect to magma_queue_1
2592 tril_dev(G_LastDense_dev, NR, NR);
2593
2594 // solve inv(A)*L = inv(U) for inv(A) by inverting U
2595 // G = L^-T * L^-1 => G*L = L^-T
2596 // tmp1_dev contains lower-triangular matrix, if 2nd argument 'N', maybe means U = L^T?
2597 // only computes inv(L)??
2598 // redirect to magma_queue_1
2599
2600#ifdef CUDA_POTRF
2601 cuda_buffer_flag_trtri[0] = 0;
2602 ttrtri_dev_cuda('L', 'N', NR, G_LastDense_dev, NR, info_cuda, cuda_buffer_flag_trtri,
2603 handle, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
2604#else
2605 // magma_queue currently not supported
2606 cudaDeviceSynchronize();
2607 ttrtri_dev('L', 'N', NR, G_LastDense_dev, NR, &info);
2608#endif
2609 flop_count += 1.0/3.0 * NR * NR * NR + 2.0/3.0 * NR;
2610 cudaDeviceSynchronize();
2611
2612 // if tmp1_dev contains L^-1, then
2613 // C = L^-T*L^-1 -> beta = 0, write into blockDense_dev
2614 tgemm_dev('T', 'N', NR, NR, NR, ONE, G_LastDense_dev, NR, G_LastDense_dev, NR, ZERO, blockDense_dev, mf_block_lda(IB, IB), magma_queue_1);
2615 flop_count += 2.0 * NR * NR * NR;
2616 if(cpy_indicator == 2){
2617 cudaDeviceSynchronize();
2618 // instead of full block, just copy elements that are nnz in Q
2619
2620 // compute indices: total ns^2*nt+nb^2 -> want to write into last nb^2 indices
2621 ind_invBlks_fi = invblks_diag_block_index(IB);
2622 //memcpy_to_host(&invBlks[ind_invBlks_fi], blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
2623 //memcpy_to_host(invBlks, blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
2624 cudaMemcpy(&invBlks[ind_invBlks_fi], blockDense_dev, matrix_nd*matrix_nd*sizeof(T), cudaMemcpyDeviceToHost);
2625
2626#ifdef PRINT_MSG
2627 printf("first index final small diagonal block: %ld\n", ind_invBlks_fi);
2628 printf("final small block : \n");
2629 for(int i=0; i<ND*ND; i++){
2630 printf("%f ", invBlks[ind_invBlks_fi+i]);
2631 }
2632 printf("\n");
2633#endif
2634
2635 }
2636
2637 //last non-dense block
2638 IB = NBlock-2;
2639 NR = Bmax[IB]-Bmin[IB];
2640 NP = Bmax[IB+1]-Bmin[IB+1];
2641
2642 //printf("IB = %d, NR = %d, NP = %d\n", IB, NR, NP);
2643
2644 // *************** Gnn ************ //
2645 // redirect to magma_queue_1
2646 tril_dev(blockR_dev, mf_block_lda(IB, IB), NR);
2647 // solve inv(A)*L = inv(U) for inv(A) by inverting U
2648 // G = L^-T * L^-1 => G*L = L^-T
2649 // redirect to magma_queue_1
2650#ifdef CUDA_POTRF
2651 // factorization first block -> set buffer flag to zero!
2652 if(NR != NP){
2653 cuda_buffer_flag_trtri[0] = 0;
2654 }
2655 ttrtri_dev_cuda('L', 'N', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_trtri,
2656 handle, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
2657#else
2658 ttrtri_dev('L', 'N', NR, blockR_dev, mf_block_lda(IB, IB), &info);
2659#endif
2660
2661 flop_count += 1.0/3.0 * NR * NR * NR + 2.0/3.0 * NR;
2662
2663 cudaDeviceSynchronize();
2664
2665 // initialize identity matrix temp1_dev of size NR = ns (now)
2666 // redirect to magma_queue_1 -> potentially later in copy stream, but careful need tmp1_dev before
2667 init_eye_on_dev(tmp1_dev,NR,magma_cudaStream_1);
2668 // temp2_dev = L_Fn^T*Gn+1
2669 tgemm_dev('T', 'N', NR, NP, NP, ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), blockDense_dev, mf_block_lda(IB+1, IB+1), ZERO, tmp2_dev, NR, magma_queue_1);
2670 flop_count += 2.0 * NR * NP * NP;
2671 // temp1_dev = temp2_dev*L_Fn + I
2672 tgemm_dev('N', 'N', NR, NR, NP, ONE, tmp2_dev, NR, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), ONE, tmp1_dev, NR, magma_queue_1);
2673 flop_count += 2.0 * NR * NR * NP;
2674 // temp2_dev = blockR_dev^T*temp1_dev <=> temp2_dev = L_Dn^-T*temp1_dev
2675 tgemm_dev('T', 'N', NR, NR, NR, ONE, blockR_dev, mf_block_lda(IB, IB), tmp1_dev, NR, ZERO, tmp2_dev, NR, magma_queue_1);
2676 flop_count += 2.0 * NR * NR * NR;
2677 // tmp1_dev = temp2_dev*L_Dn^-1 => Gn
2678 tgemm_dev('N', 'N', NR, NR, NR, ONE, tmp2_dev, NR, blockR_dev, mf_block_lda(IB, IB), ZERO, tmp1_dev, NR, magma_queue_1);
2679 flop_count += 2.0 * NR * NR * NR;
2680
2681 if(cpy_indicator == 2){
2682 cudaDeviceSynchronize();
2683 // compute indices: total ns^2*nt+nb^2 -> now at last ns x ns block
2684 ind_invBlks_fi = invblks_diag_block_index(IB);
2685 //memcpy_to_host(&invBlks[ind_invBlks_fi], blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
2686 //memcpy_to_host(invBlks, blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
2687 cudaMemcpy(&invBlks[ind_invBlks_fi], tmp1_dev, matrix_ns*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);
2688#ifdef PRINT_MSG
2689 printf("first index last ns x ns diagonal block: %ld\n", ind_invBlks_fi);
2690 printf("last ns x ns block : \n");
2691 for(int i=0; i<NR*NR; i++){
2692 printf("%f ", invBlks[ind_invBlks_fi+i]);
2693 }
2694 printf("\n");
2695#endif
2696 }
2697
2698 // *************** Gn+1n ************ //
2699 // Sn+1n = -Sn+1n+1*L_Fn*L_Dn+1^-1
2700 // tmp3_dev = blockDense_dev*&blockR_dev[mf_dense_block_offset(IB)]*blockR_dev
2701 // step 1: tmp3 = Sn+1n+1*L_Fn
2702 //printf("NP = %d, NR = %d, mf_dense_block_lda(IB) = %d\n", NP, NR, mf_dense_block_lda(IB));
2703 // dgemm(nothing/transpose A, nothing/transpose B, #rows A, #cols B, #cols A, ...)
2704 tgemm_dev('N', 'N', NP, NR, NP, ONE, blockDense_dev, mf_block_lda(IB+1, IB+1), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), ZERO, tmp3_dev, NP, magma_queue_1);
2705 flop_count += 2.0 * NP * NR * NP;
2706
2707 // step 2: -tmp3_dev*L_Dn^-1 => -tmp3_dev*blockR_dev, dim(tmp3_dev) = (nb, ns), dim(blockR_dev) = (ns, ns)
2708 // nb, ns, ns
2709 //printf("mf_block_lda(IB, IB) = %d\n", mf_block_lda(IB, IB));
2710 tgemm_dev('N', 'N', NP, NR, NR, -ONE, tmp3_dev, NP, blockR_dev, mf_block_lda(IB, IB), ZERO, G_dense_i_dev, NP, magma_queue_1);
2711 flop_count += 2.0 * NP * NR * NR;
2712
2713#ifdef PRINT_MSG
2714 if(matrix_ns*ND < 200){
2715 cudaMemcpy(tmp3, G_dense_i_dev, matrix_ns*ND*sizeof(T), cudaMemcpyDeviceToHost);
2716 printf("Sigma_n+1n : \n");
2717 for(int i=0; i<matrix_ns*ND; i++){
2718 printf("%f ", tmp3[i]);
2719 }
2720 printf("\n");
2721 }
2722#endif
2723
2724 cudaDeviceSynchronize();
2725
2726 // do copying in other stream ...
2727 if(cpy_indicator == 0){
2728 // when we only want the diagonal
2729 copy_supernode_diag(blockDense_dev, NBlock-1);
2730 } else if(cpy_indicator == 1){
2731 // want nnzQ entries in CSC format
2732 extract_nnzA(blockDense_dev, NBlock-1);
2733 } else if(cpy_indicator == 2){
2734 // copy over Gn+1n
2735 ind_invBlks_fi = invblks_dense_block_index(IB);
2736 //memcpy_to_host(&invBlks[ind_invBlks_fi], blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
2737 //memcpy_to_host(invBlks, blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
2738 cudaMemcpy(&invBlks[ind_invBlks_fi], G_dense_i_dev, matrix_nd*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);
2739#ifdef PRINT_MSG
2740 if(matrix_ns*ND < 200){
2741 printf("first index last nd x ns dense block: %ld\n", ind_invBlks_fi);
2742 printf("last nd x ns dense block : \n");
2743 for(int i=0; i<matrix_nd*matrix_ns; i++){
2744 printf("%f ", invBlks[ind_invBlks_fi+i]);
2745 }
2746 printf("\n");
2747 }
2748#endif
2749 }
2750 cudaDeviceSynchronize();
2751 // *************** Gnn ************ //
2752 // copy tmp1_dev to blockR_dev
2753 tlacpy_dev('F', NR, NR, tmp1_dev, NR, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
2754
2755#ifdef PRINT_MSG
2756 if(matrix_ns*matrix_ns < 200){
2757 cudaMemcpy(tmp4, tmp1_dev, matrix_ns*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);
2758 printf("Sigma_nn : \n");
2759 for(int i=0; i<matrix_ns*matrix_ns; i++){
2760 printf("%f ", tmp4[i]);
2761 }
2762 printf("\n");
2763 }
2764#endif
2765
2766 }
2767 else
2768 {
2769 //last non-dense block
2770 IB = NBlock-1;
2771 NR = Bmax[IB]-Bmin[IB];
2772
2773 // copy lower triangular block of blockR_dev to tmp1_dev
2774 tlacpy_dev('L', NR, NR, blockR_dev, mf_block_lda(IB, IB), tmp1_dev, NR, magma_queue_1);
2775 cudaDeviceSynchronize();
2776
2777 // make tmp1_dev lower triangular
2778 tril_dev(tmp1_dev, NR, NR);
2779
2780 // compute L^-1 = inv(tmp1_dev) which is lower triangular
2781#ifdef CUDA_POTRF
2782 // factorization first block -> set buffer flag to zero!
2783 cuda_buffer_flag_trtri[0] = 0;
2784 ttrtri_dev_cuda('L', 'N', NR, tmp1_dev, NR, info_cuda, cuda_buffer_flag_trtri,
2785 handle, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
2786#else
2787 // magma_queue currently not supported
2788 cudaDeviceSynchronize();
2789 ttrtri_dev('L', 'N', NR, tmp1_dev, NR, &info);
2790#endif
2791
2792 flop_count += 1.0/3.0 * NR * NR * NR + 2.0/3.0*NR;
2793
2794 // blockR_dev = L^-T*L^-1
2795 tgemm_dev('T', 'N', NR, NR, NR, ONE, tmp1_dev, NR, tmp1_dev, NR, ZERO, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
2796 flop_count += 2.0 * NR * NR * NR;
2797
2798 if(cpy_indicator == 2){
2799 cudaDeviceSynchronize();
2800 // compute indices: total ns^2*nt+nb^2 -> now at last ns x ns block
2801 ind_invBlks_fi = invblks_diag_block_index(IB);
2802 //memcpy_to_host(&invBlks[ind_invBlks_fi], blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
2803 //memcpy_to_host(invBlks, blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
2804 cudaMemcpy(&invBlks[ind_invBlks_fi], tmp1_dev, matrix_ns*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);
2805#ifdef PRINT_MSG
2806 if(NR*NR < 200){
2807 printf("first index last ns x ns diagonal block case with no fixed effects : %ld\n", ind_invBlks_fi);
2808 printf("last ns x ns block : \n");
2809 for(int i=0; i<NR*NR; i++){
2810 printf("%f ", invBlks[ind_invBlks_fi+i]);
2811 }
2812 printf("\n");
2813 }
2814#endif
2815 }
2816
2817 }
2818 /*
2819 T* tmp_host;
2820 cudaMallocHost(&tmp_host, (NR*NR) * sizeof(T));
2821 tlacpy_dev('F', NR, NR, blockR_dev, mf_block_lda(IB,IB), tmp4_dev, NR, magma_queue_1);
2822 cudaDeviceSynchronize();
2823 gpuErrchk(cudaMemcpy(&tmp_host[0], &tmp4_dev[0], NR*NR*sizeof(double), cudaMemcpyDeviceToHost));
2824 cudaDeviceSynchronize();
2825
2826 std::string fileName = "invBlock_cpyInd" + to_string(cpy_indicator) + "_IB" + to_string(IB) + ".txt";
2827 ofstream file(fileName, ios::out | ::ios::trunc);
2828
2829 for(int i=0; i<NR*NR; i++){
2830 file << std::setprecision(15) << tmp_host[i] << endl;
2831 }
2832
2833 file.close();
2834 cudaFreeHost(tmp_host);
2835 */
2836
2837 // -> already copy next block to device
2838 cudaDeviceSynchronize();
2839 copy_supernode_to_device(blockM_dev, matrix_nt-2, copyStream);
2840
2841
2842 //second-last non-dense block .. 0-block
2843 for (int IBi = matrix_nt-2; IBi > -1; IBi--)
2844 {
2845 IB = IBi;
2846 NR = Bmax[IB]-Bmin[IB];
2847 NP = Bmax[IB+1]-Bmin[IB+1];
2848
2849 // don't swap pointers before copy_supernode_to_device() is done
2850 cudaDeviceSynchronize();
2851 // before swap: blockR_dev: Sigma_IBi+1, blockM_dev: CholeskyFactor IBi, then swap
2852 swap_pointers(&blockR_dev, &blockM_dev);
2853 cudaDeviceSynchronize();
2854
2855 // copy diagonal from previous iteration into buffer on device
2856 // redirect to copy stream
2857 if(cpy_indicator == 0){
2858 copy_supernode_diag(blockM_dev, IB+1);
2859 } else if(cpy_indicator == 1){
2860 //copy_supernode_diag(blockM_dev, IB+1);
2861 // write G_dense_i_dev into "correct spot in blockM_dev"
2862 if(matrix_nd > 0){
2863 //printf("ND = %ld\n", ND);
2864 tlacpy_dev('F', ND, NR, G_dense_i_dev, ND, &blockM_dev[mf_dense_block_offset(IB+1)], mf_dense_block_lda(IB+1), magma_queue_1);
2865 }
2866 cudaDeviceSynchronize();
2867 extract_nnzA(blockM_dev, IB+1);
2868 //exit(1);
2869 }
2870
2871 /*
2872 cudaDeviceSynchronize();
2873 copy_supernode_to_device(blockR_dev, IB);
2874 cudaDeviceSynchronize();
2875 */
2876
2877 // tril(blockR_dev)
2878 tril_dev(blockR_dev, mf_block_lda(IB, IB), NR);
2879 // if L = blockR_dev, then compute L^-1
2880 nvtxRangeId_t id_trtri = nvtxRangeStartA("TriangularSolve_inLoop");
2881#ifdef CUDA_POTRF
2882 // factorization first block -> set buffer flag to zero!
2883 if(NR != NP){
2884 cuda_buffer_flag_trtri[0] = 0;
2885 }
2886 ttrtri_dev_cuda('L', 'N', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_trtri,
2887 handle, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
2888#else
2889 ttrtri_dev('L', 'N', NR, blockR_dev, mf_block_lda(IB, IB), &info);
2890#endif
2891 nvtxRangeEnd(id_trtri);
2892 flop_count += 1.0/3.0 * NR * NR * NR + 2.0/3.0 * NR;
2893
2894 // initialize eye(NR, NR)
2895 init_eye_on_dev(tmp1_dev,NR,magma_cudaStream_1);
2896 //cudaDeviceSynchronize();
2897
2898 // tmp2_dev = blockR_dev^T*blockM_dev => L_Ei^T*Gi+1
2899 tgemm_dev('T', 'N', NR, NP, NP, ONE, &blockR_dev[NR], mf_block_lda(IB+1, IB), blockM_dev, mf_block_lda(IB+1, IB+1), ZERO, tmp2_dev, NR, magma_queue_1);
2900 flop_count += 2.0 * NR * NP * NP;
2901
2902 // temp1_dev = tmp2_dev*blockR_dev + I => tmp2_dev*L_Ei + I
2903 tgemm_dev('N', 'N', NR, NR, NP, ONE, tmp2_dev, NR, &blockR_dev[NR], mf_block_lda(IB+1, IB), ONE, tmp1_dev, NR, magma_queue_1);
2904 flop_count += 2.0 * NR * NR * NP;
2905 //cudaDeviceSynchronize(); //
2906
2907 // blockM_dev not in use anymore until next iter where pointers are swapped
2908 // -> already copy next block to device
2909 // use cuda Events
2910 if(IB > 0){
2911 //cudaDeviceSynchronize();
2912 gpuErrchk(cudaEventRecord(potrf_dev_ev, magma_cudaStream_1));
2913 gpuErrchk(cudaStreamWaitEvent(copyStream, potrf_dev_ev, 0));
2914
2915 nvtxRangeId_t id_initSN = nvtxRangeStartA("CpySNtoDevice");
2916 copy_supernode_to_device(blockM_dev, IB-1, copyStream);
2917 nvtxRangeEnd(id_initSN);
2918 //cudaDeviceSynchronize();
2919 }
2920
2921 if (matrix_nd > 0)
2922 {
2923 nvtxRangeId_t id_denseGEMMS = nvtxRangeStartA("GEMMsDenseRows");
2924
2925 nvtxRangeId_t id_firstDenseGEMM = nvtxRangeStartA("FirstDenseGEMM");
2926 // tmp3_dev = L_Fi^T*Gn+1n+1 //
2927 tgemm_dev('T', 'N', NR, ND, ND, ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), blockDense_dev, mf_block_lda(NBlock-1, NBlock-1), ZERO, tmp3_dev, NR, magma_queue_1);
2928 flop_count += 2.0 * NR * ND * ND;
2929
2930 nvtxRangeEnd(id_firstDenseGEMM);
2931
2932 // tmp1_dev = tmp3_dev*L_Fi + tmp1_dev
2933 tgemm_dev('N', 'N', NR, NR, ND, ONE, tmp3_dev, NR, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), ONE, tmp1_dev, NR, magma_queue_1);
2934 flop_count += 2.0 * NR * NR * ND;
2935
2936 // store Gn+1i+1 in G_dense_i_dev -> overwrite in every iteration
2937 // total: Gii = L_Di^-T*(L_Ei^T*Gi+1i+1*L_Ei + L_Fi^T*Gn+1n+1*L_Fi + L_Fi^T*Gn+1i+1*L_Ei + L_Ei^T*Gi+1n+1*L_Fi)L_Di^-1
2938 // need: L_Fi^T*Gi+1n+1*L_Ei (& its transpose) i.e. L_Ei^T*Gn+1i+1*L_Fi
2939 // tmp4_dev = L_Fi^T*Gn+1i+1*L_Ei
2940 // step 1: tmp4_dev = L_Fi^T*Gn+1i+1, dim(tmp2_dev) = (ns,ns)
2941 tgemm_dev('T', 'N', NP, NR, ND, ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), G_dense_i_dev, ND, ZERO, tmp4_dev, NR, magma_queue_1);
2942 flop_count += 2.0 * NP * NR * ND;
2943
2944#ifdef PRINT_MSG
2945 if(NR*NR < 200){
2946 cudaMemcpy(tmp4, tmp4_dev,NR*NR*sizeof(T), cudaMemcpyDeviceToHost);
2947 printf("L_Fi^T*Gn+1i+1: \n");
2948 for(int i=0; i<NR*NR; i++){
2949 printf("%f ", tmp4[i]);
2950 }
2951 printf("\n");
2952 }
2953#endif
2954
2955 // tmp2_dev = tmp4_dev*L_Ei
2956 tgemm_dev('N', 'N', NR, NP, NR, ONE, tmp4_dev, NR, &blockR_dev[NR], mf_block_lda(IB+1, IB), ZERO, tmp2_dev, NR, magma_queue_1);
2957 flop_count += 2.0 * NR * NP * NR;
2958
2959#ifdef PRINT_MSG
2960 if(NR*NR < 200){
2961 cudaMemcpy(tmp4, tmp4_dev,NR*NR*sizeof(T), cudaMemcpyDeviceToHost);
2962 printf("L_Fi^T*Gn+1i+1*L_Ei:\n");
2963 for(int i=0; i<NR*NR; i++){
2964 printf("%f ", tmp4[i]);
2965 }
2966 printf("\n");
2967 }
2968#endif
2969
2970 // copy tmp2 into temp4
2971 tlacpy_dev('F', NR, NR, tmp2_dev, NR, tmp4_dev, NR, magma_queue_1);
2972 // -> then tmp4 = tmp2' + tmp4
2973 tgemm_dev('T', 'N', NR, NR, NR, ONE, tmp2_dev, NR, eye_dev, NR, ONE, tmp4_dev, NR, magma_queue_1);
2974 flop_count += 2.0 * NR * NR * NR;
2975
2976#ifdef PRINT_MSG
2977 if(NR*NR < 200){
2978 cudaMemcpy(tmp4, tmp4_dev,NR*NR*sizeof(T), cudaMemcpyDeviceToHost);
2979 printf("L_Fi^T*Gn+1i+1*L_Ei + t(L_Fi^T*Gn+1i+1*L_Ei): \n");
2980 for(int i=0; i<NR*NR; i++){
2981 printf("%f ", tmp4[i]);
2982 }
2983 printf("\n");
2984 }
2985#endif
2986
2987 // tmp1_dev = tmp4_dev + tmp1_dev (rest happens outside this if statement)
2988 tgemm_dev('N', 'N', NR, NR, NR, ONE, tmp4_dev, NR, eye_dev, NR, ONE, tmp1_dev, NR, magma_queue_1);
2989 flop_count += 2.0 * NR * NR * NR;
2990
2991#ifdef PRINT_MSG
2992 if(NR*NR < 200){
2993 cudaMemcpy(tmp4, tmp1_dev,NR*NR*sizeof(T), cudaMemcpyDeviceToHost);
2994 printf("tmp1_dev: \n");
2995 for(int i=0; i<NR*NR; i++){
2996 printf("%f ", tmp4[i]);
2997 }
2998 printf("\n");
2999 }
3000#endif
3001 // compute inv Dense rows
3002 // Gn+1i = - (Gn+1i+1*L_Ei + Gn+1n+1*L_Fi)*L_Di^-1 , dim(Gn+1i+1*L_Ei) = dim(Gn+1i+1) = (nb, ns)
3003 // tmp3_dev = G_dense_i_dev*(&blockR_dev[NR], mf_block_lda(IB+1, IB))
3004 tgemm_dev('N', 'N', ND, NP, NR, ONE, G_dense_i_dev, ND, &blockR_dev[NR], mf_block_lda(IB+1, IB), ZERO, tmp3_dev, ND, magma_queue_1);
3005 flop_count += 2.0 * ND * NP * NR;
3006
3007#ifdef PRINT_MSG
3008 if(NR*ND < 200){
3009 cudaMemcpy(tmp3, tmp3_dev,NR*ND*sizeof(T), cudaMemcpyDeviceToHost);
3010 printf("tmp3_dev: \n");
3011 for(int i=0; i<ND*NR; i++){
3012 printf("%f ", tmp3[i]);
3013 }
3014 printf("\n");
3015 }
3016#endif
3017
3018 // compute inv Dense rows
3019 // tmp3_dev = Gn+1n+1*L_Fi + tmp3_dev
3020 tgemm_dev('N', 'N', ND, NR, ND, ONE, blockDense_dev, ND, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), ONE, tmp3_dev, ND, magma_queue_1);
3021 flop_count += 2.0 * ND * NR * ND;
3022
3023#ifdef PRINT_MSG
3024 if(NR*ND < 200){
3025 cudaMemcpy(tmp3, tmp3_dev,NR*ND*sizeof(T), cudaMemcpyDeviceToHost);
3026 printf("tmp3_dev: \n");
3027 for(int i=0; i<ND*NR; i++){
3028 printf("%f ", tmp3[i]);
3029 }
3030 printf("\n");
3031 }
3032#endif
3033 // compute inv Dense rows
3034 // Gn+1i = -tmp3_dev*L_Di^-1 => G_dense_i_dev = -tmp3_dev*blockR_dev
3035 tgemm_dev('N', 'N', ND, NR, NR, -ONE, tmp3_dev, ND, blockR_dev, mf_block_lda(IB, IB), ZERO, G_dense_i_dev, ND, magma_queue_1);
3036 flop_count += 2.0 * ND * NR * NR;
3037
3038#ifdef PRINT_MSG
3039 if(NR*ND < 200){
3040 cudaMemcpy(tmp3, G_dense_i_dev,NR*ND*sizeof(T), cudaMemcpyDeviceToHost);
 3041 printf("G_dense_i_dev (Gn+1i): \n");
3042 for(int i=0; i<ND*NR; i++){
3043 printf("%f ", tmp3[i]);
3044 }
3045 printf("\n");
3046 }
3047#endif
3048
3049 if(cpy_indicator == 2){
3050 // copy over Gn+1i
3051 ind_invBlks_fi = invblks_dense_block_index(IB);
3052 //memcpy_to_host(&invBlks[ind_invBlks_fi], blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
3053 //memcpy_to_host(invBlks, blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
3054 cudaMemcpy(&invBlks[ind_invBlks_fi], G_dense_i_dev, matrix_nd*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);
3055#ifdef PRINT_MSG
3056 if(matrix_nd*matrix_ns < 200){
 3057 printf("first index of i-th ns x nd dense block: %ld\n", ind_invBlks_fi);
3058 printf(" i-th ns x nd dense block : \n");
3059 for(int i=0; i<matrix_nd*matrix_ns; i++){
3060 printf("%f ", invBlks[ind_invBlks_fi+i]);
3061 }
3062 printf("\n");
3063 }
3064 #endif
3065 } // end cpy invBlks
3066
3067 nvtxRangeEnd(id_denseGEMMS); // goes through entire if statement
3068 } // end if matrix_nd > 0
3069
3070 // tmp2_dev = blockR_dev^T*tmp1_dev => tmp2_dev = L^-T*tmp1_dev
3071 tgemm_dev('T', 'N', NR, NR, NR, ONE, blockR_dev, mf_block_lda(IB, IB), tmp1_dev, NR, ZERO, tmp2_dev, NR, magma_queue_1);
3072 flop_count += 2.0 * NR * NR * NR;
3073
3074 // tmp1_dev = tmp2_dev*L^-1
3075 tgemm_dev('N', 'N', NR, NR, NR, ONE, tmp2_dev, NR, blockR_dev, mf_block_lda(IB, IB), ZERO, tmp1_dev, NR, magma_queue_1);
3076 flop_count += 2.0 * NR * NR * NR;
3077
3078#ifdef PRINT_MSG
3079 if(NR*NR < 200){
3080 cudaMemcpy(tmp4, tmp1_dev, NR*NR*sizeof(T), cudaMemcpyDeviceToHost);
3081 printf("Gii: \n");
3082 for(int i=0; i<NR*NR; i++){
3083 printf("%f ", tmp4[i]);
3084 }
3085 printf("\n");
3086 }
3087#endif
3088
3089 if(cpy_indicator == 2){
3090 cudaDeviceSynchronize();
3091 // compute indices: total ns^2*nt+nb^2 -> now at last ns x ns block
3092 ind_invBlks_fi = invblks_diag_block_index(IB);
3093 //memcpy_to_host(&invBlks[ind_invBlks_fi], blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
3094 //memcpy_to_host(invBlks, blockDense_dev, matrix_nd*matrix_nd*sizeof(T));
3095 cudaMemcpy(&invBlks[ind_invBlks_fi], tmp1_dev, matrix_ns*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);
3096#ifdef PRINT_MSG
3097 if(NR*NR < 200){
 3098 printf("first index of i-th ns x ns diagonal block: %ld\n", ind_invBlks_fi);
3099 printf("i-th ns x ns block : \n");
3100 for(int i=0; i<NR*NR; i++){
3101 printf("%f ", invBlks[ind_invBlks_fi+i]);
3102 }
3103 printf("\n");
3104 }
3105#endif
3106 }
3107
3108 // copy tmp1_dev to blockR_dev
3109 tlacpy_dev('F', NR, NR, tmp1_dev, NR, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
3110
3111 } // end loop over nt
3112
3113#ifdef PRINT_MSG
3114 if(cpy_indicator == 2){
 3115 printf("array containing all necessary inv blk entries: \n");
3116 for(int i=0; i<(matrix_ns+matrix_nd)*matrix_ns*matrix_nt+matrix_nd*matrix_nd; i++){
3117 printf("%f ", invBlks[i]);
3118 }
3119 printf("\n");
3120 }
3121#endif
3122
3123 t_inv = get_time(t_inv);
3124 double gflops = flop_count / (1e9*t_inv);
3125 printf("GFLOPS sel. inversion : %f, time selInv : %f\n", gflops, t_inv);
3126
3127 nvtxRangeEnd(id_inv);
3128
3129 cudaDeviceSynchronize();
3130
3131 // copy diagonal of first block into buffer
3132 if(cpy_indicator == 0){
3133 copy_supernode_diag(blockR_dev, 0);
3134 }
3135
3136 if(cpy_indicator == 1){
3137 if(ND > 0){
3138 tlacpy_dev('F', ND, NR, G_dense_i_dev, ND, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_1);
3139 }
3140 cudaDeviceSynchronize();
3141 extract_nnzA(blockR_dev, 0);
3142 }
3143
3144 // free memory of extra buffers
3145 deallocate_data_on_dev(G_LastDense_dev, ND*ND*sizeof(T));
3146 deallocate_data_on_dev(eye_dev ,matrix_ns*matrix_ns*sizeof(T));
3147 deallocate_data_on_dev(G_dense_i_dev ,ND*matrix_ns*sizeof(T));
3148
3149 deallocate_data_on_dev(tmp3_dev, ND*matrix_ns*sizeof(T));
3150 mem_alloc_dev = deallocate_data_on_dev(tmp4_dev, matrix_ns*matrix_ns*sizeof(T));
3151
3152#ifdef PRINT_MSG
3153 printf("Allocated GPU memory after sel Inv: %ld, cpyInd %d\n", mem_alloc_dev, cpy_indicator);
3154
3155 cudaFreeHost(tmp3);
3156 cudaFreeHost(tmp4);
3157#endif
3158
3159 return gflops;
3160} // end thirdstageFactor
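For orientation, the loop above implements the standard block selected-inversion (Takahashi) update for a block-tridiagonal matrix with dense arrowhead rows. In the notation of the code comments (D_i = L_ii the diagonal Cholesky block, E_i = L_{i+1,i} the sub-diagonal block, F_i = L_{n+1,i} the dense arrowhead block, G the inverse), and assuming the E_i^T G_{i+1,i+1} E_i and F_i^T G_{n+1,n+1} F_i contributions are accumulated into tmp1_dev earlier in the loop body, a sketch of the update is:

    G_{n+1,i} = -\left( G_{n+1,i+1} E_i + G_{n+1,n+1} F_i \right) D_i^{-1}

    G_{i,i} = D_i^{-T} \left( I + E_i^{T} G_{i+1,i+1} E_i + F_i^{T} G_{n+1,n+1} F_i
              + F_i^{T} G_{n+1,i+1} E_i + \left( F_i^{T} G_{n+1,i+1} E_i \right)^{T} \right) D_i^{-1}

The symmetrized cross term corresponds to the tmp4 + tmp4^T step above, and the dense-row update matches the comment "Gn+1i = - (Gn+1i+1*L_Ei + Gn+1n+1*L_Fi)*L_Di^-1".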
3161
3162template <class T>
3163void BTA<T>::initialize_MF_host(){
3164
3165#ifdef PRINT_MSG
3166 std::cout << "in initialize MF host. MF allocated : " << MF_allocated << std::endl;
3167#endif
3168
3169 // cudaMallocHost takes a long time -> only worth it if we reuse it!
3170 if(!MF_allocated)
3171 {
3172 // memory allocation on CPU
3173 // PREVIOUSLY unpinned memory:
3174 //MF = new T[matrix_n_nonzeros];
3175 //unsigned int dummy_flag = 0;
3176 //cudaHostRegister(MF, (matrix_n_nonzeros) * sizeof(T), dummy_flag);
3177 // pin memory:
3178 cudaMallocHost(&MF, (matrix_n_nonzeros) * sizeof(T));
3179 MF_allocated = true;
3180 //printf("In MF_allocated not allocated.\n");
3181 }
3182
3183}
3184
3185
3186template <class T>
3187void BTA<T>::initialize_invBlks_host(){
3188
3189#ifdef PRINT_MSG
3190 std::cout << "in initialize invBlks host" << std::endl;
3191#endif
3192
3193 // cudaMallocHost takes a long time -> only worth it if we reuse it!
3194 if(!invBlks_allocated)
3195 {
3196 double t_invBlk_alloc = get_time(0.0);
3197 // allocate space for Diagonal blocks and dense rows below
3198 size_t nnz_invBlks = (matrix_ns+matrix_nd)*matrix_ns*matrix_nt + matrix_nd*matrix_nd;
3199 cudaMallocHost(&invBlks, nnz_invBlks * sizeof(T));
3200 t_invBlk_alloc = get_time(t_invBlk_alloc);
3201
3202 invBlks_allocated = true;
3203
3204//#ifdef PRINT_MSG
3205 std::cout << "in initialize invBlks host. nnz(invBlks) : " << nnz_invBlks << ", Allocation time : " << t_invBlk_alloc << std::endl;
3206//#endif
3207 }
3208
3209}
3210
3211/************************************************************************************************/
3212
3213template <class T>
3214double BTA<T>::factorize(size_t *ia, size_t *ja, T *a, double& t_firstStageFactor)
3215{
3216
3217#ifdef PRINT_MSG
3218 std::cout << "entered factorize." << std::endl;
3219#endif
3220
3221 int GPU_currRank;
3222 cudaGetDevice(&GPU_currRank);
3223
3224 if(GPU_currRank != GPU_rank){
3225 //printf("In factorize. Current GPU rank: %d. set GPU rank to: %d\n", GPU_currRank, GPU_rank);
3226 cudaSetDevice(GPU_rank);
3227 cudaGetDevice(&GPU_currRank);
3228 //printf("updated GPU rank is: %d.\n", GPU_currRank);
3229 }
3230
3231 //int numaNodeID = topo_get_numNode(GPU_rank);
3232
3233 matrix_ia = ia;
3234 matrix_ja = ja;
3235 matrix_a = a;
3236
3237 double t_allocate_MF_host = get_time(0.0);
3238
3239 initialize_MF_host();
3240
3241#ifdef PRINT_MSG
3242 std::cout << "in factorize. MF_allocated : " << MF_allocated << std::endl;
3243#endif
3244
3245#if 0
3246 // cudaMallocHost takes a long time -> only worth it if we reuse it!
3247 if(!MF_allocated)
3248 {
3249 // memory allocation on CPU
3250 // PREVIOUSLY unpinned memory:
3251 //MF = new T[matrix_n_nonzeros];
3252 //unsigned int dummy_flag = 0;
3253 //cudaHostRegister(MF, (matrix_n_nonzeros) * sizeof(T), dummy_flag);
3254 // pin memory:
3255 cudaMallocHost(&MF, (matrix_n_nonzeros) * sizeof(T));
3256 MF_allocated = true;
3257 //printf("In MF_allocated not allocated.\n");
3258 }
3259#endif
3260
3261 t_allocate_MF_host = get_time(t_allocate_MF_host);
3262 //std::cout << "time to allocate memory on CPU: " << t_allocate_MF_host << std::endl;
3263
3264 double t_allocate_MF_dev = get_time(0.0);
3265
3266 //Data allocation on GPU -> negligible time
3267 if (!MF_dev_allocated)
3268 {
3269 //printf("In MF_dev_allocated not allocated.\n");
3270 //MF = new T[matrix_n_nonzeros];
3271 size_t max_supernode_nnz_dense = matrix_nt > 1 ? matrix_ns*(2*matrix_ns+matrix_nd) : matrix_ns*(matrix_ns+matrix_nd);
3272 size_t final_supernode_nnz_dense = matrix_nd > 0 ? matrix_nd*matrix_nd : 0;
3273 allocate_data_on_device((void**)&blockR_dev,max_supernode_nnz_dense*sizeof(T));
3274 allocate_data_on_device((void**)&blockM_dev,max_supernode_nnz_dense*sizeof(T));
3275 allocate_data_on_device((void**)&blockDense_dev,final_supernode_nnz_dense*sizeof(T));
3276 MF_dev_allocated = true;
3277 }
3278
3279 t_allocate_MF_dev = get_time(t_allocate_MF_dev);
3280 //std::cout << "time to allocate memory on GPU: " << t_allocate_MF_dev << std::endl;
3281
3282
3283 size_t nnz = matrix_ia[matrix_size];
3284 //std::cout << "nnz = " << nnz << std::endl;
3285 size_t max_rows = matrix_nt > 1 ? 2*matrix_ns+matrix_nd : matrix_ns+matrix_nd;
3286 size_t max_cols = max(matrix_ns, matrix_nd);
3287
3288 //Temp data allocation
3289 allocate_data_on_device((void**)&ia_dev,(max_cols+1)*sizeof(size_t));
3290 allocate_data_on_device((void**)&ja_dev,max_rows*max_cols*sizeof(size_t));
3291 mem_alloc_dev = allocate_data_on_device((void**)&a_dev,max_rows*max_cols*sizeof(T));
3292
3293#ifdef PRINT_MSG
3294 printf("Allocated GPU memory Factorize: %ld\n", mem_alloc_dev);
3295#endif
3296
3297 //Computation
3298 t_firstStageFactor = get_time(0.0);
3299 double gflops = FirstStageFactor();
3300 t_firstStageFactor = get_time(t_firstStageFactor);
3301 //printf("time firstStageFactor : %f\n", t_firstStageFactor);
3302
3303 //std::string fileName = "L.txt";
3304 /*time_t rawtime;
3305 struct tm * timeinfo;
3306 char buffer[80];
3307
3308 time (&rawtime);
3309 timeinfo = localtime(&rawtime);
3310
3311 strftime(buffer,sizeof(buffer),"L_%d-%m-%Y_%H:%M:%S.txt",timeinfo);
3312 std::string fileName(buffer);
3313 ofstream file(fileName, ios::out | ::ios::trunc);
3314
3315 for(int i=0; i<matrix_n_nonzeros; i++){
3316 file << std::setprecision(15) << MF[i] << endl;
3317 }
3318
3319 file.close();*/
3320
3321 //Temp data deallocation
3322 deallocate_data_on_dev(ia_dev,(max_cols+1)*sizeof(size_t));
3323 deallocate_data_on_dev(ja_dev,max_rows*max_cols*sizeof(size_t));
3324 mem_alloc_dev = deallocate_data_on_dev(a_dev,max_rows*max_cols*sizeof(T));
3325
3326#ifdef PRINT_MSG
3327 printf("Allocated GPU memory after Factorize: %ld\n", mem_alloc_dev);
3328#endif
3329
3330 factorization_completed = true;
3331
3332 return gflops;
3333}
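As a usage illustration (not part of the header itself), the factorization path above can be driven as in the following sketch; the sizes, the CSC arrays and the assumption that only the lower triangle of the symmetric matrix is stored are all illustrative:

    #include <vector>

    void example_factorize()
    {
        // illustrative dimensions: ns spatial, nt temporal, nd fixed-effect columns
        size_t ns = 42, nt = 50, nd = 4;
        size_t n  = ns*nt + nd;

        std::vector<size_t> ia(n+1), ja;   // CSC column pointers / row indices (lower triangle)
        std::vector<double> a;             // matching nonzero values
        // ... fill ia, ja, a with the matrix ...

        BTA<double> solver(ns, nt, nd, /*GPU_rank=*/0);
        double t_chol = 0.0;
        double gflops = solver.factorize(ia.data(), ja.data(), a.data(), t_chol);
        (void)gflops;
    }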
3334
3335
3336/************************************************************************************************/
3337
3338template <class T>
3339double BTA<T>::factorize_noCopyHost(size_t* ia, size_t* ja, T* a, T &logDet)
3340{
3341
3342#ifdef PRINT_MSG
3343 printf("In factorize no copy host function.\n");
3344#endif
3345
3346 matrix_ia = ia;
3347 matrix_ja = ja;
3348 matrix_a = a;
3349
3350 int GPU_currRank;
3351 cudaGetDevice(&GPU_currRank);
3352
3353 if(GPU_currRank != GPU_rank){
3354 //printf("in factorize_noCopyHost. Current GPU rank: %d. set GPU rank to: %d\n", GPU_currRank, GPU_rank);
3355 cudaSetDevice(GPU_rank);
3356 cudaGetDevice(&GPU_currRank);
3357 //printf("updated GPU rank is: %d.\n", GPU_currRank);
3358 }
3359
3360 // Data allocation device
3361 if (!MF_dev_allocated)
3362 {
3363 // don't need to initialize MF
3364 //MF = new T[matrix_n_nonzeros];
3365 size_t max_supernode_nnz_dense = matrix_nt > 1 ? matrix_ns*(2*matrix_ns+matrix_nd) : matrix_ns*(matrix_ns+matrix_nd);
3366 size_t final_supernode_nnz_dense = matrix_nd > 0 ? matrix_nd*matrix_nd : 0;
3367 allocate_data_on_device((void**)&blockR_dev,max_supernode_nnz_dense*sizeof(T));
3368 allocate_data_on_device((void**)&blockM_dev,max_supernode_nnz_dense*sizeof(T));
3369 allocate_data_on_device((void**)&blockDense_dev,final_supernode_nnz_dense*sizeof(T));
3370 MF_dev_allocated = true; // otherwise stuff never gets deleted ...
3371 }
3372
3373 size_t nnz = matrix_ia[matrix_size];
3374 size_t max_rows = matrix_nt > 1 ? 2*matrix_ns+matrix_nd : matrix_ns+matrix_nd;
3375 size_t max_cols = max(matrix_ns, matrix_nd);
3376
3377 //Temp data allocation
3378 allocate_data_on_device((void**)&ia_dev,(max_cols+1)*sizeof(size_t));
3379 allocate_data_on_device((void**)&ja_dev,max_rows*max_cols*sizeof(size_t));
3380 allocate_data_on_device((void**)&a_dev,max_rows*max_cols*sizeof(T));
3381
3383 // allocate arrays to store diagonal elements
3384 allocate_data_on_device((void**)&diag_dev,matrix_size*sizeof(T));
3385 mem_alloc_dev = allocate_data_on_device((void**)&diag_pos_dev,matrix_size*sizeof(size_t));
3386
3387#ifdef PRINT_MSG
3388 printf("Allocated GPU memory Factorize noCpyHost: %ld\n", mem_alloc_dev);
3389#endif
3390
3391 //Copy diag_pos to device
3392 memcpy_to_device(diag_pos,diag_pos_dev,matrix_size*sizeof(size_t), NULL );
3393
3394 //Computation
3395 double gflops = FirstStageFactor_noCopyHost(logDet);
3396 //double gflops = FirstStageFactor_noCopyHost_testV(logDet);
3397
3399 //Data deallocation
3400 deallocate_data_on_dev(diag_dev,matrix_size*sizeof(T));
3401 deallocate_data_on_dev(diag_pos_dev,matrix_size*sizeof(size_t));
3402
3403 //Temp data deallocation
3404 deallocate_data_on_dev(ia_dev,(max_cols+1)*sizeof(size_t));
3405 deallocate_data_on_dev(ja_dev,max_rows*max_cols*sizeof(size_t));
3406 mem_alloc_dev = deallocate_data_on_dev(a_dev,max_rows*max_cols*sizeof(T));
3407
3408#ifdef PRINT_MSG
3409 printf("Allocated GPU memory after Factorize noCpyHost: %ld\n", mem_alloc_dev);
3410#endif
3411
3412 // factor doesn't exist, hence mark as false
3413 factorization_completed = false;
3414
3415 return gflops;
3416}
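When only the log-determinant is needed (for example inside an outer optimization loop), the no-copy variant above avoids transferring the factor back to the host. A minimal sketch, with the solver and the CSC arrays assumed to be set up as in the previous example:

    double logdet_only(BTA<double>& solver, size_t* ia, size_t* ja, double* a)
    {
        double logDetQ = 0.0;
        solver.factorize_noCopyHost(ia, ja, a, logDetQ);
        // the factor is not kept on the host, so factorization_completed stays false
        // and a later solve()/BTAdiag() call will trigger a fresh factorize().
        return logDetQ;
    }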
3417
3418/************************************************************************************************/
3419
3420template <class T>
3421double BTA<T>::factorizeSolve(size_t *ia, size_t *ja, T *a, T *x, T *b, size_t nrhs, double& t_firstSecondStage, double& t_SecondStageBackPass)
3422{
3423
3424#ifdef PRINT_MSG
3425 std::cout << "entered factorize." << std::endl;
3426#endif
3427
3428 int GPU_currRank;
3429 cudaGetDevice(&GPU_currRank);
3430
3431 if(GPU_currRank != GPU_rank){
3432 //printf("In factorize. Current GPU rank: %d. set GPU rank to: %d\n", GPU_currRank, GPU_rank);
3433 cudaSetDevice(GPU_rank);
3434 cudaGetDevice(&GPU_currRank);
3435 //printf("updated GPU rank is: %d.\n", GPU_currRank);
3436 }
3437
3438 //int numaNodeID = topo_get_numNode(GPU_rank);
3439
3440 matrix_ia = ia;
3441 matrix_ja = ja;
3442 matrix_a = a;
3443
3444 double t_allocate_MF_host = get_time(0.0);
3445
3446 initialize_MF_host();
3447
3448#ifdef PRINT_MSG
3449 std::cout << "in factorize. MF_allocated : " << MF_allocated << std::endl;
3450#endif
3451
3452 t_allocate_MF_host = get_time(t_allocate_MF_host);
3453 //std::cout << "time to allocate memory on CPU: " << t_allocate_MF_host << std::endl;
3454
3455 double t_allocate_MF_dev = get_time(0.0);
3456
3457 //Data allocation on GPU -> negligible time
3458 if (!MF_dev_allocated){
3459 //printf("In MF_dev_allocated not allocated.\n");
3460 //MF = new T[matrix_n_nonzeros];
3461 size_t max_supernode_nnz_dense = matrix_nt > 1 ? matrix_ns*(2*matrix_ns+matrix_nd) : matrix_ns*(matrix_ns+matrix_nd);
3462 size_t final_supernode_nnz_dense = matrix_nd > 0 ? matrix_nd*matrix_nd : 0;
3463 allocate_data_on_device((void**)&blockR_dev,max_supernode_nnz_dense*sizeof(T));
3464 allocate_data_on_device((void**)&blockM_dev,max_supernode_nnz_dense*sizeof(T));
3465 allocate_data_on_device((void**)&blockDense_dev,final_supernode_nnz_dense*sizeof(T));
3466 MF_dev_allocated = true;
3467 }
3468
3469 t_allocate_MF_dev = get_time(t_allocate_MF_dev);
3470 //std::cout << "time to allocate memory on GPU: " << t_allocate_MF_dev << std::endl;
3471
3472 size_t nnz = matrix_ia[matrix_size];
3473 //std::cout << "nnz = " << nnz << std::endl;
3474 size_t max_rows = matrix_nt > 1 ? 2*matrix_ns+matrix_nd : matrix_ns+matrix_nd;
3475 size_t max_cols = max(matrix_ns, matrix_nd);
3476
3477 //Temp data allocation
3478 allocate_data_on_device((void**)&ia_dev,(max_cols+1)*sizeof(size_t));
3479 allocate_data_on_device((void**)&ja_dev,max_rows*max_cols*sizeof(size_t));
3480 mem_alloc_dev = allocate_data_on_device((void**)&a_dev,max_rows*max_cols*sizeof(T));
3481
3482 // Data allocation for right-hand side
3483 rhs = new T[matrix_size*nrhs];
3484 //Copy data
3485 memcpy(rhs,b,matrix_size*nrhs*sizeof(T));
3486
3487 allocate_data_on_device((void**)&rhs_dev,matrix_size*sizeof(T));
3488 // TODO: stream later
3489 memcpy_to_device(b, rhs_dev,matrix_size*sizeof(T), NULL);
3490
3491#ifdef PRINT_MSG
3492 printf("Allocated GPU memory FactorizeSolve(): %ld\n", mem_alloc_dev);
3493#endif
3494
3495 // ************************************************************* //
3496 //Computation Factorization + Forward Solve
3497 t_firstSecondStage = get_time(0.0);
3498
3499 double gflops = FirstSecondStageFactor(nrhs);
3500
3501 t_firstSecondStage = get_time(t_firstSecondStage);
3502 printf("time firstSecondStageFactor : %f\n", t_firstSecondStage);
3503 // ************************************************************* //
3504
3505 //std::string fileName = "L.txt";
3506 time_t rawtime;
3507 struct tm * timeinfo;
3508 char buffer[80];
3509
3510 time (&rawtime);
3511 timeinfo = localtime(&rawtime);
3512
3513 strftime(buffer,sizeof(buffer),"L_%d-%m-%Y_%H:%M:%S.txt",timeinfo);
3514 std::string fileName(buffer);
3515 ofstream file(fileName, ios::out | ::ios::trunc);
3516
3517 for(int i=0; i<matrix_n_nonzeros; i++){
3518 file << std::setprecision(15) << MF[i] << endl;
3519 }
3520
3521 file.close();
3522
3523 // contains solution vector from forward pass
3524 memcpy_to_host(rhs, rhs_dev,matrix_size*sizeof(T), NULL);
3525
3526 /*printf("y = ");
3527 for(int i = 0; i<min((int) matrix_size, 20); i++){
3528 printf(" %f ", rhs[i]);
3529 }
3530 printf("\n");*/
3531
3532 // ************************************************************* //
3533 // Backward Solve on CPU
3534 // Second stage factor backward solve only ()
3535 // ************************************************************* //
3536 t_SecondStageBackPass = get_time(0.0);
3537
3538 gflops += BackwardPassSolve(nrhs);
3539
3540 t_SecondStageBackPass = get_time(t_SecondStageBackPass);
3541 printf("time backwardPassSolve : %f\n", t_SecondStageBackPass);
3542 // ***************************************************** //
3543 //Copy data
3544 memcpy(x,rhs,matrix_size*nrhs*sizeof(T));
3545
3546 //Data deallocation
3547 delete[] rhs;
3548
3549 //Temp data deallocation Factorization
3550 deallocate_data_on_dev(ia_dev,(max_cols+1)*sizeof(size_t));
3551 deallocate_data_on_dev(ja_dev,max_rows*max_cols*sizeof(size_t));
3552 mem_alloc_dev = deallocate_data_on_dev(a_dev,max_rows*max_cols*sizeof(T));
3553
3554 // deallocate rhs device buffer
3555 deallocate_data_on_dev(rhs_dev, matrix_size*sizeof(T));
3556
3557#ifdef PRINT_MSG
3558 printf("Allocated GPU memory after Factorize: %ld\n", mem_alloc_dev);
3559#endif
3560
3561 factorization_completed = true;
3562
3563 return gflops;
3564}
3565
3566/************************************************************************************************/
3567
3568template <class T>
3569double BTA<T>::solve(size_t *ia, size_t *ja, T *a, T *b, size_t nrhs, double& t_secondStageForwardPass, double& t_secondStageBackwardPass)
3570{
3571 double gflops = solve(ia, ja, a, b, b, nrhs, t_secondStageForwardPass, t_secondStageBackwardPass);
3572
3573 return gflops;
3574}
3575
3576/************************************************************************************************/
3577
3578template <class T>
3579double BTA<T>::solve(size_t *ia, size_t *ja, T *a, T *x, T *b, size_t nrhs, double& t_secondStageForwardPass, double& t_secondStageBackwardPass)
3580{
3581
3582 double t_FSF;
3583
3584 if (!factorization_completed){
3585 factorize(ia, ja, a, t_FSF);
3586 }
3587
3588 // check if solve is performed on the right matrix
3589 if(!(ia[matrix_size] == matrix_ia[matrix_size])){
3590 std::cout << "Matrices don't match!!" << std::endl;
3591 exit(1);
3592 }
3593
3594 // check MF diff put in static array first time
3595 /*
3596 static int solve_flag = 0;
3597 static double *MF_initial;
3598
3599 if(solve_flag == 0){
3600 printf("in solve, solve flag = 0\n");
3601 MF_initial = new double[matrix_n_nonzeros];
3602 for(int i=0; i<matrix_n_nonzeros; i++){
3603 MF_initial[i] = MF[i];
3604 }
3605 solve_flag = 1;
3606 } else {
3607 double temp = 0;
3608 for(int i=0; i<matrix_n_nonzeros; i++){
3609 temp += (MF_initial[i]-MF[i])*(MF_initial[i]-MF[i]);
3610 }
3611 printf("norm(MF-MF_initial) = %f\n", temp);
3612 }
3613 */
3614
3615 //Data allocation
3616 rhs = new T[matrix_size*nrhs];
3617
3618 //Copy data
3619 memcpy(rhs,b,matrix_size*nrhs*sizeof(T));
3620
3621 //Computation
3622 double gflops = SecondStageSolve(nrhs, t_secondStageForwardPass, t_secondStageBackwardPass);
3623
3624 //Copy data
3625 memcpy(x,rhs,matrix_size*nrhs*sizeof(T));
3626
3627 //Data deallocation
3628 delete[] rhs;
3629
3630 return gflops;
3631}
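A sketch of the factorize-then-solve flow for nrhs right-hand sides stored column-wise (buffer names and sizes are illustrative; x and b each hold matrix_size*nrhs entries):

    void example_solve(BTA<double>& solver, size_t* ia, size_t* ja, double* a,
                       double* x, double* b, size_t nrhs)
    {
        double t_chol = 0.0, t_fwd = 0.0, t_bwd = 0.0;
        solver.factorize(ia, ja, a, t_chol);                 // compute and store the Cholesky factor
        solver.solve(ia, ja, a, x, b, nrhs, t_fwd, t_bwd);   // forward + backward substitution
    }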
3632
3633/************************************************************************************************/
3634// COPY -- DOUBLE PRECISION but assuming SINGLE precision input
3635#if 1
3636template <class T>
3637double BTA<T>::solve_d(size_t *ia, size_t *ja, float* a, float* x, float *b, size_t nrhs)
3638{
3639
3640 double t_FSF;
3641
3642 if (!factorization_completed){
3643 factorize(ia, ja, a, t_FSF);
3644 }
3645
3646 // check if solve is performed on the right matrix
3647 if(!(ia[matrix_size] == matrix_ia[matrix_size])){
3648 std::cout << "Matrices don't match!!" << std::endl;
3649 exit(1);
3650 }
3651
3652 if(sizeof(T) == 8){
3653 printf("T already double precision. Doesn't make sense to call solve_d()! Cholesky factor already in double precision!\n");
3654 exit(1);
3655 }
3656
3657 //Data allocation
3658 double* rhs_d = new double[matrix_size*nrhs];
3659
3660 // copy b to rhs and store as double precision
3661 for(int i=0; i<matrix_size*nrhs; i++){
3662 rhs_d[i] = (double) b[i];
3663 }
3664
3665 //Computation
3666 double gflops = SecondStageSolve_d(nrhs, rhs_d);
3667
3668 //Copy solution into x and convert back to single precision
3669 for(int i=0; i<matrix_size*nrhs; i++){
3670 x[i] = (float) rhs_d[i];
3671 }
3672
3673 //Data deallocation
3674 delete[] rhs_d;
3675
3676 return gflops;
3677}
3678
3679#endif
3680
3681/************************************************************************************************/
3682// COPY -- SINGLE PRECISION but assuming double precision input
3683#if 1
3684template <class T>
3685double BTA<T>::solve_s(size_t *ia, size_t *ja, double* a, double* x, double*b, size_t nrhs)
3686{
3687
3688 double t_FSF;
3689
3690 if (!factorization_completed){
3691 factorize(ia, ja, a, t_FSF);
3692 }
3693
3694 // check if solve is performed on the right matrix
3695 if(!(ia[matrix_size] == matrix_ia[matrix_size])){
3696 std::cout << "Matrices don't match!!" << std::endl;
3697 exit(1);
3698 }
3699
3700 if(sizeof(T) == 4){
3701 printf("T already single precision. Doesn't make sense to call solve_s()! Cholesky factor already in single precision!\n");
3702 exit(1);
3703 }
3704
3705 //Data allocation
3706 float* rhs_s = new float[matrix_size*nrhs];
3707
3708 // copy b to rhs and store as single precision
3709 for(int i=0; i<matrix_size*nrhs; i++){
3710 rhs_s[i] = (float) b[i];
3711 }
3712
3713 //Computation
3714 double gflops = SecondStageSolve_s(nrhs, rhs_s);
3715
3716 //Copy solution into x and convert to double
3717 for(int i=0; i<matrix_size*nrhs; i++){
3718 x[i] = (double) rhs_s[i];
3719 }
3720
3721 //Data deallocation
3722 delete[] rhs_s;
3723
3724 return gflops;
3725}
3726
3727#endif
3728
3729/************************************************************************************************/
3730
3731template <class T>
3732double BTA<T>::BTAdiag(size_t *ia, size_t *ja, T *a, T *diag)
3733{
3734 // check current device. set appropriately.
3735 int GPU_currRank;
3736 cudaGetDevice(&GPU_currRank);
3737
3738 if(GPU_currRank != GPU_rank){
3739 //printf("in BTAdiag. current GPU rank: %d. set GPU rank to: %d\n", GPU_currRank, GPU_rank);
3740 cudaSetDevice(GPU_rank);
3741 cudaGetDevice(&GPU_currRank);
3742 //printf("updated GPU rank is: %d.\n", GPU_currRank);
3743 }
3744
3745 double t_FSF;
3746
3747 if (!factorization_completed)
3748 factorize(ia, ja, a, t_FSF);
3749
3750 // check if number of nnz are the same to check if matrix is the same
3751 if(!(ia[matrix_size] == matrix_ia[matrix_size])){
3752 std::cout << "Matrices don't match!!" << std::endl;
3753 exit(1);
3754 }
3755
3756 factorization_completed = false;
3757 cpy_indicator = 0; // => only copy back diagonal
3758
3759 //Data allocation
3760 T *tmp1_dev;
3761 T *tmp2_dev;
3762
3763#ifdef PRINT_MSG
3764 printf("\nBTAdiag: allocating data on device now. CPY indicator %d\n", cpy_indicator);
3765#endif
3766
3767 allocate_data_on_device((void**)&diag_dev,matrix_size*sizeof(T));
3768 allocate_data_on_device((void**)&diag_pos_dev,matrix_size*sizeof(size_t));
3769 allocate_data_on_device((void**)&tmp1_dev,matrix_ns*matrix_ns*sizeof(T));
3770 mem_alloc_dev = allocate_data_on_device((void**)&tmp2_dev,matrix_ns*matrix_ns*sizeof(T));
3771
3772#ifdef PRINT_MSG
3773 printf("Allocated GPU memory sel Inv: %ld, cpy ind %d\n", mem_alloc_dev, cpy_indicator);
3774#endif
3775
3776 //Copy diag_pos to device
3777 memcpy_to_device(diag_pos,diag_pos_dev,matrix_size*sizeof(size_t), NULL );
3778
3779 //Computation
3780 double gflops = ThirdStageBTA(tmp1_dev,tmp2_dev, cpy_indicator);
3781
3782 //Copy data to host
3783 memcpy_to_host(diag,diag_dev,matrix_size*sizeof(T), NULL );
3784
3785 //Data deallocation
3786 deallocate_data_on_dev(diag_dev,matrix_size*sizeof(T));
3787 deallocate_data_on_dev(diag_pos_dev,matrix_size*sizeof(size_t));
3788 deallocate_data_on_dev(tmp1_dev,matrix_ns*matrix_ns*sizeof(T));
3789 deallocate_data_on_dev(tmp2_dev,matrix_ns*matrix_ns*sizeof(T));
3790
3791#ifdef PRINT_MSG
3792 printf("Allocated GPU memory after sel Inv: %ld, cpy ind %d\n", mem_alloc_dev, cpy_indicator);
3793#endif
3794
3795 return gflops;
3796}
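A sketch of extracting the diagonal of the inverse (e.g. the marginal variances); n is the full matrix dimension ns*nt+nd and the output container is illustrative:

    #include <vector>

    std::vector<double> inverse_diagonal(BTA<double>& solver, size_t* ia, size_t* ja,
                                         double* a, size_t n)
    {
        std::vector<double> d(n);                 // one entry per row/column of the matrix
        solver.BTAdiag(ia, ja, a, d.data());
        return d;
    }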
3797
3798template <class T>
3799double BTA<T>::BTAselInv(size_t *ia, size_t *ja, T *a, T *invQ)
3800{
3801 int GPU_currRank;
3802 cudaGetDevice(&GPU_currRank);
3803
3804 if(GPU_currRank != GPU_rank){
3805 //printf("in BTAselInv. current GPU rank: %d. set GPU rank to: %d\n", GPU_currRank, GPU_rank);
3806 cudaSetDevice(GPU_rank);
3807 cudaGetDevice(&GPU_currRank);
3808 //printf("updated GPU rank is: %d.\n", GPU_currRank);
3809 }
3810
3811 double t_FSF;
3812
3813 if (!factorization_completed)
3814 factorize(ia, ja, a, t_FSF);
3815
3816 // check if number of nnz are the same to check if matrix is the same
3817 if(!(ia[matrix_size] == matrix_ia[matrix_size])){
3818 std::cout << "Matrices don't match!!" << std::endl;
3819 exit(1);
3820 }
3821
3822 cpy_indicator = 1; // => copy back all entries of nnzQ
3823
3824 //Data allocation
3825 T *tmp1_dev;
3826 T *tmp2_dev;
3827
3828 size_t nnz = matrix_ia[matrix_size];
3829
3830 //inv_a = new T[nnz];
3831 gpuErrchk(cudaMallocHost((void**)&inv_a, nnz*sizeof(T)));
3832
3833 //std::cout << "nnz = " << nnz << std::endl;
3834 size_t max_rows = matrix_nt > 1 ? 2*matrix_ns+matrix_nd : matrix_ns+matrix_nd;
3835 size_t max_cols = max(matrix_ns, matrix_nd);
3836
3837#ifdef PRINT_MSG
3838 printf("\nBTASelInv: allocating data on device now. CPY indicator %d\n", cpy_indicator);
3839#endif
3840
3841 //allocate_data_on_device((void**)&diag_dev,matrix_size*sizeof(T));
3842 //allocate_data_on_device((void**)&diag_pos_dev,matrix_size*sizeof(size_t));
3843 allocate_data_on_device((void**)&tmp1_dev,matrix_ns*matrix_ns*sizeof(T));
3844 allocate_data_on_device((void**)&tmp2_dev,matrix_ns*matrix_ns*sizeof(T));
3845
3846 //Temp data allocation
3847 // ALLOCATES MORE MEMORY THAN NEEDED -> only need max nnz for all supernodes
3848 /*allocate_data_on_device((void**)&inv_ia_dev,(max_cols+1)*sizeof(size_t));
3849 allocate_data_on_device((void**)&inv_ja_dev,max_rows*max_cols*sizeof(size_t));
3850 allocate_data_on_device((void**)&inv_a_dev,max_rows*max_cols*sizeof(T));*/
3851
3852 // compute maximum number of nnz over all supernodes (of sparse input matrix A)
3853 // only works because only the last set of columns is not of size matrix_ns ...
3854 get_max_supernode_nnz();
3855 //printf("final max supernode nnz : %ld\n", max_supernode_nnz);
3856
3857 allocate_data_on_device((void**)&inv_ia_dev,(max_cols+1)*sizeof(size_t));
3858 allocate_data_on_device((void**)&inv_ja_dev,max_supernode_nnz*sizeof(size_t));
3859 allocate_data_on_device((void**)&inv_a_dev,max_supernode_nnz*sizeof(T));
3860
3861 //Computation
3862 double gflops = ThirdStageBTA(tmp1_dev,tmp2_dev, cpy_indicator);
3863
3864 factorization_completed = false; // we have overwritten the Cholesky factor
3865
3866 //CAREFUL: invBlks are on CPU but not as contiguous array, block-wise !!
3867 //memcpy_to_host(diag,diag_dev,matrix_size*sizeof(T));
3868
3869 // restructure array to only contain value array of CSC format
3870 //size_t nnz_invBlks = (matrix_ns+matrix_nd)*matrix_ns*matrix_nt+matrix_nd*matrix_nd;
3871 //for(size_t i=0; i<nnz_invBlks; i++){
3872 //invBlks_ext[i] = invBlks[i];
3873 //}
3874
3875 for(size_t i=0; i<nnz; i++){
3876 invQ[i] = inv_a[i];
3877 }
3878
3879 //Data deallocation
3880 //deallocate_data_on_dev(diag_dev,matrix_size*sizeof(T));
3881 //deallocate_data_on_dev(diag_pos_dev,matrix_size*sizeof(size_t));
3882 deallocate_data_on_dev(tmp1_dev,matrix_ns*matrix_ns*sizeof(T));
3883 deallocate_data_on_dev(tmp2_dev,matrix_ns*matrix_ns*sizeof(T));
3884
3885 //Temp data deallocation
3886 deallocate_data_on_dev(inv_ia_dev,(max_cols+1)*sizeof(size_t));
3887 deallocate_data_on_dev(inv_ja_dev,max_supernode_nnz*sizeof(size_t));
3888 deallocate_data_on_dev(inv_a_dev,max_supernode_nnz*sizeof(T));
3889
3890 cudaFreeHost(inv_a);
3891
3892 return gflops;
3893}
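A sketch of obtaining the selected inverse restricted to the sparsity pattern of Q; the output uses the same CSC value layout as the input, so it needs ia[n] entries (names are illustrative):

    #include <vector>

    std::vector<double> selected_inverse(BTA<double>& solver, size_t* ia, size_t* ja,
                                         double* a, size_t n)
    {
        std::vector<double> invQ(ia[n]);          // one value per stored nonzero of Q
        solver.BTAselInv(ia, ja, a, invQ.data());
        return invQ;
    }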
3894
3895template <class T>
3896double BTA<T>::BTAinvBlks(size_t *ia, size_t *ja, T *a, T *invBlks_ext)
3897{
3898
3899 printf("!!!CAREFUL: BTAinvBlks function abandoned. Not sure if it gives correct results!!!\n");
3900
3901 double t_FSF;
3902
3903 if (!factorization_completed)
3904 factorize(ia, ja, a, t_FSF);
3905
3906 // check if number of nnz are the same to check if matrix is the same
3907 if(!(ia[matrix_size] == matrix_ia[matrix_size])){
3908 std::cout << "Matrices don't match!!" << std::endl;
3909 exit(1);
3910 }
3911
3912 factorization_completed = false;
3913 cpy_indicator = 2; // => copy back inv diag blocks in block format
3914
3915 //std::cout << "nnz = " << nnz << std::endl;
3916 size_t max_rows = matrix_nt > 1 ? 2*matrix_ns+matrix_nd : matrix_ns+matrix_nd;
3917 size_t max_cols = max(matrix_ns, matrix_nd);
3918
3919 initialize_invBlks_host();
3920
3921 //Data allocation
3922 T *tmp1_dev;
3923 T *tmp2_dev;
3924
3925 //allocate_data_on_device((void**)&diag_dev,matrix_size*sizeof(T));
3926 //allocate_data_on_device((void**)&diag_pos_dev,matrix_size*sizeof(size_t));
3927 allocate_data_on_device((void**)&tmp1_dev,matrix_ns*matrix_ns*sizeof(T));
3928 allocate_data_on_device((void**)&tmp2_dev,matrix_ns*matrix_ns*sizeof(T));
3929
3930 //Copy diag_pos to device
3931 //memcpy_to_device(diag_pos,diag_pos_dev,matrix_size*sizeof(size_t));
3932
3933 //Computation
3934 double gflops = ThirdStageBTA(tmp1_dev,tmp2_dev, cpy_indicator);
3935
3936 //CAREFUL: invBlks are on CPU but not as contiguous array, block-wise !!
3937 //memcpy_to_host(diag,diag_dev,matrix_size*sizeof(T));
3938
3939 // restructure array to only contain value array of CSC format
3940 size_t nnz_invBlks = (matrix_ns+matrix_nd)*matrix_ns*matrix_nt+matrix_nd*matrix_nd;
3941 for(size_t i=0; i<nnz_invBlks; i++){
3942 invBlks_ext[i] = invBlks[i];
3943 }
3944
3945 //Data deallocation
3946 //deallocate_data_on_dev(diag_dev,matrix_size*sizeof(T));
3947 //deallocate_data_on_dev(diag_pos_dev,matrix_size*sizeof(size_t));
3948 deallocate_data_on_dev(tmp1_dev,matrix_ns*matrix_ns*sizeof(T));
3949 deallocate_data_on_dev(tmp2_dev,matrix_ns*matrix_ns*sizeof(T));
3950
3951 return gflops;
3952}
3953
3954/************************************************************************************************/
3955
3956template <class T>
3957T BTA<T>::logDet(size_t *ia, size_t *ja, T *a)
3958{
3959 double t_FSF;
3960
3961 if (!factorization_completed)
3962 factorize(ia, ja, a, t_FSF);
3963
3964 //Computation
3965 T det = T(0.0);
3966 indexed_log_sum(MF, diag_pos, matrix_size, &det);
3967
3968 return 2.0*det;
3969}
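The value returned here follows the usual Cholesky identity log|A| = 2 * sum_i log(L_ii) for A = L L^T. A minimal CPU reference of the same sum, assuming a hypothetical host array holding the diagonal entries of L, would be:

    #include <cmath>
    #include <cstddef>

    // diag[i] is assumed to hold L_ii, the i-th diagonal entry of the Cholesky factor.
    template <class U>
    U logdet_from_chol_diag(const U* diag, std::size_t n)
    {
        U acc = U(0.0);
        for (std::size_t i = 0; i < n; ++i)
            acc += std::log(diag[i]);
        return U(2.0) * acc;     // log|A| = 2 * sum_i log(L_ii)
    }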
3970
3971/************************************************************************************************/
3972
3973template <class T>
3974double BTA<T>::residualNorm(T *x, T *b)
3975{
3976 T *r = new T[matrix_size];
3977
3978 memcpy(r, b, matrix_size*sizeof(T));
3979
3980 for (size_t ic = 0; ic < matrix_size; ic++)
3981 {
3982 for (size_t i = matrix_ia[ic]; i < matrix_ia[ic+1]; i++)
3983 {
3984 size_t ir = matrix_ja[i];
3985
3986 r[ir] -= matrix_a[i]*x[ic];
3987 if (ir != ic)
3988 r[ic] -= matrix_a[i]*x[ir];
3989 }
3990 }
3991
3992 double res = c_dtnrm2(matrix_size, r, 1);
3993
3994 delete[] r;
3995
3996 return res;
3997}
3998
3999/************************************************************************************************/
4000
4001template <class T>
4002double BTA<T>::residualNormNormalized(T *x, T *b)
4003{
4004 return residualNorm(x, b) / c_dtnrm2(matrix_size, b, 1);
4005}
4006
4007/************************************************************************************************/
4008
4009/*
4010template <class T>
4011void BTA<T>::create_blocks()
4012{
4013 size_t IB;
4014
4015 b_size = 0;
4016
4017 for(IB=0;IB<NBlock;IB++){
4018
4019 if(Bmax[IB]-Bmin[IB]>b_size){
4020 b_size = Bmax[IB]-Bmin[IB];
4021 }
4022 }
4023}
4024*/
4025
4026
4027/************************************************************************************************/
4028
4029template <class T>
4030void BTA<T>::get_max_supernode_nnz()
4031{
4032 // compute maximum number of nnz over all supernodes
4033 // only works because only the last set of columns is not of size matrix_ns ...
4034 max_supernode_nnz = 0;
4035 size_t count_nnz = 0;
4036 // # supernodes: matrix_nt + 1
4037 for(int supernode=0; supernode<matrix_nt+1; supernode++){
4038 size_t supernode_fc = supernode * matrix_ns;
4039 size_t supernode_lc = supernode < matrix_nt ? (supernode+1) * matrix_ns : matrix_ns * matrix_nt + matrix_nd;
4040 size_t supernode_nnz = matrix_ia[supernode_lc] - matrix_ia[supernode_fc];
4041 count_nnz += supernode_nnz;
4042 max_supernode_nnz = max(supernode_nnz, max_supernode_nnz);
4043
4044#ifdef PRINT_MSG
4045 printf("supernode= %d, supernode_nnz = %ld, max_supernode_nnz= %ld\n", supernode, supernode_nnz, max_supernode_nnz);
4046#endif
4047 }
4048
4049}
4050
4051
4052/************************************************************************************************/
4053
4054template <class T>
4055inline void BTA<T>::init_supernode(T *M_dev, size_t supernode, cudaStream_t stream)
4056{
4057
4058 size_t supernode_fc = supernode * matrix_ns;
4059 size_t supernode_lc = supernode < matrix_nt ? (supernode+1) * matrix_ns : matrix_ns * matrix_nt + matrix_nd;
4060 size_t supernode_nnz = matrix_ia[supernode_lc] - matrix_ia[supernode_fc];
4061 size_t supernode_offset = matrix_ia[supernode_fc];
4062 size_t rows = mf_block_lda(supernode, supernode);
4063 size_t cols = supernode_lc - supernode_fc;
4064 size_t supernode_size = rows * cols;
4065
4066 cudaMemsetAsync((void*)M_dev, 0, supernode_size*sizeof(T), stream );
4067
4068 //printf("in init supernode. a[0] = %f\n", matrix_a[supernode_offset]);
4069
4070 memcpy_to_device( &matrix_ia[supernode_fc], ia_dev, (cols+1)*sizeof(size_t), stream );
4071 memcpy_to_device( &matrix_ja[supernode_offset], ja_dev, supernode_nnz*sizeof(size_t), stream );
4072 memcpy_to_device( &matrix_a[supernode_offset], a_dev, supernode_nnz*sizeof(T), stream );
4073
4074 init_supernode_dev(M_dev, ia_dev, ja_dev, a_dev, supernode, supernode_nnz, supernode_offset, matrix_ns, matrix_nt, matrix_nd, stream );
4075
4076 /*double a_host;
4077 gpuErrchk(cudaMemcpy(&a_host, &M_dev[0], sizeof(double), cudaMemcpyDeviceToHost));
4078 printf("in init supernode. IB = %d, a[0] = %f\n", supernode, a_host);*/
4079
4080}
4081
4082
4083/************************************************************************************************/
4084
4085// M_dev is local, all entries of 1 supernode
4086// a_dev is local -> all nnz entries of a in that supernode
4087template <class T>
4088inline void BTA<T>::extract_nnzA(T *M_dev, size_t supernode)
4089{
4090 // only works because only the last set of columns is not of size matrix_ns ...
4091 size_t supernode_fc = supernode * matrix_ns;
4092 size_t supernode_lc = supernode < matrix_nt ? (supernode+1) * matrix_ns : matrix_ns * matrix_nt + matrix_nd;
4093 size_t supernode_nnz = matrix_ia[supernode_lc] - matrix_ia[supernode_fc];
4094 size_t supernode_offset = matrix_ia[supernode_fc];
4095 size_t rows = mf_block_lda(supernode, supernode);
4096 size_t cols = supernode_lc - supernode_fc;
4097 size_t supernode_size = rows * cols;
4098
4099 // check that supernode_nnz is smaller than max_supernode_nnz because length(inv_a_dev/inv_ja_dev) = max_supernode_nnz
4100 if(supernode_nnz > max_supernode_nnz){
4101 printf("Memory Allocation problem! supernode_nnz: %ld > max_supernode_nnz: %ld\n", supernode_nnz, max_supernode_nnz);
4102 exit(1);
4103 }
4104
4105 gpuErrchk(cudaMemcpy(inv_ia_dev, &matrix_ia[supernode_fc], (cols+1)*sizeof(size_t), cudaMemcpyHostToDevice ));
4106 gpuErrchk(cudaMemcpy(inv_ja_dev, &matrix_ja[supernode_offset], supernode_nnz*sizeof(size_t), cudaMemcpyHostToDevice ));
4107
4108 //memcpy_to_device(&matrix_ia[supernode_fc],inv_ia_dev,(cols+1)*sizeof(size_t));
4109 //memcpy_to_device(&matrix_ja[supernode_offset],inv_ja_dev,supernode_nnz*sizeof(size_t));
4110 //memcpy_to_device(&matrix_a[supernode_offset],a_dev,supernode_nnz*sizeof(T));
4111
4112 /*
4113 T* M_host;
4114 M_host = new T[supernode_size];
4115
4116 cudaMemcpy(M_host, M_dev, supernode_size*sizeof(T), cudaMemcpyDeviceToHost );
4117
4118 printf("IB = %ld, supernode nnz = %ld, supernode size = %ld, M host : ", supernode, supernode_nnz, supernode_size);
4119 for(int i=0; i<supernode_size; i++){
4120 printf(" %f", M_host[i]);
4121 }
4122 printf("\n");
4123
4124 delete[] M_host;
4125
4126 size_t* inv_ia_host;
4127 inv_ia_host = new size_t[supernode_nnz];
4128
4129 //memcpy_to_host(a_dev, a_host, supernode_nnz*sizeof(T));
4130 cudaMemcpy(inv_ia_host, inv_ia_dev, (cols+1)*sizeof(size_t), cudaMemcpyDeviceToHost );
4131 //memcpy_to_host(a_dev, &invQ[supernode_offset], supernode_nnz*sizeof(T));
4132
4133 printf("ia host : ");
4134 for(int i=0; i<cols+1; i++){
4135 printf(" %ld", inv_ia_host[i]);
4136 }
4137 printf("\n");
4138 */
4139
4140 extract_nnzA_dev(inv_a_dev, inv_ia_dev, inv_ja_dev, M_dev, supernode, supernode_nnz, supernode_offset, matrix_ns, matrix_nt, matrix_nd);
4141
4142 //exit(1);
4143 //printf("supernode offset : %ld, supernode nnz : %ld\n", supernode_offset, supernode_nnz);
4144
4145 gpuErrchk(cudaMemcpy(&inv_a[supernode_offset], inv_a_dev, supernode_nnz*sizeof(T), cudaMemcpyDeviceToHost));
4146
4147 /*
4148 printf("inv_a : ");
4149 for(int i=0; i<supernode_nnz; i++){
4150 //inv_a[supernode_offset+i] = inv_a_host[i];
4151 printf(" %f", inv_a[supernode_offset+i]);
4152 }
4153 printf("\n");
4154 cudaDeviceSynchronize();
4155 */
4156}
4157
4158/************************************************************************************************/
4159
4160template <class T>
4161inline void BTA<T>::copy_supernode_to_host(T *M_dev, size_t supernode, cudaStream_t stream)
4162{
4163 size_t supernode_fc = supernode * matrix_ns;
4164 size_t supernode_lc = supernode < matrix_nt ? (supernode+1) * matrix_ns : matrix_ns * matrix_nt + matrix_nd;
4165 size_t ind = mf_block_index(supernode, supernode);
4166 size_t rows = mf_block_lda(supernode, supernode);
4167 size_t cols = supernode_lc - supernode_fc;
4168
4169#ifdef PRINT_MSG
4170 printf("In copy supernode to host.\n");
4171 printf("size rows = %ld, size cols = %ld, size rows*cols = %ld, ind = %ld\n", rows, cols, rows*cols, ind);
4172#endif
4173
4174 memcpy_to_host(&MF[ind], M_dev, rows*cols*sizeof(T), stream );
4175
4176}
4177
4178
4179/************************************************************************************************/
4180
4181template <class T>
4182inline void BTA<T>::copy_supernode_to_host_write(T *M_dev, size_t supernode)
4183{
4184 size_t supernode_fc = supernode * matrix_ns;
4185 size_t supernode_lc = supernode < matrix_nt ? (supernode+1) * matrix_ns : matrix_ns * matrix_nt + matrix_nd;
4186 size_t ind = mf_block_index(supernode, supernode);
4187 size_t rows = mf_block_lda(supernode, supernode);
4188 size_t cols = supernode_lc - supernode_fc;
4189
4190#ifdef PRINT_MSG
4191 printf("In copy supernode to host write.\n");
4192 printf("size rows&cols = %ld, ind = %ld\n", rows*cols, ind);
4193#endif
4194
4195 // initialize MF host in case not allocated
4196 initialize_MF_host();
4197
4198 memcpy_to_host(&MF[ind], M_dev, rows*cols*sizeof(T), NULL);
4199
4200 time_t rawtime;
4201 struct tm * timeinfo;
4202 char buffer[80];
4203
4204 time (&rawtime);
4205 timeinfo = localtime(&rawtime);
4206
4207 strftime(buffer,sizeof(buffer),"chunk_of_L_%d-%m-%Y_%H:%M:%S.txt",timeinfo);
4208 std::string file_name(buffer);
4209
4210 //std::string file_name = "chunk_of_L.txt";
4211 ofstream file(file_name, ios::out | ::ios::trunc);
4212 for(int i=0; i<rows*cols; i++){
4213 //file << std::setprecision(15) << MF[ind+i] << endl;
4214 file << MF[ind+i] << endl;
4215
4216 }
4217 file.close();
4218 std::cout << "wrote to file : " << file_name << std::endl;
4219
4220}
4221
4222
4223/************************************************************************************************/
4224
4225template <class T>
4226inline void BTA<T>::copy_supernode_to_device(T *M_dev, size_t supernode, cudaStream_t stream)
4227{
4228 size_t supernode_fc = supernode * matrix_ns;
4229 size_t supernode_lc = supernode < matrix_nt ? (supernode+1) * matrix_ns : matrix_ns * matrix_nt + matrix_nd;
4230 size_t ind = mf_block_index(supernode, supernode);
4231 size_t rows = mf_block_lda(supernode, supernode);
4232 size_t cols = supernode_lc - supernode_fc;
4233
4234 memcpy_to_device(&MF[ind], M_dev, rows*cols*sizeof(T), stream);
4235
4236}
4237
4238/************************************************************************************************/
4239
4240template <class T>
4241inline void BTA<T>::copy_supernode_diag(T *src, size_t supernode)
4242{
4243 size_t supernode_fc = supernode * matrix_ns;
4244 size_t supernode_lc = supernode < matrix_nt ? (supernode+1) * matrix_ns : matrix_ns * matrix_nt + matrix_nd;
4245 size_t offset = mf_block_index(supernode, supernode);
4246 size_t rows = mf_block_lda(supernode, supernode);
4247 size_t cols = supernode_lc - supernode_fc;
4248
4249 indexed_copy_offset_dev(src, &diag_dev[supernode_fc], &diag_pos_dev[supernode_fc], cols, offset);
4250
4251}
4252
4253/************************************************************************************************/
4254
4255template <class T>
4256inline void BTA<T>::swap_pointers(T **ptr1, T **ptr2)
4257{
4258
4259 T *tmp = *ptr1;
4260 *ptr1 = *ptr2;
4261 *ptr2 = tmp;
4262
4263 }
4264
4265/************************************************************************************************/
4266
4267// added inline here and in the next 3 functions to avoid compiler issues!
4268template <>
4269inline CPX BTA<CPX>::f_one()
4270{
4271 return CPX(1.0, 0.0);
4272}
4273
4274template <>
4275inline double BTA<double>::f_one()
4276{
4277 return 1.0;
4278}
4279
4280// new SINGLE PRECISION
4281template <>
4282inline float BTA<float>::f_one()
4283{
4284 return 1.0;
4285}
4286
4287/************************************************************************************************/
4288
4289template <>
4290inline CPX BTA<CPX>::f_zero()
4291{
4292 return CPX(0.0, 0.0);
4293}
4294
4295template <>
4296inline double BTA<double>::f_zero()
4297{
4298 return 0.0;
4299}
4300
4301// new SINGLE PRECISION
4302template <>
4303inline float BTA<float>::f_zero()
4304{
4305 return 0.0;
4306}
4307
4308/************************************************************************************************/
4309
4310#endif
4311
4312