double flop_count_factorise();
size_t max_supernode_nnz = 0;
size_t ind_invBlks_fi;
size_t mem_alloc_dev = 0;
bool MF_allocated = false;
bool invBlks_allocated = false;
bool MF_dev_allocated = false;
bool factorization_completed = false;
magma_queue_t magma_queue_1;
magma_queue_t magma_queue_2;
cudaStream_t stream_c;
cudaStream_t copyStream = NULL;
cudaStream_t magma_cudaStream_1 = NULL;
cudaStream_t magma_cudaStream_2 = NULL;
cudaEvent_t initBlock_dev_ev;
cudaEvent_t potrf_dev_ev;
magma_event_t potrf_dev_magma_ev;
int* info_cuda = NULL;
int* cuda_buffer_flag_potrf;
int* cuda_buffer_flag_trtri;
cusolverDnHandle_t* handle;
cusolverDnParams_t* params;
double* mem_cuda_dev;
double* mem_cuda_host;
int magma_potrf_init_flag;
magma_event_t magma_events[2];
magma_queue_t magma_queues[2];
size_t* diag_pos_dev;
inline size_t mf_block_index(size_t, size_t);
inline size_t mf_block_lda(size_t, size_t);
inline size_t mf_dense_block_index(size_t);
inline size_t mf_dense_block_offset(size_t);
inline size_t mf_dense_block_lda(size_t);
inline size_t invblks_diag_block_index(size_t);
inline size_t invblks_dense_block_index(size_t);
double FirstStageFactor();
double FirstSecondStageFactor(size_t nrhs);
double FirstStageFactor_noCopyHost(T& logDet);
double FirstStageFactor_noCopyHost_testV(double& logDet);
double ForwardPassSolve(size_t);
double BackwardPassSolve(size_t);
double SecondStageSolve(size_t, double& t_secondStageForwardPass, double& t_secondStageBackwardPass);
double SecondStageSolve_s(size_t, float* rhs_s);
double SecondStageSolve_d(size_t, double* rhs_d);
double ThirdStageBTA(T*, T*, int);
void initialize_MF_host();
void initialize_invBlks_host();
inline void get_max_supernode_nnz();
inline void init_supernode(T* M_dev, size_t supernode, cudaStream_t stream);
inline void copy_supernode_to_host(T* M_dev, size_t supernode, cudaStream_t stream);
inline void extract_nnzA(T* M_dev, size_t supernode);
inline void copy_supernode_to_host_write(T* M_dev, size_t supernode);
inline void copy_supernode_to_device(T* M_dev, size_t supernode, cudaStream_t stream);
inline void copy_supernode_diag(T* src, size_t supernode);
inline void swap_pointers(T** ptr1, T** ptr2);
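// Layout sketch (inferred from the index helpers and the diag_pos setup below, not a
// documented interface): the factor MF is stored supernode by supernode in column-major
// panels. Each of the first nt-1 time supernodes holds ns columns of height 2*ns+nd
// (diagonal block, sub-diagonal block, dense "arrow" rows), the last time supernode
// holds ns columns of height ns+nd, and the final supernode is the nd x nd
// fixed-effects block. diag_pos[] records where each diagonal entry sits in this
// packed storage.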
printf("ns = %ld, nt = %ld. No spatial field, just fixed effects. Consider using a different solver!\n", ns, nt);
GPU_rank = GPU_rank_;
printf("using CUDA POTRF.\n");
#elif defined(MAGMA_EXPERT)
printf("using MAGMA EXPERT POTRF.\n");
printf("using MAGMA POTRF.\n");
matrix_size = ns*nt + nd;
matrix_n_nonzeros = 2*(nt-1)*ns*ns + ns*ns + matrix_size*nd;
Bmin = new size_t[NBlock];
Bmax = new size_t[NBlock];
for (size_t i = 0; i < nt; i++)
Bmax[nt] = nt*ns + nd;
diag_pos = new size_t[matrix_size];
for (IB = 0; IB < nt-1; IB++)
for (size_t i = 0; i < ns; i++)
diag_pos[IB*ns+i] = IB * ns*(2*ns+nd) + i*(2*ns+nd+1);
for (size_t i = 0; i < ns; i++)
diag_pos[IB*ns+i] = IB * ns*(2*ns+nd) + i*(ns+nd+1);
for (size_t i = 0; i < nd; i++)
diag_pos[nt*ns+i] = (nt-1) * ns*(2*ns+nd) + ns*(ns+nd) + i*(nd+1);
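// Worked example (illustrative only): with ns = 2, nd = 1, nt = 3 the first two time
// supernodes are 2 columns of height 2*2+1 = 5, so their diagonal entries sit at
// offsets {0, 6} and {10, 16} (stride lda+1 = 6). The last time supernode starts at
// offset 20 with lda = ns+nd = 3, giving diagonal offsets {20, 24}, and the final
// nd x nd block starts at offset 26. The total is 2*(nt-1)*ns*ns + ns*ns + matrix_size*nd = 27
// entries, matching matrix_n_nonzeros above.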
gpuErrchk(cudaEventCreate(&initBlock_dev_ev));
gpuErrchk(cudaEventCreate(&potrf_dev_ev));
magma_event_create(&potrf_dev_magma_ev);
gpuErrchk(cudaStreamCreate(&copyStream));
magma_device_t device;
magma_getdevice(&device);
gpuErrchk(cudaStreamCreate(&magma_cudaStream_1));
gpuErrchk(cudaStreamCreate(&magma_cudaStream_2));
magma_queue_create_from_cuda(device, magma_cudaStream_1, NULL, NULL, &magma_queue_1);
magma_queue_create_from_cuda(device, magma_cudaStream_2, NULL, NULL, &magma_queue_2);
cudaMalloc((void**)&info_cuda, sizeof(int));
cudaMallocHost((void**)&cuda_buffer_flag_potrf, sizeof(int));
cuda_buffer_flag_potrf[0] = 0;
cudaMallocHost((void**)&cuda_buffer_flag_trtri, sizeof(int));
cuda_buffer_flag_trtri[0] = 0;
cudaMallocHost((void**)&handle, sizeof(cusolverDnHandle_t));
cudaMallocHost((void**)&params, sizeof(cusolverDnParams_t));
cudaMallocHost((void**)&dev_size, sizeof(size_t));
cudaMallocHost((void**)&host_size, sizeof(size_t));
cusolverStatus_t cuSolverError = cusolverDnCreate(handle);
if (cuSolverError != 0) {
printf("cuSolverError not zero in create handle! Error: %d\n", cuSolverError);
cuSolverError = cusolverDnCreateParams(params);
if (cuSolverError != 0) {
printf("cuSolverError not zero in create params!\n");
magma_potrf_init_flag = 0;
magma_event_create(&magma_events[0]);
magma_event_create(&magma_events[1]);
magma_queues[0] = magma_queue_1;
magma_queues[1] = magma_queue_2;
std::cout << "In BTA destructor. MF_allocated : " << MF_allocated << std::endl;
std::cout << "cudaFreeHost(MF) gets called." << std::endl;
if (invBlks_allocated) {
std::cout << "cudaFreeHost(invBlks) gets called." << std::endl;
cudaFreeHost(invBlks);
if (MF_dev_allocated)
size_t max_supernode_nnz_dense = matrix_nt > 1 ? matrix_ns*(2*matrix_ns+matrix_nd) : matrix_ns*(matrix_ns+matrix_nd);
size_t final_supernode_nnz_dense = matrix_nd > 0 ? matrix_nd*matrix_nd : 0;
deallocate_data_on_dev(blockR_dev, max_supernode_nnz_dense*sizeof(T));
deallocate_data_on_dev(blockM_dev, max_supernode_nnz_dense*sizeof(T));
deallocate_data_on_dev(blockDense_dev, final_supernode_nnz_dense*sizeof(T));
magma_queue_destroy(magma_queue_1);
magma_queue_destroy(magma_queue_2);
cudaStreamDestroy(copyStream);
cudaStreamDestroy(magma_cudaStream_1);
cudaStreamDestroy(magma_cudaStream_2);
cudaFreeHost(info_cuda);
cudaFreeHost(cuda_buffer_flag_potrf);
cudaFreeHost(cuda_buffer_flag_trtri);
cudaFreeHost(host_size);
cudaFreeHost(dev_size);
cudaFreeHost(mem_cuda_host);
cudaFree(mem_cuda_dev);
cudaFreeHost(host_work);
cudaFree(device_work);
return diag_pos[c*matrix_ns] + (r-c)*matrix_ns;
return 2*matrix_ns + matrix_nd;
return matrix_ns + matrix_nd;
return diag_pos[i*matrix_ns] + 2*matrix_ns;
else if (i == matrix_nt-1)
return diag_pos[i*matrix_ns] + matrix_ns;
return diag_pos[i*matrix_ns];
else if (i == matrix_nt-1)
return mf_block_lda(i, i);
printf("in invblks_diag_block_index(). block index i = %ld out of bounds!\n", i);
return (matrix_ns+matrix_nd)*matrix_ns*i;
printf("in invblks_dense_block_index(). block index i = %ld out of bounds!\n", i);
return matrix_ns*matrix_ns*(i+1) + matrix_ns*matrix_nd*i;
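// Reading guide for the helpers above (derived from their return values, illustrative):
// mf_block_index(r, c) locates block (r, c) inside the column panel of supernode c,
// (r-c)*matrix_ns rows below its diagonal entry; mf_block_lda() is the panel height
// (2*ns+nd for the first nt-1 supernodes, ns+nd for the last time supernode);
// mf_dense_block_index/offset address the nd-row "arrow" strip at the bottom of each
// panel; the invblks_* helpers index into the separate invBlks buffer that collects
// the selected-inverse blocks.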
std::cout << "In FirstStageFactor(), omp get thread num = " << omp_get_thread_num() << std::endl;
double t_copy_DtH = 0;
double t_init_supernode = get_time(0.0);
double flop_count = 0;
NR = Bmax[0]-Bmin[0];
std::cout << "IB = " << IB << std::endl;
init_supernode(blockR_dev, IB, magma_cudaStream_1);
init_supernode(blockDense_dev, matrix_nt, magma_cudaStream_1);
std::cout << "calling first Cholesky factorization now" << std::endl;
t_init_supernode = get_time(t_init_supernode);
double t_fact = get_time(0.0);
t_temp = get_time(0.0);
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
cudaDeviceSynchronize();
t_temp = get_time(t_temp);
flop_count += 1.0/3.0 * NR * NR * NR + 0.5 * NR * NR + 1.0/6.0 * NR;
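// n^3/3 + n^2/2 + n/6 is the exact operation count of a Cholesky factorization of an
// n x n matrix, so flop_count accumulates the factorization work per diagonal block.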
gpuErrchk(cudaEventRecord(potrf_dev_ev, magma_cudaStream_1));
gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, potrf_dev_ev, 0));
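// The event makes the second MAGMA stream wait until the POTRF on stream 1 has
// finished, so the two triangular solves below (sub-diagonal block on queue 1, dense
// arrow rows on queue 2) can run concurrently against the freshly computed factor.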
t_temp = get_time(0.0);
ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[NR], mf_block_lda(1, 0), magma_queue_1);
t_temp = get_time(t_temp);
flop_count += NR * NR * NR;
ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
flop_count += matrix_nd * NR * NR;
t_temp = get_time(0.0);
cudaDeviceSynchronize();
copy_supernode_to_host(blockR_dev, IB, copyStream);
t_temp = get_time(t_temp);
t_copy_DtH += t_temp;
std::cout << "entering block factorization loop now" << std::endl;
for (IB = 1; IB < matrix_nt-1; IB++)
std::cout << "IB = " << IB << std::endl;
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
double flop_1iter = 0.0;
swap_pointers(&blockR_dev, &blockM_dev);
nvtxRangeId_t id_initSN = nvtxRangeStartA("initSuperNode_inLoop");
init_supernode(blockR_dev, IB, magma_cudaStream_1);
nvtxRangeEnd(id_initSN);
gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
nvtxRangeId_t id_dgemm = nvtxRangeStartA("BigGemm_inLoop");
t_temp = get_time(0.0);
tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
t_temp = get_time(t_temp);
nvtxRangeEnd(id_dgemm);
flop_count += 2.0 * NR * NR * NM;
id_dgemm = nvtxRangeStartA("DenseLowerGemm_inLoop");
t_temp = get_time(0.0);
tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
t_dgemm += get_time(t_temp);
nvtxRangeEnd(id_dgemm);
flop_count += 2.0*matrix_nd * NR * NM;
id_dgemm = nvtxRangeStartA("LastBlockLowerGemm_inLoop");
tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
nvtxRangeEnd(id_dgemm);
flop_count += 2.0*matrix_nd*matrix_nd*NM;
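// These three GEMMs apply the Schur-complement update of the arrowhead factorization:
// the previous off-diagonal factor block updates the current diagonal block, the
// previous dense arrow block updates the current arrow rows, and the fixed-effects
// block accumulates its own rank-NM correction. Each GEMM costs 2*m*n*k flops.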
t_temp = get_time(0.0);
nvtxRangeId_t id_dpotrf = nvtxRangeStartA("Potrf_inLoop");
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
copy_supernode_to_host_write(blockR_dev, IB);
magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
cudaDeviceSynchronize();
copy_supernode_to_host_write(blockR_dev, IB);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
cudaDeviceSynchronize();
nvtxRangeEnd(id_dpotrf);
t_temp = get_time(t_temp);
flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
magma_event_record(potrf_dev_magma_ev, magma_queue_1);
magma_queue_wait_event(magma_queue_2, potrf_dev_magma_ev);
nvtxRangeId_t id_ttrsm = nvtxRangeStartA("ttrsm_inLoop");
ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[NR], mf_block_lda(IB+1, IB), magma_queue_1);
ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
nvtxRangeEnd(id_ttrsm);
flop_count += (NR + matrix_nd) * NR * NR;
t_temp = get_time(0.0);
nvtxRangeId_t id_Cpy_SN_host = nvtxRangeStartA("copy_SN_toHost_inLoop");
cudaDeviceSynchronize();
copy_supernode_to_host(blockR_dev, IB, copyStream);
nvtxRangeEnd(id_Cpy_SN_host);
t_copy_DtH += get_time(t_temp);
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
std::cout << "IB = " << IB << std::endl;
swap_pointers(&blockR_dev, &blockM_dev);
nvtxRangeId_t id_initSN = nvtxRangeStartA("initLastSuperNode");
init_supernode(blockR_dev, IB, magma_cudaStream_1);
nvtxRangeEnd(id_initSN);
gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
nvtxRangeId_t id_dgemm = nvtxRangeStartA("LastLargeGemm");
tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
nvtxRangeEnd(id_dgemm);
flop_count += 2.0* NR * NR * NM;
tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
flop_count += 2.0*matrix_nd * NR * NM;
tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
flop_count += 2.0*matrix_nd*matrix_nd*NM;
nvtxRangeId_t id_dpotrf = nvtxRangeStartA("LastPotrf");
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
cudaDeviceSynchronize();
nvtxRangeEnd(id_dpotrf);
flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
nvtxRangeId_t id_ttrsm = nvtxRangeStartA("LastTtrsm");
ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_1);
nvtxRangeEnd(id_ttrsm);
flop_count += matrix_nd * NR * NR;
t_temp = get_time(0.0);
cudaDeviceSynchronize();
nvtxRangeId_t id_Cpy_SN_host = nvtxRangeStartA("copy_LastLarge_SN_toHost");
copy_supernode_to_host(blockR_dev, IB, copyStream);
nvtxRangeEnd(id_Cpy_SN_host);
t_copy_DtH += get_time(t_temp);
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
std::cout << "IB = " << IB << std::endl;
tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_1);
flop_count += 2.0 * matrix_nd*matrix_nd*NM;
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockDense_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockDense_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
tpotrf_dev('L', NR, blockDense_dev, mf_block_lda(IB, IB), &info);
flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
std::cout << "after complete Cholesky factorization" << std::endl;
t_temp = get_time(0.0);
cudaDeviceSynchronize();
copy_supernode_to_host(blockDense_dev, IB, copyStream);
t_copy_DtH += get_time(t_temp);
cudaDeviceSynchronize();
t_fact = get_time(t_fact);
double gflops = flop_count / (1e9*t_fact);
std::cout << "In FirstSecondStageFactor(), omp get thread num = " << omp_get_thread_num() << std::endl;
double t_copy_DtH = 0;
double t_init_supernode = get_time(0.0);
double flop_count = 0;
NR = Bmax[0]-Bmin[0];
std::cout << "IB = " << IB << std::endl;
init_supernode(blockR_dev, IB, magma_cudaStream_1);
init_supernode(blockDense_dev, matrix_nt, magma_cudaStream_1);
std::cout << "calling first Cholesky factorization now" << std::endl;
t_init_supernode = get_time(t_init_supernode);
double t_fact = get_time(0.0);
t_temp = get_time(0.0);
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
cudaDeviceSynchronize();
t_temp = get_time(t_temp);
flop_count += 1.0/3.0 * NR * NR * NR + 0.5 * NR * NR + 1.0/6.0 * NR;
gpuErrchk(cudaEventRecord(potrf_dev_ev, magma_cudaStream_1));
gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, potrf_dev_ev, 0));
cudaDeviceSynchronize();
ttrsm_dev('L', 'L', 'N', 'N', NR, nrhs, ONE, blockR_dev, mf_block_lda(0,0), &rhs_dev[Bmin[0]], matrix_size, magma_queue_1);
t_temp = get_time(0.0);
ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[NR], mf_block_lda(1, 0), magma_queue_1);
t_temp = get_time(t_temp);
flop_count += NR * NR * NR;
ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
flop_count += matrix_nd * NR * NR;
t_temp = get_time(0.0);
cudaDeviceSynchronize();
copy_supernode_to_host(blockR_dev, IB, copyStream);
t_temp = get_time(t_temp);
t_copy_DtH += t_temp;
std::cout << "entering block factorization loop now" << std::endl;
for (IB = 1; IB < matrix_nt-1; IB++)
std::cout << "IB = " << IB << std::endl;
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
double flop_1iter = 0.0;
swap_pointers(&blockR_dev, &blockM_dev);
nvtxRangeId_t id_initSN = nvtxRangeStartA("initSuperNode_inLoop");
init_supernode(blockR_dev, IB, magma_cudaStream_1);
nvtxRangeEnd(id_initSN);
gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
nvtxRangeId_t id_dgemm = nvtxRangeStartA("BigGemm_inLoop");
t_temp = get_time(0.0);
tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
t_temp = get_time(t_temp);
nvtxRangeEnd(id_dgemm);
flop_count += 2.0 * NR * NR * NM;
tgemm_dev('N', 'N', NR, nrhs, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &rhs_dev[Bmin[IB-1]], matrix_size, ONE, &rhs_dev[Bmin[IB]], matrix_size, magma_queue_1);
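// The right-hand side is eliminated on the fly: the previous off-diagonal factor block
// updates rhs_dev[Bmin[IB]] before the diagonal triangular solve later in this
// iteration, so the forward substitution is fused into the factorization sweep instead
// of being a separate pass.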
id_dgemm = nvtxRangeStartA("DenseLowerGemm_inLoop");
t_temp = get_time(0.0);
tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
t_dgemm += get_time(t_temp);
nvtxRangeEnd(id_dgemm);
flop_count += 2.0*matrix_nd * NR * NM;
id_dgemm = nvtxRangeStartA("LastBlockLowerGemm_inLoop");
tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
nvtxRangeEnd(id_dgemm);
flop_count += 2.0*matrix_nd*matrix_nd*NM;
tgemm_dev('N', 'N', NR, nrhs, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &rhs_dev[Bmin[IB-1]], matrix_size, ONE, &rhs_dev[Bmin[NBlock-1]], matrix_size, magma_queue_2);
t_temp = get_time(0.0);
nvtxRangeId_t id_dpotrf = nvtxRangeStartA("Potrf_inLoop");
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
cudaDeviceSynchronize();
nvtxRangeEnd(id_dpotrf);
t_temp = get_time(t_temp);
flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
ttrsm_dev('L', 'L', 'N', 'N', NR, nrhs, ONE, blockR_dev, mf_block_lda(IB,IB), &rhs_dev[Bmin[IB]], matrix_size, magma_queue_1);
magma_event_record(potrf_dev_magma_ev, magma_queue_1);
magma_queue_wait_event(magma_queue_2, potrf_dev_magma_ev);
nvtxRangeId_t id_ttrsm = nvtxRangeStartA("ttrsm_inLoop");
ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[NR], mf_block_lda(IB+1, IB), magma_queue_1);
ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
nvtxRangeEnd(id_ttrsm);
flop_count += (NR + matrix_nd) * NR * NR;
t_temp = get_time(0.0);
nvtxRangeId_t id_Cpy_SN_host = nvtxRangeStartA("copy_SN_toHost_inLoop");
cudaDeviceSynchronize();
copy_supernode_to_host(blockR_dev, IB, copyStream);
nvtxRangeEnd(id_Cpy_SN_host);
t_copy_DtH += get_time(t_temp);
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
std::cout << "IB = " << IB << std::endl;
swap_pointers(&blockR_dev, &blockM_dev);
nvtxRangeId_t id_initSN = nvtxRangeStartA("initLastSuperNode");
init_supernode(blockR_dev, IB, magma_cudaStream_1);
nvtxRangeEnd(id_initSN);
gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
nvtxRangeId_t id_dgemm = nvtxRangeStartA("LastLargeGemm");
tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
nvtxRangeEnd(id_dgemm);
flop_count += 2.0* NR * NR * NM;
tgemm_dev('N', 'N', NR, nrhs, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &rhs_dev[Bmin[IB-1]], matrix_size, ONE, &rhs_dev[Bmin[IB]], matrix_size, magma_queue_1);
tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
flop_count += 2.0*matrix_nd * NR * NM;
tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
flop_count += 2.0*matrix_nd*matrix_nd*NM;
tgemm_dev('N', 'N', NR, nrhs, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &rhs_dev[Bmin[IB-1]], matrix_size, ONE, &rhs_dev[Bmin[NBlock-1]], matrix_size, magma_queue_2);
nvtxRangeId_t id_dpotrf = nvtxRangeStartA("LastPotrf");
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
cudaDeviceSynchronize();
nvtxRangeEnd(id_dpotrf);
flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
ttrsm_dev('L', 'L', 'N', 'N', NR, nrhs, ONE, blockR_dev, mf_block_lda(IB,IB), &rhs_dev[Bmin[IB]], matrix_size, magma_queue_1);
nvtxRangeId_t id_ttrsm = nvtxRangeStartA("LastTtrsm");
ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_1);
nvtxRangeEnd(id_ttrsm);
flop_count += matrix_nd * NR * NR;
t_temp = get_time(0.0);
cudaDeviceSynchronize();
nvtxRangeId_t id_Cpy_SN_host = nvtxRangeStartA("copy_LastLarge_SN_toHost");
copy_supernode_to_host(blockR_dev, IB, copyStream);
nvtxRangeEnd(id_Cpy_SN_host);
t_copy_DtH += get_time(t_temp);
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
std::cout << "IB = " << IB << std::endl;
tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_1);
flop_count += 2.0 * matrix_nd*matrix_nd*NM;
tgemm_dev('N', 'N', NR, nrhs, NM, -ONE, &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &rhs_dev[Bmin[IB-1]], matrix_size, ONE, &rhs_dev[Bmin[NBlock-1]], matrix_size, magma_queue_1);
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockDense_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockDense_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
tpotrf_dev('L', NR, blockDense_dev, mf_block_lda(IB, IB), &info);
flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
ttrsm_dev('L', 'L', 'N', 'N', NR, nrhs, ONE, blockDense_dev, mf_block_lda(IB, IB), &rhs_dev[Bmin[IB]], matrix_size, magma_queue_1);
std::cout << "after complete Cholesky factorization + forward solve" << std::endl;
t_temp = get_time(0.0);
cudaDeviceSynchronize();
copy_supernode_to_host(blockDense_dev, IB, copyStream);
t_copy_DtH += get_time(t_temp);
cudaDeviceSynchronize();
t_fact = get_time(t_fact);
double gflops = flop_count / (1e9*t_fact);
printf("In Factorize noCopyHost test version.\n");
NR = Bmax[0]-Bmin[0];
nvtxRangeId_t id_initSN = nvtxRangeStartA("initSuperNodeDev");
init_supernode(blockR_dev, IB, magma_cudaStream_1);
nvtxRangeEnd(id_initSN);
cudaDeviceSynchronize();
copy_supernode_to_host_write(blockR_dev, IB);
magma_potrf_init_flag = 0;
for (int i = 0; i < 2; i++) {
std::cout << "Copy to blockM_dev, i = " << i << ", mf_block_lda(IB, IB) = " << mf_block_lda(IB, IB) << std::endl;
nvtxRangeId_t id_cpyDevDev = nvtxRangeStartA("CopySNtoblockMdev");
tlacpy_dev('N', 2*NR, NR, blockR_dev, mf_block_lda(IB, IB), blockM_dev, mf_block_lda(IB, IB), magma_queue_1);
nvtxRangeEnd(id_cpyDevDev);
copy_supernode_to_host_write(blockM_dev, IB);
std::cout << "IB = " << IB << std::endl;
double t_fact = get_time(0.0);
nvtxRangeId_t id_dpotrf = nvtxRangeStartA("firstPotrf");
tpotrf_dev_cuda('L', NR, blockM_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
copy_supernode_to_host_write(blockM_dev, IB);
magma_tpotrf_expert_wrapper('L', NR, blockM_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
copy_supernode_to_host_write(blockM_dev, IB);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockM_dev, mf_block_lda(IB, IB), &info);
nvtxRangeEnd(id_dpotrf);
copy_supernode_to_host_write(blockM_dev, IB);
id_cpyDevDev = nvtxRangeStartA("CopyDiagDevDev");
copy_supernode_diag(blockM_dev, IB);
nvtxRangeEnd(id_cpyDevDev);
id_cpyDevDev = nvtxRangeStartA("CopyDiagtoHost");
cudaMemcpy(diag, diag_dev, NR*sizeof(T), cudaMemcpyDeviceToHost);
nvtxRangeEnd(id_cpyDevDev);
id_cpyDevDev = nvtxRangeStartA("ComLogDetHost");
log_sum(diag, NR, &logDet);
printf("log Det : %f\n", logDet);
nvtxRangeEnd(id_cpyDevDev);
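// The log-determinant is assembled on the host: copy_supernode_diag() gathers the
// diagonal of the Cholesky factor into diag_dev, the NR entries are copied back, and
// log_sum() then accumulates the log of these entries into logDet.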
double flop_count = 0;
NR = Bmax[0]-Bmin[0];
init_supernode(blockR_dev, IB, magma_cudaStream_1);
init_supernode(blockDense_dev, matrix_nt, magma_cudaStream_1);
std::cout << "calling first Cholesky factorization now" << std::endl;
std::cout << "IB = " << IB << std::endl;
double t_fact = get_time(0.0);
nvtxRangeId_t id_dpotrf = nvtxRangeStartA("firstPotrf");
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
cudaDeviceSynchronize();
nvtxRangeEnd(id_dpotrf);
flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
gpuErrchk(cudaEventRecord(potrf_dev_ev, magma_cudaStream_1));
gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, potrf_dev_ev, 0));
t_temp = get_time(0.0);
ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[NR], mf_block_lda(1, 0), magma_queue_1);
t_temp = get_time(t_temp);
flop_count += NR * NR * NR;
ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(0, 0), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
flop_count += matrix_nd * NR * NR;
cudaDeviceSynchronize();
copy_supernode_diag(blockR_dev, IB);
std::cout << "entering block factorization loop now" << std::endl;
for (IB = 1; IB < matrix_nt-1; IB++)
std::cout << "IB = " << IB << std::endl;
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
cudaDeviceSynchronize();
swap_pointers(&blockR_dev, &blockM_dev);
cudaDeviceSynchronize();
init_supernode(blockR_dev, IB, magma_cudaStream_1);
gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
nvtxRangeId_t id_dgemm = nvtxRangeStartA("BigGemm_inLoop");
tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
nvtxRangeEnd(id_dgemm);
flop_count += 2.0 * NR * NR * NM;
tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
flop_count += 2.0*matrix_nd * NR * NM;
tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
flop_count += 2.0 * matrix_nd * matrix_nd * NM;
nvtxRangeId_t id_dpotrf = nvtxRangeStartA("Potrf_inLoopTest");
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
printf("Different block size! Re-initialize buffer size. NR = %ld, NM = %ld\n", NR, NM);
magma_potrf_init_flag = 0;
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
cudaDeviceSynchronize();
nvtxRangeEnd(id_dpotrf);
flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
magma_event_record(potrf_dev_magma_ev, magma_queue_1);
magma_queue_wait_event(magma_queue_2, potrf_dev_magma_ev);
nvtxRangeId_t id_ttrsm = nvtxRangeStartA("ttrsm_inLoop");
ttrsm_dev('R', 'L', 'T', 'N', NR, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[NR], mf_block_lda(IB+1, IB), magma_queue_1);
ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
nvtxRangeEnd(id_ttrsm);
flop_count += (NR + matrix_nd) * NR * NR;
cudaDeviceSynchronize();
copy_supernode_diag(blockR_dev, IB);
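// In this noCopyHost variant only the diagonal of the factor is harvested
// (copy_supernode_diag); the supernodes themselves stay on the device, since this
// routine only has to produce the log-determinant.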
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
swap_pointers(&blockR_dev, &blockM_dev);
init_supernode(blockR_dev, IB, magma_cudaStream_1);
gpuErrchk(cudaEventRecord(initBlock_dev_ev, magma_cudaStream_1));
gpuErrchk(cudaStreamWaitEvent(magma_cudaStream_2, initBlock_dev_ev, 0));
tgemm_dev('N', 'T', NR, NR, NM, -ONE, &blockM_dev[NM], mf_block_lda(IB, IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
flop_count += 2.0* NR * NR * NM;
tgemm_dev('N', 'T', matrix_nd, NR, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[NM], mf_block_lda(IB, IB-1), ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_2);
flop_count += 2.0*matrix_nd * NR * NM;
tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockM_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_2);
flop_count += 2.0* matrix_nd * matrix_nd * NM;
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockR_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockR_dev, mf_block_lda(IB, IB), &info);
cudaDeviceSynchronize();
flop_count += 1.0/3.0 * NR * NR * NR + 0.5*NR*NR + 1.0/6.0 * NR;
ttrsm_dev('R', 'L', 'T', 'N', matrix_nd, NR, ONE, blockR_dev, mf_block_lda(IB, IB), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_1);
flop_count += matrix_nd * NR * NR;
copy_supernode_diag(blockR_dev, IB);
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
tgemm_dev('N', 'T', matrix_nd, matrix_nd, NM, -ONE, &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), &blockR_dev[mf_dense_block_offset(IB-1)], mf_dense_block_lda(IB-1), ONE, blockDense_dev, matrix_nd, magma_queue_1);
flop_count += 2.0* matrix_nd * matrix_nd * NM;
cuda_buffer_flag_potrf[0] = 0;
tpotrf_dev_cuda('L', NR, blockDense_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_potrf, handle, params, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
#elif defined(MAGMA_EXPERT)
magma_potrf_init_flag = 0;
magma_tpotrf_expert_wrapper('L', NR, blockDense_dev, mf_block_lda(IB, IB), magma_info, mode, subN, subSubN, host_work, &lwork_host, device_work, &lwork_device, magma_events, magma_queues, magma_potrf_init_flag);
cudaDeviceSynchronize();
tpotrf_dev('L', NR, blockDense_dev, mf_block_lda(IB, IB), &info);
cudaDeviceSynchronize();
flop_count += 1.0/3.0 * matrix_nd * matrix_nd * matrix_nd + 0.5*matrix_nd*matrix_nd + 1.0/6.0 * matrix_nd;
std::cout << "after complete Cholesky factorization" << std::endl;
copy_supernode_diag(blockDense_dev, IB);
t_fact = get_time(t_fact);
double gflops = flop_count / (1e9*t_fact);
double t_copy = -omp_get_wtime();
diag = new T[matrix_size];
cudaMemcpy(diag, diag_dev, matrix_size*sizeof(T), cudaMemcpyDeviceToHost);
log_sum(diag, matrix_size, &logDet);
t_copy += omp_get_wtime();
std::cout << "Log Det Copy & Compute time : " << t_copy << std::endl;
NR = Bmax[0]-Bmin[0];
double flop_count = 0;
std::cout << "calling first block solver in second stage factor now" << std::endl;
nvtxRangeId_t id_solve = nvtxRangeStartA("ForwardPassSolve");
double t_solve = get_time(0.0);
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(0, 0)], mf_block_lda(0, 0), &rhs[Bmin[0]], matrix_size);
flop_count += 2.0 * NR * NR * nrhs;
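// Host-side forward pass: solve with the first diagonal factor, then for each time
// block substitute the already-solved part through the sub-diagonal factor block before
// solving with the next diagonal factor; the dense fixed-effects rows are handled after
// the time loop.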
for (IB = 1; IB < matrix_nt; IB++)
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs[Bmin[IB-1]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
flop_count += 2.0 * NR * nrhs * NM;
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * NR * nrhs;
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
for (size_t i = 0; i < NBlock-1; i++)
c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[i]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * nrhs * NM;
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * NR * nrhs;
double t_solve = get_time(0.0);
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
for (size_t i = 0; i < NBlock-1; i++)
c_tgemm('T', 'N', NM, nrhs, NR, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[IB]], matrix_size, ONE, &rhs[Bmin[i]], matrix_size);
flop_count += 2.0*NM * nrhs * NR;
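// Backward pass: start from the dense fixed-effects block (solve with the transposed
// factor and push the correction into every time block through the arrow columns),
// then walk the time blocks from last to first, each time solving with the transposed
// diagonal factor and updating the previous block through the sub-diagonal block.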
for (IB = matrix_nt-1; IB > 0; IB--)
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
c_tgemm('T', 'N', NM, nrhs, NR, -ONE, &MF[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs[Bmin[IB]], matrix_size, ONE, &rhs[Bmin[IB-1]], matrix_size);
flop_count += 2.0 * NM * nrhs * NR;
NR = Bmax[IB]-Bmin[IB];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
t_solve = get_time(t_solve);
double gflops = flop_count / (1e9*t_solve);
std::cout << "after forward-backward solve in 2nd stage factor" << std::endl;
t_secondStageForwardPass = get_time(0.0);
gflops += ForwardPassSolve(nrhs);
t_secondStageForwardPass = get_time(t_secondStageForwardPass);
t_secondStageBackwardPass = get_time(0.0);
gflops += BackwardPassSolve(nrhs);
t_secondStageBackwardPass = get_time(t_secondStageBackwardPass);
NR = Bmax[0]-Bmin[0];
double flop_count = 0;
std::cout << "calling first block solver in second stage factor now" << std::endl;
nvtxRangeId_t id_solve = nvtxRangeStartA("solve");
double t_solve = get_time(0.0);
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(0, 0)], mf_block_lda(0, 0), &rhs[Bmin[0]], matrix_size);
flop_count += 2.0 * NR * NR * nrhs;
for (IB = 1; IB < matrix_nt; IB++)
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs[Bmin[IB-1]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
flop_count += 2.0 * NR * nrhs * NM;
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * NR * nrhs;
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
for (size_t i = 0; i < NBlock-1; i++)
c_tgemm('N', 'N', NR, nrhs, NM, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[i]], matrix_size, ONE, &rhs[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * nrhs * NM;
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * NR * nrhs;
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
for (size_t i = 0; i < NBlock-1; i++)
c_tgemm('T', 'N', NM, nrhs, NR, -ONE, &MF[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs[Bmin[IB]], matrix_size, ONE, &rhs[Bmin[i]], matrix_size);
flop_count += 2.0*NM * nrhs * NR;
for (IB = matrix_nt-1; IB > 0; IB--)
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
c_tgemm('T', 'N', NM, nrhs, NR, -ONE, &MF[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs[Bmin[IB]], matrix_size, ONE, &rhs[Bmin[IB-1]], matrix_size);
flop_count += 2.0 * NM * nrhs * NR;
NR = Bmax[IB]-Bmin[IB];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE, &MF[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
t_solve = get_time(t_solve);
nvtxRangeEnd(id_solve);
double gflops = flop_count / (1e9*t_solve);
std::cout << "after forward-backward solve in 2nd stage factor" << std::endl;
printf("in SecondStageSolve_d()\n");
double ONE_d = f_one();
double* MF_d = new double[matrix_n_nonzeros];
double t_MF_conv = get_time(0.0);
for (size_t i = 0; i < matrix_n_nonzeros; i++) {
MF_d[i] = (double) MF[i];
t_MF_conv = get_time(t_MF_conv);
printf("time spent on conversion of MF to double precision: %f\n", t_MF_conv);
NR = Bmax[0]-Bmin[0];
double flop_count = 0;
std::cout << "calling first block solver in second stage factor now" << std::endl;
nvtxRangeId_t id_solve = nvtxRangeStartA("solve");
double t_solve = get_time(0.0);
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(0, 0)], mf_block_lda(0, 0), &rhs_d[Bmin[0]], matrix_size);
flop_count += 2.0 * NR * NR * nrhs;
for (IB = 1; IB < matrix_nt; IB++)
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_tgemm('N', 'N', NR, nrhs, NM, -ONE_d, &MF_d[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs_d[Bmin[IB-1]], matrix_size, ONE_d, &rhs_d[Bmin[IB]], matrix_size);
flop_count += 2.0 * NR * nrhs * NM;
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_d[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * NR * nrhs;
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
for (size_t i = 0; i < NBlock-1; i++)
c_tgemm('N', 'N', NR, nrhs, NM, -ONE_d, &MF_d[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs_d[Bmin[i]], matrix_size, ONE_d, &rhs_d[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * nrhs * NM;
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_d[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * NR * nrhs;
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_d[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
for (size_t i = 0; i < NBlock-1; i++)
c_tgemm('T', 'N', NM, nrhs, NR, -ONE_d, &MF_d[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs_d[Bmin[IB]], matrix_size, ONE_d, &rhs_d[Bmin[i]], matrix_size);
flop_count += 2.0*NM * nrhs * NR;
for (IB = matrix_nt-1; IB > 0; IB--)
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_d[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
c_tgemm('T', 'N', NM, nrhs, NR, -ONE_d, &MF_d[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs_d[Bmin[IB]], matrix_size, ONE_d, &rhs_d[Bmin[IB-1]], matrix_size);
flop_count += 2.0 * NM * nrhs * NR;
NR = Bmax[IB]-Bmin[IB];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_d, &MF_d[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_d[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
t_solve = get_time(t_solve);
nvtxRangeEnd(id_solve);
double gflops = flop_count / (1e9*t_solve);
std::cout << "after forward-backward solve in 2nd stage factor" << std::endl;
printf("in SecondStageSolve_s()\n");
float ONE_s = f_one();
float* MF_s = new float[matrix_n_nonzeros];
double t_MF_conv = get_time(0.0);
for (size_t i = 0; i < matrix_n_nonzeros; i++) {
MF_s[i] = (float) MF[i];
t_MF_conv = get_time(t_MF_conv);
printf("time spent on conversion of MF to single precision: %f\n", t_MF_conv);
NR = Bmax[0]-Bmin[0];
double flop_count = 0;
std::cout << "calling first block solver in second stage factor now" << std::endl;
nvtxRangeId_t id_solve = nvtxRangeStartA("solve");
double t_solve = get_time(0.0);
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(0, 0)], mf_block_lda(0, 0), &rhs_s[Bmin[0]], matrix_size);
flop_count += 2.0 * NR * NR * nrhs;
for (IB = 1; IB < matrix_nt; IB++)
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_tgemm('N', 'N', NR, nrhs, NM, -ONE_s, &MF_s[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs_s[Bmin[IB-1]], matrix_size, ONE_s, &rhs_s[Bmin[IB]], matrix_size);
flop_count += 2.0 * NR * nrhs * NM;
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_s[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * NR * nrhs;
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
for (size_t i = 0; i < NBlock-1; i++)
c_tgemm('N', 'N', NR, nrhs, NM, -ONE_s, &MF_s[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs_s[Bmin[i]], matrix_size, ONE_s, &rhs_s[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * nrhs * NM;
c_ttrsm('L', 'L', 'N', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_s[Bmin[IB]], matrix_size);
flop_count += 2.0*NR * NR * nrhs;
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_s[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
for (size_t i = 0; i < NBlock-1; i++)
c_tgemm('T', 'N', NM, nrhs, NR, -ONE_s, &MF_s[mf_dense_block_index(i)], mf_dense_block_lda(i), &rhs_s[Bmin[IB]], matrix_size, ONE_s, &rhs_s[Bmin[i]], matrix_size);
flop_count += 2.0*NM * nrhs * NR;
for (IB = matrix_nt-1; IB > 0; IB--)
NR = Bmax[IB]-Bmin[IB];
NM = Bmax[IB-1]-Bmin[IB-1];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_s[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
c_tgemm('T', 'N', NM, nrhs, NR, -ONE_s, &MF_s[mf_block_index(IB, IB-1)], mf_block_lda(IB, IB-1), &rhs_s[Bmin[IB]], matrix_size, ONE_s, &rhs_s[Bmin[IB-1]], matrix_size);
flop_count += 2.0 * NM * nrhs * NR;
NR = Bmax[IB]-Bmin[IB];
c_ttrsm('L', 'L', 'T', 'N', NR, nrhs, ONE_s, &MF_s[mf_block_index(IB, IB)], mf_block_lda(IB, IB), &rhs_s[Bmin[IB]], matrix_size);
flop_count += NR * NR * nrhs;
t_solve = get_time(t_solve);
nvtxRangeEnd(id_solve);
double gflops = flop_count / (1e9*t_solve);
std::cout << "after forward-backward solve in 2nd stage factor" << std::endl;
mem_alloc_dev = allocate_data_on_device((void**)&eye_dev, matrix_ns*matrix_ns*sizeof(T));
init_eye_on_dev(eye_dev, matrix_ns, 0);
allocate_data_on_device((void**)&G_LastDense_dev, ND*ND*sizeof(T));
allocate_data_on_device((void**)&G_dense_i_dev, ND*matrix_ns*sizeof(T));
allocate_data_on_device((void**)&tmp3_dev, ND*matrix_ns*sizeof(T));
mem_alloc_dev = allocate_data_on_device((void**)&tmp4_dev, matrix_ns*matrix_ns*sizeof(T));
printf("in 3rd Stage Factor. Allocated GPU memory sel Inv: %ld, cpy ind %d\n", mem_alloc_dev, cpy_indicator);
cudaMallocHost(&tmp3, ND*matrix_ns*sizeof(T));
cudaMallocHost(&tmp4, matrix_ns*matrix_ns*sizeof(T));
double flop_count = 0;
nvtxRangeId_t id_inv = nvtxRangeStartA("Inversion");
double t_inv = get_time(0.0);
NR = Bmax[IB]-Bmin[IB];
tlacpy_dev('L', NR, NR, blockDense_dev, mf_block_lda(IB, IB), G_LastDense_dev, NR, magma_queue_1);
cudaDeviceSynchronize();
tril_dev(G_LastDense_dev, NR, NR);
cuda_buffer_flag_trtri[0] = 0;
ttrtri_dev_cuda('L', 'N', NR, G_LastDense_dev, NR, info_cuda, cuda_buffer_flag_trtri, handle, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
cudaDeviceSynchronize();
ttrtri_dev('L', 'N', NR, G_LastDense_dev, NR, &info);
flop_count += 1.0/3.0 * NR * NR * NR + 2.0/3.0 * NR;
cudaDeviceSynchronize();
tgemm_dev('T', 'N', NR, NR, NR, ONE, G_LastDense_dev, NR, G_LastDense_dev, NR, ZERO, blockDense_dev, mf_block_lda(IB, IB), magma_queue_1);
flop_count += 2.0 * NR * NR * NR;
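// Selected inversion of the last (fixed-effects) block: copy its Cholesky factor, zero
// the strict upper triangle (tril_dev), invert the triangle in place with TRTRI, and
// form G = L^{-T} L^{-1} with a single GEMM, overwriting blockDense_dev with the
// corresponding block of the inverse.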
if (cpy_indicator == 2) {
cudaDeviceSynchronize();
ind_invBlks_fi = invblks_diag_block_index(IB);
cudaMemcpy(&invBlks[ind_invBlks_fi], blockDense_dev, matrix_nd*matrix_nd*sizeof(T), cudaMemcpyDeviceToHost);
printf("first index final small diagonal block: %ld\n", ind_invBlks_fi);
printf("final small block : \n");
for (int i = 0; i < ND*ND; i++) {
printf("%f ", invBlks[ind_invBlks_fi+i]);
NR = Bmax[IB]-Bmin[IB];
NP = Bmax[IB+1]-Bmin[IB+1];
tril_dev(blockR_dev, mf_block_lda(IB, IB), NR);
cuda_buffer_flag_trtri[0] = 0;
ttrtri_dev_cuda('L', 'N', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_trtri, handle, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
ttrtri_dev('L', 'N', NR, blockR_dev, mf_block_lda(IB, IB), &info);
flop_count += 1.0/3.0 * NR * NR * NR + 2.0/3.0 * NR;
cudaDeviceSynchronize();
init_eye_on_dev(tmp1_dev, NR, magma_cudaStream_1);
tgemm_dev('T', 'N', NR, NP, NP, ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), blockDense_dev, mf_block_lda(IB+1, IB+1), ZERO, tmp2_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NP * NP;
tgemm_dev('N', 'N', NR, NR, NP, ONE, tmp2_dev, NR, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), ONE, tmp1_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NR * NP;
tgemm_dev('T', 'N', NR, NR, NR, ONE, blockR_dev, mf_block_lda(IB, IB), tmp1_dev, NR, ZERO, tmp2_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NR * NR;
tgemm_dev('N', 'N', NR, NR, NR, ONE, tmp2_dev, NR, blockR_dev, mf_block_lda(IB, IB), ZERO, tmp1_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NR * NR;
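// For the last time block this realizes the arrowhead recursion
// G_II = L_II^{-T} (I + L_F^T G_dense L_F) L_II^{-1},
// where L_F is the dense coupling block of the factor; tmp1_dev starts as the identity
// and ends up holding G_II. (Formula inferred from the GEMM sequence above.)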
if (cpy_indicator == 2) {
cudaDeviceSynchronize();
ind_invBlks_fi = invblks_diag_block_index(IB);
cudaMemcpy(&invBlks[ind_invBlks_fi], tmp1_dev, matrix_ns*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);
printf("first index last ns x ns diagonal block: %ld\n", ind_invBlks_fi);
printf("last ns x ns block : \n");
for (int i = 0; i < NR*NR; i++) {
printf("%f ", invBlks[ind_invBlks_fi+i]);
tgemm_dev('N', 'N', NP, NR, NP, ONE, blockDense_dev, mf_block_lda(IB+1, IB+1), &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), ZERO, tmp3_dev, NP, magma_queue_1);
flop_count += 2.0 * NP * NR * NP;
tgemm_dev('N', 'N', NP, NR, NR, -ONE, tmp3_dev, NP, blockR_dev, mf_block_lda(IB, IB), ZERO, G_dense_i_dev, NP, magma_queue_1);
flop_count += 2.0 * NP * NR * NR;
if (matrix_ns*ND < 200) {
cudaMemcpy(tmp3, G_dense_i_dev, matrix_ns*ND*sizeof(T), cudaMemcpyDeviceToHost);
printf("Sigma_n+1n : \n");
for (int i = 0; i < matrix_ns*ND; i++) {
printf("%f ", tmp3[i]);
cudaDeviceSynchronize();
if (cpy_indicator == 0) {
copy_supernode_diag(blockDense_dev, NBlock-1);
} else if (cpy_indicator == 1) {
extract_nnzA(blockDense_dev, NBlock-1);
} else if (cpy_indicator == 2) {
ind_invBlks_fi = invblks_dense_block_index(IB);
cudaMemcpy(&invBlks[ind_invBlks_fi], G_dense_i_dev, matrix_nd*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);
if (matrix_ns*ND < 200) {
printf("first index last nd x ns dense block: %ld\n", ind_invBlks_fi);
printf("last nd x ns block : \n");
for (int i = 0; i < matrix_nd*matrix_ns; i++) {
printf("%f ", invBlks[ind_invBlks_fi+i]);
cudaDeviceSynchronize();
tlacpy_dev('F', NR, NR, tmp1_dev, NR, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
if (matrix_ns*matrix_ns < 200) {
cudaMemcpy(tmp4, tmp1_dev, matrix_ns*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);
printf("Sigma_nn : \n");
for (int i = 0; i < matrix_ns*matrix_ns; i++) {
printf("%f ", tmp4[i]);
NR = Bmax[IB]-Bmin[IB];
tlacpy_dev('L', NR, NR, blockR_dev, mf_block_lda(IB, IB), tmp1_dev, NR, magma_queue_1);
cudaDeviceSynchronize();
tril_dev(tmp1_dev, NR, NR);
cuda_buffer_flag_trtri[0] = 0;
ttrtri_dev_cuda('L', 'N', NR, tmp1_dev, NR, info_cuda, cuda_buffer_flag_trtri, handle, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);
cudaDeviceSynchronize();
ttrtri_dev('L', 'N', NR, tmp1_dev, NR, &info);
flop_count += 1.0/3.0 * NR * NR * NR + 2.0/3.0*NR;
tgemm_dev('T', 'N', NR, NR, NR, ONE, tmp1_dev, NR, tmp1_dev, NR, ZERO, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
flop_count += 2.0 * NR * NR * NR;
if (cpy_indicator == 2) {
cudaDeviceSynchronize();
ind_invBlks_fi = invblks_diag_block_index(IB);
cudaMemcpy(&invBlks[ind_invBlks_fi], tmp1_dev, matrix_ns*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);
printf("first index last ns x ns diagonal block case with no fixed effects : %ld\n", ind_invBlks_fi);
printf("last ns x ns block : \n");
for (int i = 0; i < NR*NR; i++) {
printf("%f ", invBlks[ind_invBlks_fi+i]);
cudaDeviceSynchronize();
copy_supernode_to_device(blockM_dev, matrix_nt-2, copyStream);

for (int IBi = matrix_nt-2; IBi > -1; IBi--)

NR = Bmax[IB]-Bmin[IB];
NP = Bmax[IB+1]-Bmin[IB+1];

cudaDeviceSynchronize();
swap_pointers(&blockR_dev, &blockM_dev);
cudaDeviceSynchronize();
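// blockR_dev and blockM_dev act as a double buffer: the supernode for the next
// iteration is staged into one buffer on the dedicated copy stream while the other
// buffer is being processed, and swap_pointers() exchanges their roles between
// iterations of the backward sweep.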
if(cpy_indicator == 0){
    copy_supernode_diag(blockM_dev, IB+1);
}
else if(cpy_indicator == 1){
    tlacpy_dev('F', ND, NR, G_dense_i_dev, ND, &blockM_dev[mf_dense_block_offset(IB+1)], mf_dense_block_lda(IB+1), magma_queue_1);
    cudaDeviceSynchronize();
    extract_nnzA(blockM_dev, IB+1);
}

tril_dev(blockR_dev, mf_block_lda(IB, IB), NR);

nvtxRangeId_t id_trtri = nvtxRangeStartA("TriangularSolve_inLoop");

cuda_buffer_flag_trtri[0] = 0;
ttrtri_dev_cuda('L', 'N', NR, blockR_dev, mf_block_lda(IB, IB), info_cuda, cuda_buffer_flag_trtri,
                handle, magma_cudaStream_1, dev_size, host_size, mem_cuda_dev, mem_cuda_host);

ttrtri_dev('L', 'N', NR, blockR_dev, mf_block_lda(IB, IB), &info);

nvtxRangeEnd(id_trtri);
flop_count += 1.0/3.0 * NR * NR * NR + 2.0/3.0 * NR;

init_eye_on_dev(tmp1_dev, NR, magma_cudaStream_1);
tgemm_dev('T', 'N', NR, NP, NP, ONE, &blockR_dev[NR], mf_block_lda(IB+1, IB), blockM_dev, mf_block_lda(IB+1, IB+1), ZERO, tmp2_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NP * NP;

tgemm_dev('N', 'N', NR, NR, NP, ONE, tmp2_dev, NR, &blockR_dev[NR], mf_block_lda(IB+1, IB), ONE, tmp1_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NR * NP;

gpuErrchk(cudaEventRecord(potrf_dev_ev, magma_cudaStream_1));
gpuErrchk(cudaStreamWaitEvent(copyStream, potrf_dev_ev, 0));

nvtxRangeId_t id_initSN = nvtxRangeStartA("CpySNtoDevice");
copy_supernode_to_device(blockM_dev, IB-1, copyStream);
nvtxRangeEnd(id_initSN);
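// Recording potrf_dev_ev on the compute stream and making copyStream wait on it
// orders the prefetch after the GEMMs that still read blockM_dev; the supernode
// needed in the next iteration (IB-1) is then copied to the device concurrently
// with the dense-row GEMMs below, which only touch blockR_dev and blockDense_dev.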
nvtxRangeId_t id_denseGEMMS = nvtxRangeStartA("GEMMsDenseRows");
nvtxRangeId_t id_firstDenseGEMM = nvtxRangeStartA("FirstDenseGEMM");

tgemm_dev('T', 'N', NR, ND, ND, ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), blockDense_dev, mf_block_lda(NBlock-1, NBlock-1), ZERO, tmp3_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * ND * ND;

nvtxRangeEnd(id_firstDenseGEMM);

tgemm_dev('N', 'N', NR, NR, ND, ONE, tmp3_dev, NR, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), ONE, tmp1_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NR * ND;

tgemm_dev('T', 'N', NP, NR, ND, ONE, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), G_dense_i_dev, ND, ZERO, tmp4_dev, NR, magma_queue_1);
flop_count += 2.0 * NP * NR * ND;

cudaMemcpy(tmp4, tmp4_dev, NR*NR*sizeof(T), cudaMemcpyDeviceToHost);
printf("L_Fi^T*Gn+1i+1: \n");
for(int i=0; i<NR*NR; i++){
    printf("%f ", tmp4[i]);
}

tgemm_dev('N', 'N', NR, NP, NR, ONE, tmp4_dev, NR, &blockR_dev[NR], mf_block_lda(IB+1, IB), ZERO, tmp2_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NP * NR;

cudaMemcpy(tmp4, tmp4_dev, NR*NR*sizeof(T), cudaMemcpyDeviceToHost);
printf("L_Fi^T*Gn+1i+1*L_Ei:\n");
for(int i=0; i<NR*NR; i++){
    printf("%f ", tmp4[i]);
}
tlacpy_dev('F', NR, NR, tmp2_dev, NR, tmp4_dev, NR, magma_queue_1);

tgemm_dev('T', 'N', NR, NR, NR, ONE, tmp2_dev, NR, eye_dev, NR, ONE, tmp4_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NR * NR;

cudaMemcpy(tmp4, tmp4_dev, NR*NR*sizeof(T), cudaMemcpyDeviceToHost);
printf("L_Fi^T*Gn+1i+1*L_Ei + t(L_Fi^T*Gn+1i+1*L_Ei): \n");
for(int i=0; i<NR*NR; i++){
    printf("%f ", tmp4[i]);
}

tgemm_dev('N', 'N', NR, NR, NR, ONE, tmp4_dev, NR, eye_dev, NR, ONE, tmp1_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NR * NR;

cudaMemcpy(tmp4, tmp1_dev, NR*NR*sizeof(T), cudaMemcpyDeviceToHost);
printf("tmp4_dev: \n");
for(int i=0; i<NR*NR; i++){
    printf("%f ", tmp4[i]);
}
tgemm_dev('N', 'N', ND, NP, NR, ONE, G_dense_i_dev, ND, &blockR_dev[NR], mf_block_lda(IB+1, IB), ZERO, tmp3_dev, ND, magma_queue_1);
flop_count += 2.0 * ND * NP * NR;

cudaMemcpy(tmp3, tmp3_dev, NR*ND*sizeof(T), cudaMemcpyDeviceToHost);
printf("tmp3_dev: \n");
for(int i=0; i<ND*NR; i++){
    printf("%f ", tmp3[i]);
}

tgemm_dev('N', 'N', ND, NR, ND, ONE, blockDense_dev, ND, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), ONE, tmp3_dev, ND, magma_queue_1);
flop_count += 2.0 * ND * NR * ND;

cudaMemcpy(tmp3, tmp3_dev, NR*ND*sizeof(T), cudaMemcpyDeviceToHost);
printf("tmp3_dev: \n");
for(int i=0; i<ND*NR; i++){
    printf("%f ", tmp3[i]);
}

tgemm_dev('N', 'N', ND, NR, NR, -ONE, tmp3_dev, ND, blockR_dev, mf_block_lda(IB, IB), ZERO, G_dense_i_dev, ND, magma_queue_1);
flop_count += 2.0 * ND * NR * NR;

cudaMemcpy(tmp3, G_dense_i_dev, NR*ND*sizeof(T), cudaMemcpyDeviceToHost);
printf("G_dense_i_dev: \n");
for(int i=0; i<ND*NR; i++){
    printf("%f ", tmp3[i]);
}
if(cpy_indicator == 2){
    ind_invBlks_fi = invblks_dense_block_index(IB);
    cudaMemcpy(&invBlks[ind_invBlks_fi], G_dense_i_dev, matrix_nd*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);

    if(matrix_nd*matrix_ns < 200){
        printf("first index of i-th ns x nd dense block: %ld\n", ind_invBlks_fi);
        printf("i-th ns x nd dense block : \n");
        for(int i=0; i<matrix_nd*matrix_ns; i++){
            printf("%f ", invBlks[ind_invBlks_fi+i]);
        }
    }
}
nvtxRangeEnd(id_denseGEMMS);

tgemm_dev('T', 'N', NR, NR, NR, ONE, blockR_dev, mf_block_lda(IB, IB), tmp1_dev, NR, ZERO, tmp2_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NR * NR;

tgemm_dev('N', 'N', NR, NR, NR, ONE, tmp2_dev, NR, blockR_dev, mf_block_lda(IB, IB), ZERO, tmp1_dev, NR, magma_queue_1);
flop_count += 2.0 * NR * NR * NR;

cudaMemcpy(tmp4, tmp1_dev, NR*NR*sizeof(T), cudaMemcpyDeviceToHost);
for(int i=0; i<NR*NR; i++){
    printf("%f ", tmp4[i]);
}
if(cpy_indicator == 2){
    cudaDeviceSynchronize();

    ind_invBlks_fi = invblks_diag_block_index(IB);
    cudaMemcpy(&invBlks[ind_invBlks_fi], tmp1_dev, matrix_ns*matrix_ns*sizeof(T), cudaMemcpyDeviceToHost);

    printf("first index of i-th ns x ns diagonal block: %ld\n", ind_invBlks_fi);
    printf("i-th ns x ns block : \n");
    for(int i=0; i<NR*NR; i++){
        printf("%f ", invBlks[ind_invBlks_fi+i]);
    }
}
tlacpy_dev('F', NR, NR, tmp1_dev, NR, blockR_dev, mf_block_lda(IB, IB), magma_queue_1);
if(cpy_indicator == 2){
    printf("array containing all necessary inv blk entries: \n");
    for(int i=0; i<(matrix_ns+matrix_nd)*matrix_ns*matrix_nt+matrix_nd*matrix_nd; i++){
        printf("%f ", invBlks[i]);
    }
}
t_inv = get_time(t_inv);
double gflops = flop_count / (1e9*t_inv);
printf("GFLOPS sel. inversion : %f, time selInv : %f\n", gflops, t_inv);

nvtxRangeEnd(id_inv);

cudaDeviceSynchronize();

if(cpy_indicator == 0){
    copy_supernode_diag(blockR_dev, 0);
}

if(cpy_indicator == 1){
    tlacpy_dev('F', ND, NR, G_dense_i_dev, ND, &blockR_dev[mf_dense_block_offset(IB)], mf_dense_block_lda(IB), magma_queue_1);
    cudaDeviceSynchronize();
    extract_nnzA(blockR_dev, 0);
}

deallocate_data_on_dev(G_LastDense_dev, ND*ND*sizeof(T));
deallocate_data_on_dev(eye_dev, matrix_ns*matrix_ns*sizeof(T));
deallocate_data_on_dev(G_dense_i_dev, ND*matrix_ns*sizeof(T));
deallocate_data_on_dev(tmp3_dev, ND*matrix_ns*sizeof(T));
mem_alloc_dev = deallocate_data_on_dev(tmp4_dev, matrix_ns*matrix_ns*sizeof(T));

printf("Allocated GPU memory after sel Inv: %ld, cpyInd %d\n", mem_alloc_dev, cpy_indicator);
std::cout << "in initialize MF host. MF allocated : " << MF_allocated << std::endl;

cudaMallocHost(&MF, (matrix_n_nonzeros) * sizeof(T));
MF_allocated = true;
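// MF is allocated with cudaMallocHost, i.e. in page-locked (pinned) host memory;
// pinned memory is required for cudaMemcpyAsync transfers to actually overlap with
// kernel execution, which the supernode copies on the dedicated copy stream rely on.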
std::cout << "in initialize invBlks host" << std::endl;

if(!invBlks_allocated)
{
    double t_invBlk_alloc = get_time(0.0);

    size_t nnz_invBlks = (matrix_ns+matrix_nd)*matrix_ns*matrix_nt + matrix_nd*matrix_nd;
    cudaMallocHost(&invBlks, nnz_invBlks * sizeof(T));
    t_invBlk_alloc = get_time(t_invBlk_alloc);

    invBlks_allocated = true;

    std::cout << "in initialize invBlks host. nnz(invBlks) : " << nnz_invBlks << ", Allocation time : " << t_invBlk_alloc << std::endl;
}
std::cout << "entered factorize." << std::endl;

cudaGetDevice(&GPU_currRank);
if(GPU_currRank != GPU_rank){
    cudaSetDevice(GPU_rank);
    cudaGetDevice(&GPU_currRank);
}

double t_allocate_MF_host = get_time(0.0);
initialize_MF_host();

std::cout << "in factorize. MF_allocated : " << MF_allocated << std::endl;

cudaMallocHost(&MF, (matrix_n_nonzeros) * sizeof(T));
MF_allocated = true;
t_allocate_MF_host = get_time(t_allocate_MF_host);

double t_allocate_MF_dev = get_time(0.0);

if (!MF_dev_allocated)
{
    size_t max_supernode_nnz_dense = matrix_nt > 1 ? matrix_ns*(2*matrix_ns+matrix_nd) : matrix_ns*(matrix_ns+matrix_nd);
    size_t final_supernode_nnz_dense = matrix_nd > 0 ? matrix_nd*matrix_nd : 0;

    allocate_data_on_device((void**)&blockR_dev, max_supernode_nnz_dense*sizeof(T));
    allocate_data_on_device((void**)&blockM_dev, max_supernode_nnz_dense*sizeof(T));
    allocate_data_on_device((void**)&blockDense_dev, final_supernode_nnz_dense*sizeof(T));

    MF_dev_allocated = true;
}

t_allocate_MF_dev = get_time(t_allocate_MF_dev);
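// Each dense supernode buffer holds ns columns of height 2*ns+nd (ns x ns diagonal
// block, ns x ns sub-diagonal block when nt > 1, and the nd x ns arrowhead rows), so
// it takes ns*(2*ns+nd) entries. For example (hypothetical sizes), ns = 100, nd = 4
// gives 100*(200+4) = 20400 entries per device buffer.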
size_t nnz = matrix_ia[matrix_size];

size_t max_rows = matrix_nt > 1 ? 2*matrix_ns+matrix_nd : matrix_ns+matrix_nd;
size_t max_cols = max(matrix_ns, matrix_nd);

allocate_data_on_device((void**)&ia_dev, (max_cols+1)*sizeof(size_t));
allocate_data_on_device((void**)&ja_dev, max_rows*max_cols*sizeof(size_t));
mem_alloc_dev = allocate_data_on_device((void**)&a_dev, max_rows*max_cols*sizeof(T));

printf("Allocated GPU memory Factorize: %ld\n", mem_alloc_dev);

t_firstStageFactor = get_time(0.0);
double gflops = FirstStageFactor();
t_firstStageFactor = get_time(t_firstStageFactor);
deallocate_data_on_dev(ia_dev, (max_cols+1)*sizeof(size_t));
deallocate_data_on_dev(ja_dev, max_rows*max_cols*sizeof(size_t));
mem_alloc_dev = deallocate_data_on_dev(a_dev, max_rows*max_cols*sizeof(T));

printf("Allocated GPU memory after Factorize: %ld\n", mem_alloc_dev);

factorization_completed = true;
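// A minimal usage sketch for the routines in this file (variable names and the
// constructor arguments shown here are illustrative; only the factorize() and
// solve() signatures are taken from the calls further below):
//
//   BTA<double> solver(ns, nt, nd, GPU_rank);            // hypothetical constructor call
//   double t_fact = 0.0, t_fwd = 0.0, t_bwd = 0.0;
//   solver.factorize(ia, ja, a, t_fact);                  // Cholesky factor is cached
//   solver.solve(ia, ja, a, x, b, 1, t_fwd, t_bwd);       // forward/backward substitution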
printf("In factorize no copy host function.\n");

cudaGetDevice(&GPU_currRank);
if(GPU_currRank != GPU_rank){
    cudaSetDevice(GPU_rank);
    cudaGetDevice(&GPU_currRank);
}

if (!MF_dev_allocated)
{
    size_t max_supernode_nnz_dense = matrix_nt > 1 ? matrix_ns*(2*matrix_ns+matrix_nd) : matrix_ns*(matrix_ns+matrix_nd);
    size_t final_supernode_nnz_dense = matrix_nd > 0 ? matrix_nd*matrix_nd : 0;

    allocate_data_on_device((void**)&blockR_dev, max_supernode_nnz_dense*sizeof(T));
    allocate_data_on_device((void**)&blockM_dev, max_supernode_nnz_dense*sizeof(T));
    allocate_data_on_device((void**)&blockDense_dev, final_supernode_nnz_dense*sizeof(T));

    MF_dev_allocated = true;
}
size_t nnz = matrix_ia[matrix_size];
size_t max_rows = matrix_nt > 1 ? 2*matrix_ns+matrix_nd : matrix_ns+matrix_nd;
size_t max_cols = max(matrix_ns, matrix_nd);

allocate_data_on_device((void**)&ia_dev, (max_cols+1)*sizeof(size_t));
allocate_data_on_device((void**)&ja_dev, max_rows*max_cols*sizeof(size_t));
allocate_data_on_device((void**)&a_dev, max_rows*max_cols*sizeof(T));

allocate_data_on_device((void**)&diag_dev, matrix_size*sizeof(T));
mem_alloc_dev = allocate_data_on_device((void**)&diag_pos_dev, matrix_size*sizeof(size_t));

printf("Allocated GPU memory Factorize noCpyHost: %ld\n", mem_alloc_dev);
memcpy_to_device(diag_pos, diag_pos_dev, matrix_size*sizeof(size_t), NULL);

double gflops = FirstStageFactor_noCopyHost(logDet);

deallocate_data_on_dev(diag_dev, matrix_size*sizeof(T));
deallocate_data_on_dev(diag_pos_dev, matrix_size*sizeof(size_t));

deallocate_data_on_dev(ia_dev, (max_cols+1)*sizeof(size_t));
deallocate_data_on_dev(ja_dev, max_rows*max_cols*sizeof(size_t));
mem_alloc_dev = deallocate_data_on_dev(a_dev, max_rows*max_cols*sizeof(T));

printf("Allocated GPU memory after Factorize noCpyHost: %ld\n", mem_alloc_dev);

factorization_completed = false;
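// factorization_completed stays false here: the no-copy-host variant does not write
// the factor back into MF on the host, so later calls that rely on the host copy
// (solve, selected inversion) will trigger a fresh factorization.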
double BTA<T>::factorizeSolve(size_t *ia, size_t *ja, T *a, T *x, T *b, size_t nrhs,
                              double& t_firstSecondStage, double& t_SecondStageBackPass)
{
std::cout << "entered factorize." << std::endl;
cudaGetDevice(&GPU_currRank);
if(GPU_currRank != GPU_rank){
    cudaSetDevice(GPU_rank);
    cudaGetDevice(&GPU_currRank);
}

double t_allocate_MF_host = get_time(0.0);
initialize_MF_host();

std::cout << "in factorize. MF_allocated : " << MF_allocated << std::endl;
t_allocate_MF_host = get_time(t_allocate_MF_host);
double t_allocate_MF_dev = get_time(0.0);

if (!MF_dev_allocated){
    size_t max_supernode_nnz_dense = matrix_nt > 1 ? matrix_ns*(2*matrix_ns+matrix_nd) : matrix_ns*(matrix_ns+matrix_nd);
    size_t final_supernode_nnz_dense = matrix_nd > 0 ? matrix_nd*matrix_nd : 0;

    allocate_data_on_device((void**)&blockR_dev, max_supernode_nnz_dense*sizeof(T));
    allocate_data_on_device((void**)&blockM_dev, max_supernode_nnz_dense*sizeof(T));
    allocate_data_on_device((void**)&blockDense_dev, final_supernode_nnz_dense*sizeof(T));

    MF_dev_allocated = true;
}

t_allocate_MF_dev = get_time(t_allocate_MF_dev);
size_t nnz = matrix_ia[matrix_size];

size_t max_rows = matrix_nt > 1 ? 2*matrix_ns+matrix_nd : matrix_ns+matrix_nd;
size_t max_cols = max(matrix_ns, matrix_nd);

allocate_data_on_device((void**)&ia_dev, (max_cols+1)*sizeof(size_t));
allocate_data_on_device((void**)&ja_dev, max_rows*max_cols*sizeof(size_t));
mem_alloc_dev = allocate_data_on_device((void**)&a_dev, max_rows*max_cols*sizeof(T));

rhs = new T[matrix_size*nrhs];
memcpy(rhs, b, matrix_size*nrhs*sizeof(T));

allocate_data_on_device((void**)&rhs_dev, matrix_size*sizeof(T));
memcpy_to_device(b, rhs_dev, matrix_size*sizeof(T), NULL);

printf("Allocated GPU memory FactorizeSolve(): %ld\n", mem_alloc_dev);
t_firstSecondStage = get_time(0.0);
double gflops = FirstSecondStageFactor(nrhs);
t_firstSecondStage = get_time(t_firstSecondStage);
printf("time firstSecondStageFactor : %f\n", t_firstSecondStage);
struct tm * timeinfo;
timeinfo = localtime(&rawtime);
strftime(buffer, sizeof(buffer), "L_%d-%m-%Y_%H:%M:%S.txt", timeinfo);
std::string fileName(buffer);
ofstream file(fileName, ios::out | ios::trunc);

for(int i=0; i<matrix_n_nonzeros; i++){
    file << std::setprecision(15) << MF[i] << endl;
}

memcpy_to_host(rhs, rhs_dev, matrix_size*sizeof(T), NULL);
t_SecondStageBackPass = get_time(0.0);
gflops += BackwardPassSolve(nrhs);
t_SecondStageBackPass = get_time(t_SecondStageBackPass);
printf("time backwardPassSolve : %f\n", t_SecondStageBackPass);

memcpy(x, rhs, matrix_size*nrhs*sizeof(T));
deallocate_data_on_dev(ia_dev, (max_cols+1)*sizeof(size_t));
deallocate_data_on_dev(ja_dev, max_rows*max_cols*sizeof(size_t));
mem_alloc_dev = deallocate_data_on_dev(a_dev, max_rows*max_cols*sizeof(T));

deallocate_data_on_dev(rhs_dev, matrix_size*sizeof(T));

printf("Allocated GPU memory after Factorize: %ld\n", mem_alloc_dev);

factorization_completed = true;
double BTA<T>::solve(size_t *ia, size_t *ja, T *a, T *b, size_t nrhs,
                     double& t_secondStageForwardPass, double& t_secondStageBackwardPass)
{
double gflops = solve(b, b, nrhs, t_secondStageForwardPass, t_secondStageBackwardPass);
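// This overload delegates to the variant that takes separate solution and right-hand-side
// buffers; since b is passed for both arguments, the solve is effectively in place and
// b is overwritten with the solution.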
double BTA<T>::solve(size_t *ia, size_t *ja, T *a, T *x, T *b, size_t nrhs,
                     double& t_secondStageForwardPass, double& t_secondStageBackwardPass)
{
if (!factorization_completed){
    factorize(ia, ja, a, t_FSF);
}

if(!(ia[matrix_size] == matrix_ia[matrix_size])){
    std::cout << "Matrices don't match!!" << std::endl;
}

rhs = new T[matrix_size*nrhs];
memcpy(rhs, b, matrix_size*nrhs*sizeof(T));

double gflops = SecondStageSolve(nrhs, t_secondStageForwardPass, t_secondStageBackwardPass);

memcpy(x, rhs, matrix_size*nrhs*sizeof(T));
if (!factorization_completed){
    factorize(ia, ja, a, t_FSF);
}

if(!(ia[matrix_size] == matrix_ia[matrix_size])){
    std::cout << "Matrices don't match!!" << std::endl;
}

printf("T already double precision. Doesn't make sense to call solve_d()! Cholesky factor already in double precision!\n");

double* rhs_d = new double[matrix_size*nrhs];

for(int i=0; i<matrix_size*nrhs; i++){
    rhs_d[i] = (double) b[i];
}

double gflops = SecondStageSolve_d(nrhs, rhs_d);

for(int i=0; i<matrix_size*nrhs; i++){
    x[i] = (float) rhs_d[i];
}
double BTA<T>::solve_s(size_t *ia, size_t *ja, double* a, double* x, double* b, size_t nrhs)
{
if (!factorization_completed){
    factorize(ia, ja, a, t_FSF);
}

if(!(ia[matrix_size] == matrix_ia[matrix_size])){
    std::cout << "Matrices don't match!!" << std::endl;
}

printf("T already single precision. Doesn't make sense to call solve_s()! Cholesky factor already in single precision!\n");

float* rhs_s = new float[matrix_size*nrhs];

for(int i=0; i<matrix_size*nrhs; i++){
    rhs_s[i] = (float) b[i];
}

double gflops = SecondStageSolve_s(nrhs, rhs_s);

for(int i=0; i<matrix_size*nrhs; i++){
    x[i] = (double) rhs_s[i];
}
cudaGetDevice(&GPU_currRank);
if(GPU_currRank != GPU_rank){
    cudaSetDevice(GPU_rank);
    cudaGetDevice(&GPU_currRank);
}

if (!factorization_completed)
    factorize(ia, ja, a, t_FSF);

if(!(ia[matrix_size] == matrix_ia[matrix_size])){
    std::cout << "Matrices don't match!!" << std::endl;
}

factorization_completed = false;
printf("\nBTASelInv: before allocating data on device now. CPY indicator %d\n", cpy_indicator);

allocate_data_on_device((void**)&diag_dev, matrix_size*sizeof(T));
allocate_data_on_device((void**)&diag_pos_dev, matrix_size*sizeof(size_t));
allocate_data_on_device((void**)&tmp1_dev, matrix_ns*matrix_ns*sizeof(T));
mem_alloc_dev = allocate_data_on_device((void**)&tmp2_dev, matrix_ns*matrix_ns*sizeof(T));

printf("Allocated GPU memory sel Inv: %ld, cpy ind %d\n", mem_alloc_dev, cpy_indicator);
memcpy_to_device(diag_pos, diag_pos_dev, matrix_size*sizeof(size_t), NULL);

double gflops = ThirdStageBTA(tmp1_dev, tmp2_dev, cpy_indicator);

memcpy_to_host(diag, diag_dev, matrix_size*sizeof(T), NULL);

deallocate_data_on_dev(diag_dev, matrix_size*sizeof(T));
deallocate_data_on_dev(diag_pos_dev, matrix_size*sizeof(size_t));
deallocate_data_on_dev(tmp1_dev, matrix_ns*matrix_ns*sizeof(T));
deallocate_data_on_dev(tmp2_dev, matrix_ns*matrix_ns*sizeof(T));

printf("Allocated GPU memory after sel Inv: %ld, cpy ind %d\n", mem_alloc_dev, cpy_indicator);
cudaGetDevice(&GPU_currRank);
if(GPU_currRank != GPU_rank){
    cudaSetDevice(GPU_rank);
    cudaGetDevice(&GPU_currRank);
}

if (!factorization_completed)
    factorize(ia, ja, a, t_FSF);

if(!(ia[matrix_size] == matrix_ia[matrix_size])){
    std::cout << "Matrices don't match!!" << std::endl;
}
size_t nnz = matrix_ia[matrix_size];

gpuErrchk(cudaMallocHost((void**)&inv_a, nnz*sizeof(T)));

size_t max_rows = matrix_nt > 1 ? 2*matrix_ns+matrix_nd : matrix_ns+matrix_nd;
size_t max_cols = max(matrix_ns, matrix_nd);

printf("\nBTASelInv: allocating data on device now. CPY indicator %d\n", cpy_indicator);

allocate_data_on_device((void**)&tmp1_dev, matrix_ns*matrix_ns*sizeof(T));
allocate_data_on_device((void**)&tmp2_dev, matrix_ns*matrix_ns*sizeof(T));
get_max_supernode_nnz();

allocate_data_on_device((void**)&inv_ia_dev, (max_cols+1)*sizeof(size_t));
allocate_data_on_device((void**)&inv_ja_dev, max_supernode_nnz*sizeof(size_t));
allocate_data_on_device((void**)&inv_a_dev, max_supernode_nnz*sizeof(T));
double gflops = ThirdStageBTA(tmp1_dev, tmp2_dev, cpy_indicator);

factorization_completed = false;

for(size_t i=0; i<nnz; i++){

deallocate_data_on_dev(tmp1_dev, matrix_ns*matrix_ns*sizeof(T));
deallocate_data_on_dev(tmp2_dev, matrix_ns*matrix_ns*sizeof(T));

deallocate_data_on_dev(inv_ia_dev, (max_cols+1)*sizeof(size_t));
deallocate_data_on_dev(inv_ja_dev, max_rows*max_cols*sizeof(size_t));
deallocate_data_on_dev(inv_a_dev, max_rows*max_cols*sizeof(T));

cudaFreeHost(inv_a);
printf("!!!CAREFUL: BTAinvBlks function abandoned. Not sure if it gives correct results!!!\n");

if (!factorization_completed)
    factorize(ia, ja, a, t_FSF);

if(!(ia[matrix_size] == matrix_ia[matrix_size])){
    std::cout << "Matrices don't match!!" << std::endl;
}

factorization_completed = false;
size_t max_rows = matrix_nt > 1 ? 2*matrix_ns+matrix_nd : matrix_ns+matrix_nd;
size_t max_cols = max(matrix_ns, matrix_nd);

initialize_invBlks_host();

allocate_data_on_device((void**)&tmp1_dev, matrix_ns*matrix_ns*sizeof(T));
allocate_data_on_device((void**)&tmp2_dev, matrix_ns*matrix_ns*sizeof(T));

double gflops = ThirdStageBTA(tmp1_dev, tmp2_dev, cpy_indicator);

size_t nnz_invBlks = (matrix_ns+matrix_nd)*matrix_ns*matrix_nt + matrix_nd*matrix_nd;
for(size_t i=0; i<nnz_invBlks; i++){
    invBlks_ext[i] = invBlks[i];
}

deallocate_data_on_dev(tmp1_dev, matrix_ns*matrix_ns*sizeof(T));
deallocate_data_on_dev(tmp2_dev, matrix_ns*matrix_ns*sizeof(T));
if (!factorization_completed)
    factorize(ia, ja, a, t_FSF);

indexed_log_sum(MF, diag_pos, matrix_size, &det);
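// indexed_log_sum accumulates the logarithms of the diagonal entries of the Cholesky
// factor stored in MF, addressed through diag_pos. Since A = L*L^T, the log-determinant
// of A equals 2 * sum_i log(L_ii); the factor of 2 is presumably applied when det is
// returned to the caller.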
T *r = new T[matrix_size];
memcpy(r, b, matrix_size*sizeof(T));

for (size_t ic = 0; ic < matrix_size; ic++)
{
    for (size_t i = matrix_ia[ic]; i < matrix_ia[ic+1]; i++)
    {
        size_t ir = matrix_ja[i];

        r[ir] -= matrix_a[i]*x[ic];

        // apply the mirrored entry as well (symmetric matrix stored as one triangle),
        // but do not subtract the diagonal entry twice
        if (ir != ic)
            r[ic] -= matrix_a[i]*x[ir];
    }
}

double res = c_dtnrm2(matrix_size, r, 1);
return residualNorm(x, b) / c_dtnrm2(matrix_size, b, 1);
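// The value returned here is the relative residual ||b - A*x||_2 / ||b||_2, built from
// the residual norm computed above; values close to machine precision indicate a
// successful solve.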