From 10c485f3a923834e6add12093e9d06cdcaa4f1c2 Mon Sep 17 00:00:00 2001 From: leonhostetler Date: Sat, 14 Dec 2024 15:37:42 -0600 Subject: [PATCH 1/7] Added QUDA deflation for UML, CG, and CGZ for single right-hand side --- generic_ks/d_congrad5_fn_quda.c | 43 +++++++++++++++++++++++++++++++++ generic_ks/mat_invert.c | 19 ++++++++++++++- include/imp_ferm_links.h | 4 ++- ks_spectrum/control.c | 12 +++++++-- ks_spectrum/setup.c | 8 ++++++ 5 files changed, 82 insertions(+), 4 deletions(-) diff --git a/generic_ks/d_congrad5_fn_quda.c b/generic_ks/d_congrad5_fn_quda.c index defbdea3..e50c2ac8 100644 --- a/generic_ks/d_congrad5_fn_quda.c +++ b/generic_ks/d_congrad5_fn_quda.c @@ -133,6 +133,49 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest, inv_args.tadpole = u0; #endif + // Setup for deflation on GPU + + inv_args.tol_restart = param.eigen_param.tol_restart; + //int parity = param.eigen_param.parity; + int parity = qic->parity; + int blockSize = 1; + + QudaEigParam qep = newQudaEigParam(); + qep.block_size = blockSize; + qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */ + //qep.invert_param = &qip; + qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */ + qep.spectrum = QUDA_SPECTRUM_SR_EIG; /* Smallest Real. Other options: LM, SM, LR, SR, LI, SI */ + qep.n_conv = param.eigen_param.Nvecs_in; + qep.n_ev_deflate = param.eigen_param.Nvecs; + qep.n_ev = qep.n_conv; + qep.n_kr = 2*qep.n_conv; + //qep.block_size = blockSize; + //qep.tol = tol; + //qep.qr_tol = qep.tol; + //qep.max_restarts = maxIter; + qep.batched_rotate = 0; + qep.require_convergence = QUDA_BOOLEAN_TRUE; + qep.check_interval = 1; + qep.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + qep.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + qep.use_dagger = QUDA_BOOLEAN_FALSE; + qep.compute_gamma5 = QUDA_BOOLEAN_FALSE; + qep.compute_svd = QUDA_BOOLEAN_FALSE; + qep.use_eigen_qr = QUDA_BOOLEAN_TRUE; + //qep.use_poly_acc = QUDA_BOOLEAN_TRUE; + //qep.poly_deg = polyDeg; + //qep.a_min = aMin; + //qep.a_max = aMax; + //qep.arpack_check = QUDA_BOOLEAN_FALSE; + //strcpy( qep.arpack_logfile, "" ); + strcpy( qep.vec_infile, param.ks_eigen_startfile ); + strcpy( qep.vec_outfile, param.ks_eigen_savefile ); + qep.save_prec = (MILC_PRECISION==2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + qep.io_parity_inflate = QUDA_BOOLEAN_FALSE; + + inv_args.eig_param = qep; + qudaInvert(MILC_PRECISION, quda_precision, mass, diff --git a/generic_ks/mat_invert.c b/generic_ks/mat_invert.c index 3130a249..ad8221e9 100644 --- a/generic_ks/mat_invert.c +++ b/generic_ks/mat_invert.c @@ -205,7 +205,8 @@ int mat_invert_cg_field(su3_vector *src, su3_vector *dst, ks_dirac_adj_op( src, tmp, mass, EVENANDODD, fn); /* Do deflation if we have eigenvectors and the deflate parameter is true */ - + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ dtime = - dclock(); @@ -222,12 +223,15 @@ int mat_invert_cg_field(su3_vector *src, su3_vector *dst, node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime); #endif } +#endif /* dst_e <- (M_adj M)^-1 tmp_e (even sites only) */ qic->parity = EVEN; int cgn = ks_congrad_field( tmp, dst, qic, mass, fn ); int even_iters = qic->final_iters; + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ dtime = - dclock(); @@ -242,6 +246,7 @@ int mat_invert_cg_field(su3_vector *src, su3_vector *dst, node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime); #endif } +#endif /* dst_o <- (M_adj M)^-1 tmp_o (odd sites only) */ qic->parity = ODD; @@ -277,6 +282,8 @@ int mat_invert_cgz_field(su3_vector *src, su3_vector *dst, /* Put "exact" low-mode even-site solution in tmp if deflate parameter is true */ + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ dtime = - dclock(); @@ -292,6 +299,7 @@ int mat_invert_cgz_field(su3_vector *src, su3_vector *dst, node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime); #endif } +#endif /* Solve for all modes using tmp as an initial guess */ /* tmp_e <- (M_adj M)^-1 src_e (even sites only) */ @@ -301,6 +309,8 @@ int mat_invert_cgz_field(su3_vector *src, su3_vector *dst, /* Put "exact" low-mode odd-site solution in tmp if deflate parameter is true */ + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ dtime = - dclock(); @@ -316,6 +326,7 @@ int mat_invert_cgz_field(su3_vector *src, su3_vector *dst, node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime); #endif } +#endif /* Solve for all modes using tmp as an initial guess */ /* tmp_o <- (M_adj M)^-1 src_o (odd sites only) */ @@ -427,6 +438,8 @@ int mat_invert_uml_field(su3_vector *src, su3_vector *dst, ks_dirac_adj_op( src, tmp, mass, EVENANDODD, fn ); #if EIGMODE != EIGCG + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ dtime = - dclock(); @@ -439,6 +452,7 @@ int mat_invert_uml_field(su3_vector *src, su3_vector *dst, node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime); #endif } +#endif #endif /* dst_e <- (M_adj M)^-1 tmp_e (even sites only) */ @@ -460,6 +474,8 @@ int mat_invert_uml_field(su3_vector *src, su3_vector *dst, } END_LOOP_OMP; #if EIGMODE != EIGCG + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ dtime = - dclock(); @@ -470,6 +486,7 @@ int mat_invert_uml_field(su3_vector *src, su3_vector *dst, dtime += dclock(); node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime); } +#endif #endif /* Polish off odd sites to correct for possible roundoff error */ diff --git a/include/imp_ferm_links.h b/include/imp_ferm_links.h index 06ca95cd..2b92c997 100644 --- a/include/imp_ferm_links.h +++ b/include/imp_ferm_links.h @@ -346,7 +346,8 @@ typedef struct { int Nkr; /* size of the Krylov subspace */ ks_eigen_poly poly; /* Preconditioning polynomial */ int blockSize; /* block size for block variant eigensolvers */ - int parity; + int parity; + double tol_restart; } ks_eigen_param; #elif defined(HAVE_QDP) #define ks_eigensolve Kalkreuter @@ -371,6 +372,7 @@ typedef struct { int Restart ; /* Restart Rayleigh every so many iterations */ int Kiters ; /* Kalkreuter iterations */ int parity; + double tol_restart; } ks_eigen_param; #endif diff --git a/ks_spectrum/control.c b/ks_spectrum/control.c index ff5be2e0..256059d9 100644 --- a/ks_spectrum/control.c +++ b/ks_spectrum/control.c @@ -318,6 +318,8 @@ int main(int argc, char *argv[]) #endif #if EIGMODE != EIGCG + /* If using QUDA for deflation, then eigenvectors are loaded directly by QUDA and not MILC */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0){ /* malloc for eigenpairs */ eigVal = (Real *)malloc(param.eigen_param.Nvecs*sizeof(double)); @@ -338,6 +340,7 @@ int main(int argc, char *argv[]) node0_printf("WARNING: Gauge fixing does not readjust the eigenvectors"); } } +#endif #endif /**************************************************************/ @@ -378,6 +381,8 @@ int main(int argc, char *argv[]) /* Check the eigenvectors */ + /* If using QUDA for deflation, then eigenvectors are loaded directly by QUDA and not checked by MILC */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) /* Calculate and print the residues and norms of the eigenvectors */ resid = (Real *)malloc(Nvecs_curr*sizeof(double)); node0_printf("Even site residuals\n"); @@ -385,12 +390,14 @@ int main(int argc, char *argv[]) construct_eigen_other_parity(eigVec, eigVal, ¶m.eigen_param, fn); node0_printf("Odd site residuals\n"); check_eigres( resid, eigVec, eigVal, Nvecs_curr, ODD, fn ); - +#endif /* Unapply twisted boundary conditions on the fermion links and restore conventional KS phases and antiperiodic BC, if changed. */ boundary_twist_fn(fn, OFF); - + + /* If using QUDA for deflation, then eigenvalues are printed by QUDA */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) /* print eigenvalues of iDslash */ node0_printf("The above were eigenvalues of -Dslash^2 in MILC normalization\n"); node0_printf("Here we also list eigenvalues of iDslash in continuum normalization\n"); @@ -403,6 +410,7 @@ int main(int argc, char *argv[]) node0_printf("eigenval(%i): %10g\n", i, 0.0); } } +#endif #endif } diff --git a/ks_spectrum/setup.c b/ks_spectrum/setup.c index 7f4f8116..e29c9035 100644 --- a/ks_spectrum/setup.c +++ b/ks_spectrum/setup.c @@ -256,6 +256,14 @@ int readin(int prompt) { IF_OK if(param.eigen_param.Nvecs > 0){ + /* Additional parameters for QUDA deflation */ +#if ( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) + /* allow file to have more eigenpairs than will be used for deflation */ + IF_OK status += get_i(stdin, prompt,"file_number_of_eigenpairs", ¶m.eigen_param.Nvecs_in); + /* controls how often redeflation occurs during deflated inversions */ + IF_OK status += get_f(stdin, prompt,"tol_restart", ¶m.eigen_param.tol_restart); +#endif + /* eigenvector input */ IF_OK status += ask_starting_ks_eigen(stdin, prompt, ¶m.ks_eigen_startflag, param.ks_eigen_startfile); From edcfd9282a80231c0e17d623058f7d1a822e7196 Mon Sep 17 00:00:00 2001 From: leonhostetler Date: Thu, 19 Dec 2024 07:19:53 -0600 Subject: [PATCH 2/7] Fixed fresh and save options for QUDA eigenvectors --- generic_ks/d_congrad5_fn_quda.c | 46 +++++++++++++++++---------------- generic_ks/read_eigen_param.c | 7 ++++- ks_spectrum/control.c | 12 +++++++-- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/generic_ks/d_congrad5_fn_quda.c b/generic_ks/d_congrad5_fn_quda.c index e50c2ac8..9298f1c4 100644 --- a/generic_ks/d_congrad5_fn_quda.c +++ b/generic_ks/d_congrad5_fn_quda.c @@ -133,46 +133,48 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest, inv_args.tadpole = u0; #endif - // Setup for deflation on GPU + // Setup for deflation (and eigensolve) on GPU inv_args.tol_restart = param.eigen_param.tol_restart; //int parity = param.eigen_param.parity; int parity = qic->parity; - int blockSize = 1; + int blockSize = param.eigen_param.blockSize; QudaEigParam qep = newQudaEigParam(); qep.block_size = blockSize; qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */ - //qep.invert_param = &qip; - qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */ qep.spectrum = QUDA_SPECTRUM_SR_EIG; /* Smallest Real. Other options: LM, SM, LR, SR, LI, SI */ - qep.n_conv = param.eigen_param.Nvecs_in; + qep.n_conv = param.eigen_param.Nvecs_in; // +2? qep.n_ev_deflate = param.eigen_param.Nvecs; qep.n_ev = qep.n_conv; - qep.n_kr = 2*qep.n_conv; - //qep.block_size = blockSize; - //qep.tol = tol; - //qep.qr_tol = qep.tol; - //qep.max_restarts = maxIter; - qep.batched_rotate = 0; + //qep.n_kr = param.eigen_param.Nkr; + qep.n_kr = 2*qep.n_ev; + qep.tol = param.eigen_param.tol; + qep.qr_tol = qep.tol; // change? + qep.max_restarts = param.eigen_param.MaxIter; qep.require_convergence = QUDA_BOOLEAN_TRUE; - qep.check_interval = 1; + qep.check_interval = 10; // change? qep.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; qep.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; qep.use_dagger = QUDA_BOOLEAN_FALSE; qep.compute_gamma5 = QUDA_BOOLEAN_FALSE; qep.compute_svd = QUDA_BOOLEAN_FALSE; - qep.use_eigen_qr = QUDA_BOOLEAN_TRUE; - //qep.use_poly_acc = QUDA_BOOLEAN_TRUE; - //qep.poly_deg = polyDeg; - //qep.a_min = aMin; - //qep.a_max = aMax; - //qep.arpack_check = QUDA_BOOLEAN_FALSE; - //strcpy( qep.arpack_logfile, "" ); - strcpy( qep.vec_infile, param.ks_eigen_startfile ); - strcpy( qep.vec_outfile, param.ks_eigen_savefile ); - qep.save_prec = (MILC_PRECISION==2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + qep.use_eigen_qr = QUDA_BOOLEAN_TRUE; // change? + qep.use_poly_acc = QUDA_BOOLEAN_TRUE; + qep.poly_deg = param.eigen_param.poly.norder; + qep.a_min = param.eigen_param.poly.minE; + qep.a_max = param.eigen_param.poly.maxE; + qep.arpack_check = QUDA_BOOLEAN_FALSE; + strcpy( qep.vec_infile, param.ks_eigen_startfile ); // auto FRESH if empty? + strcpy( qep.vec_outfile, param.ks_eigen_savefile ); // auto forget if empty? + //qep.save_prec = (MILC_PRECISION==2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; qep.io_parity_inflate = QUDA_BOOLEAN_FALSE; + + + qep.batched_rotate = 20; // add to input parameters + qep.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters? + qep.partfile = QUDA_BOOLEAN_TRUE; // add to input parameters + inv_args.eig_param = qep; diff --git a/generic_ks/read_eigen_param.c b/generic_ks/read_eigen_param.c index 94093e9d..5370a2ff 100644 --- a/generic_ks/read_eigen_param.c +++ b/generic_ks/read_eigen_param.c @@ -31,7 +31,12 @@ int read_ks_eigen_param(ks_eigen_param *eigen_param, int status, int prompt){ IF_OK status += get_f(stdin, prompt, "Chebyshev_beta", &eigen_param->poly.maxE ); IF_OK status += get_i(stdin, prompt, "Chebyshev_order", &eigen_param->poly.norder ); IF_OK status += get_s(stdin, prompt, "diag_algorithm", param.eigen_param.diagAlg ); - + +#elif defined(HAVE_QUDA) && defined(USE_CG_GPU) && !defined(USE_EIG_GPU) + node0_printf("ERROR: When using QUDA for CG and wanting FRESH eigenvectors, only the QUDA eigensolver is allowed!\n"); + node0_printf("ERROR: Recompile with WANT_QUDA=true, WANT_CG_GPU=true, and WANT_EIG_GPU=true\n"); + terminate(1); + #elif defined(HAVE_QUDA) && defined(USE_EIG_GPU) IF_OK status += get_i(stdin, prompt, "Max_Lanczos_restart_iters", &eigen_param->MaxIter ); diff --git a/ks_spectrum/control.c b/ks_spectrum/control.c index 256059d9..9399a4ce 100644 --- a/ks_spectrum/control.c +++ b/ks_spectrum/control.c @@ -362,7 +362,10 @@ int main(int argc, char *argv[]) set_boundary_twist_fn(fn, bdry_phase, param.coord_origin); /* Apply the operation */ boundary_twist_fn(fn, ON); - + + // If using QUDA deflated CG + asking for QUDA to do the eigensolve, then + // the eigensolver is called from within QUDA's CG solver...not from MILC +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) && defined(USE_EIG_GPU) ) /* compute eigenpairs if requested */ if(param.ks_eigen_startflag == FRESH){ int total_R_iters; @@ -378,7 +381,7 @@ int main(int argc, char *argv[]) initialize_site_prn_from_seed(iseed); #endif } - +#endif /* Check the eigenvectors */ /* If using QUDA for deflation, then eigenvectors are loaded directly by QUDA and not checked by MILC */ @@ -955,6 +958,9 @@ int main(int argc, char *argv[]) #endif if(param.eigen_param.Nvecs > 0){ + + /* If using QUDA for deflation, then eigenvectors are loaded and saved directly by QUDA and not MILC */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) STARTTIME; /* save eigenvectors if requested */ @@ -969,6 +975,8 @@ int main(int argc, char *argv[]) free(eigVal); free(eigVec); free(resid); ENDTIME("save eigenvectors (if requested)"); + +#endif } /* Clean up quark sources, both base and modified */ From 2535fbf9ce73e5587af52b1a7083a97c7f889781 Mon Sep 17 00:00:00 2001 From: leonhostetler Date: Fri, 20 Dec 2024 14:01:47 -0600 Subject: [PATCH 3/7] Updated some interfacing with quda --- generic/milc_to_quda_utilities.c | 1 + generic_ks/d_congrad5_fn_quda.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/generic/milc_to_quda_utilities.c b/generic/milc_to_quda_utilities.c index 29cbca7e..da9473f6 100644 --- a/generic/milc_to_quda_utilities.c +++ b/generic/milc_to_quda_utilities.c @@ -50,6 +50,7 @@ int initialize_quda(void){ void finalize_quda(void){ #ifdef USE_CG_GPU + qudaCleanUpDeflationSpace(); #ifdef MULTIGRID mat_invert_mg_cleanup(); #endif diff --git a/generic_ks/d_congrad5_fn_quda.c b/generic_ks/d_congrad5_fn_quda.c index 9298f1c4..c37e3f0a 100644 --- a/generic_ks/d_congrad5_fn_quda.c +++ b/generic_ks/d_congrad5_fn_quda.c @@ -147,7 +147,6 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest, qep.n_conv = param.eigen_param.Nvecs_in; // +2? qep.n_ev_deflate = param.eigen_param.Nvecs; qep.n_ev = qep.n_conv; - //qep.n_kr = param.eigen_param.Nkr; qep.n_kr = 2*qep.n_ev; qep.tol = param.eigen_param.tol; qep.qr_tol = qep.tol; // change? @@ -170,6 +169,17 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest, //qep.save_prec = (MILC_PRECISION==2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; qep.io_parity_inflate = QUDA_BOOLEAN_FALSE; + qep.compute_evals_batch_size = 16; // Default is 4, memory considerations + qep.preserve_deflation = QUDA_BOOLEAN_TRUE; + + // If mass changes, eigenvalues need to be recomputed + static Real previous_mass = -1.0; + if( fabs(mass - previous_mass) < 1e-6 ) { + qep.preserve_evals = QUDA_BOOLEAN_TRUE; + } else { + qep.preserve_evals = QUDA_BOOLEAN_FALSE; + } + previous_mass = mass; qep.batched_rotate = 20; // add to input parameters qep.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters? @@ -177,6 +187,7 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest, inv_args.eig_param = qep; + inv_args.prec_eigensolver = QUDA_SINGLE_PRECISION; qudaInvert(MILC_PRECISION, quda_precision, From eb301ba16f384d2f10bd0f842d705f4852ab1df9 Mon Sep 17 00:00:00 2001 From: leonhostetler Date: Sat, 21 Dec 2024 10:39:06 -0600 Subject: [PATCH 4/7] Fixed deflate savebuf was overwriting mass savebuf --- ks_spectrum/setup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ks_spectrum/setup.c b/ks_spectrum/setup.c index e29c9035..92b403c4 100644 --- a/ks_spectrum/setup.c +++ b/ks_spectrum/setup.c @@ -693,7 +693,6 @@ int readin(int prompt) { } IF_OK param.ksp[nprop].mass = atof(param.mass_label[nprop]); - IF_OK { int dir; FORALLUPDIR(dir)param.bdry_phase[nprop][dir] = bdry_phase[dir]; @@ -725,9 +724,10 @@ int readin(int prompt) { param.qic[nprop].deflate = 0; IF_OK { if(param.eigen_param.Nvecs > 0){ /* Need eigenvectors to deflate */ - IF_OK status += get_s(stdin, prompt,"deflate", savebuf); + char savebuf2[128]; + IF_OK status += get_s(stdin, prompt,"deflate", savebuf2); IF_OK { - if(strcmp(savebuf,"yes") == 0)param.qic[nprop].deflate = 1; + if(strcmp(savebuf2,"yes") == 0)param.qic[nprop].deflate = 1; } } } From 7a6d501fbb305a828221dedf126aeccaf660098f Mon Sep 17 00:00:00 2001 From: leonhostetler Date: Sat, 21 Dec 2024 14:15:07 -0600 Subject: [PATCH 5/7] QUDA batched deflation for multiple right hand sides --- generic_ks/d_congrad5_fn_quda.c | 72 ++++++++++++++++++++++++++++----- generic_ks/mat_invert.c | 23 +++++++++-- 2 files changed, 81 insertions(+), 14 deletions(-) diff --git a/generic_ks/d_congrad5_fn_quda.c b/generic_ks/d_congrad5_fn_quda.c index c37e3f0a..82d3cee2 100644 --- a/generic_ks/d_congrad5_fn_quda.c +++ b/generic_ks/d_congrad5_fn_quda.c @@ -158,28 +158,26 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest, qep.use_dagger = QUDA_BOOLEAN_FALSE; qep.compute_gamma5 = QUDA_BOOLEAN_FALSE; qep.compute_svd = QUDA_BOOLEAN_FALSE; - qep.use_eigen_qr = QUDA_BOOLEAN_TRUE; // change? + qep.use_eigen_qr = QUDA_BOOLEAN_TRUE; qep.use_poly_acc = QUDA_BOOLEAN_TRUE; qep.poly_deg = param.eigen_param.poly.norder; qep.a_min = param.eigen_param.poly.minE; qep.a_max = param.eigen_param.poly.maxE; qep.arpack_check = QUDA_BOOLEAN_FALSE; - strcpy( qep.vec_infile, param.ks_eigen_startfile ); // auto FRESH if empty? - strcpy( qep.vec_outfile, param.ks_eigen_savefile ); // auto forget if empty? - //qep.save_prec = (MILC_PRECISION==2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + strcpy( qep.vec_infile, param.ks_eigen_startfile ); + strcpy( qep.vec_outfile, param.ks_eigen_savefile ); qep.io_parity_inflate = QUDA_BOOLEAN_FALSE; - qep.compute_evals_batch_size = 16; // Default is 4, memory considerations + qep.compute_evals_batch_size = 16; // Default is 4 qep.preserve_deflation = QUDA_BOOLEAN_TRUE; // If mass changes, eigenvalues need to be recomputed + // but no need to recompute on the first solve static Real previous_mass = -1.0; - if( fabs(mass - previous_mass) < 1e-6 ) { - qep.preserve_evals = QUDA_BOOLEAN_TRUE; - } else { - qep.preserve_evals = QUDA_BOOLEAN_FALSE; - } + static bool first_solve=true; + qep.preserve_evals = ( first_solve || fabs(mass - previous_mass) < 1e-6 ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; previous_mass = mass; + first_solve = false; qep.batched_rotate = 20; // add to input parameters qep.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters? @@ -336,6 +334,60 @@ int ks_congrad_block_parity_gpu(int nsrc, su3_vector **t_src, su3_vector **t_des inv_args.tadpole = u0; #endif + // Setup for deflation (and eigensolve) on GPU + + inv_args.tol_restart = param.eigen_param.tol_restart; + //int parity = param.eigen_param.parity; + int parity = qic->parity; + int blockSize = param.eigen_param.blockSize; + + QudaEigParam qep = newQudaEigParam(); + qep.block_size = blockSize; + qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */ + qep.spectrum = QUDA_SPECTRUM_SR_EIG; /* Smallest Real. Other options: LM, SM, LR, SR, LI, SI */ + qep.n_conv = param.eigen_param.Nvecs_in; // +2? + qep.n_ev_deflate = param.eigen_param.Nvecs; + qep.n_ev = qep.n_conv; + qep.n_kr = 2*qep.n_ev; + qep.tol = param.eigen_param.tol; + qep.qr_tol = qep.tol; // change? + qep.max_restarts = param.eigen_param.MaxIter; + qep.require_convergence = QUDA_BOOLEAN_TRUE; + qep.check_interval = 10; // change? + qep.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + qep.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + qep.use_dagger = QUDA_BOOLEAN_FALSE; + qep.compute_gamma5 = QUDA_BOOLEAN_FALSE; + qep.compute_svd = QUDA_BOOLEAN_FALSE; + qep.use_eigen_qr = QUDA_BOOLEAN_TRUE; // change? + qep.use_poly_acc = QUDA_BOOLEAN_TRUE; + qep.poly_deg = param.eigen_param.poly.norder; + qep.a_min = param.eigen_param.poly.minE; + qep.a_max = param.eigen_param.poly.maxE; + qep.arpack_check = QUDA_BOOLEAN_FALSE; + strcpy( qep.vec_infile, param.ks_eigen_startfile ); + strcpy( qep.vec_outfile, param.ks_eigen_savefile ); + qep.io_parity_inflate = QUDA_BOOLEAN_FALSE; + + qep.compute_evals_batch_size = 16; // Default is 4 + qep.preserve_deflation = QUDA_BOOLEAN_TRUE; + + // If mass changes, eigenvalues need to be recomputed + // but no need to recompute on the first solve + static Real previous_mass = -1.0; + static bool first_solve=true; + qep.preserve_evals = ( first_solve || fabs(mass - previous_mass) < 1e-6 ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + previous_mass = mass; + first_solve = false; + + qep.batched_rotate = 20; // add to input parameters + qep.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters? + qep.partfile = QUDA_BOOLEAN_TRUE; // add to input parameters + + + inv_args.eig_param = qep; + inv_args.prec_eigensolver = QUDA_SINGLE_PRECISION; + qudaInvertMsrc(MILC_PRECISION, quda_precision, mass, diff --git a/generic_ks/mat_invert.c b/generic_ks/mat_invert.c index ad8221e9..0638bc6a 100644 --- a/generic_ks/mat_invert.c +++ b/generic_ks/mat_invert.c @@ -728,7 +728,8 @@ int mat_invert_block_cg(su3_vector **src, su3_vector **dst, } /* Put "exact" low-mode even-site solution in tmp if deflate parameter is true */ - + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ dtime = -dclock(); @@ -745,6 +746,7 @@ int mat_invert_block_cg(su3_vector **src, su3_vector **dst, node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime); #endif } +#endif /* dst_e <- (M_adj M)^-1 tmp_e (even sites only) */ qic->parity = EVEN; @@ -752,6 +754,8 @@ int mat_invert_block_cg(su3_vector **src, su3_vector **dst, int even_iters = qic->final_iters; /* Deflation on odd sites */ + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ dtime = -dclock(); @@ -767,6 +771,7 @@ int mat_invert_block_cg(su3_vector **src, su3_vector **dst, node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime); #endif } +#endif /* dst_o <- (M_adj M)^-1 tmp_o (odd sites only) */ qic->parity = ODD; @@ -797,7 +802,8 @@ int mat_invert_block_cgz(su3_vector **src, su3_vector **dst, tmp[is] = create_v_field(); /* Put "exact" low-mode even-site solution in tmp if deflate parameter is true */ - + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ for(int is = 0; is < nsrc; is++){ @@ -816,6 +822,7 @@ int mat_invert_block_cgz(su3_vector **src, su3_vector **dst, #endif } } +#endif /* Solve for all modes on even sites using tmp as an initial guess */ /* tmp_e <- (M_adj M)^-1 src_e (even sites only) */ @@ -824,7 +831,8 @@ int mat_invert_block_cgz(su3_vector **src, su3_vector **dst, int even_iters = qic->final_iters; /* Put "exact" low-mode odd-site solution in tmp if deflate parameter is true */ - + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ for(int is = 0; is < nsrc; is++){ @@ -842,6 +850,7 @@ int mat_invert_block_cgz(su3_vector **src, su3_vector **dst, #endif } } +#endif /* Solve for all modes on odd sites using tmp as an initial guess */ /* dst_o <- (M_adj M)^-1 tmp_o (odd sites only) */ @@ -887,7 +896,9 @@ int mat_invert_block_uml(su3_vector **src, su3_vector **dst, for(int is = 0; is < nsrc; is++){ ks_dirac_adj_op( src[is], tmp[is], mass, EVENANDODD, fn ); - + + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ dtime = - dclock(); #ifdef CGTIME @@ -901,6 +912,7 @@ int mat_invert_block_uml(su3_vector **src, su3_vector **dst, dtime += dclock(); node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime); } +#endif } /* dst_e <- (M_adj M)^-1 tmp_e (even sites only) */ @@ -917,6 +929,8 @@ int mat_invert_block_uml(su3_vector **src, su3_vector **dst, scalar_mult_su3_vector( dst[is]+i, 1.0/(2.0*mass), dst[is]+i ); } END_LOOP_OMP; + /* Skip MILC CPU deflation if using QUDA deflation */ +#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) if(param.eigen_param.Nvecs > 0 && qic->deflate){ dtime = - dclock(); node0_printf("deflating on odd sites for mass %g with %d eigenvec\n", @@ -927,6 +941,7 @@ int mat_invert_block_uml(su3_vector **src, su3_vector **dst, dtime += dclock(); node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime); } +#endif } /* Polish off odd sites to correct for possible roundoff error */ From 33e58df730a7225fcc1774628089288cccd5f723 Mon Sep 17 00:00:00 2001 From: leonhostetler Date: Sun, 22 Dec 2024 07:29:24 -0600 Subject: [PATCH 6/7] Added to input parameters --- generic_ks/d_congrad5_fn_quda.c | 80 +++++++++++++++------------------ generic_ks/read_eigen_param.c | 2 + include/imp_ferm_links.h | 3 ++ ks_spectrum/setup.c | 20 ++++++++- 4 files changed, 59 insertions(+), 46 deletions(-) diff --git a/generic_ks/d_congrad5_fn_quda.c b/generic_ks/d_congrad5_fn_quda.c index 82d3cee2..eb97dc74 100644 --- a/generic_ks/d_congrad5_fn_quda.c +++ b/generic_ks/d_congrad5_fn_quda.c @@ -134,25 +134,24 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest, #endif // Setup for deflation (and eigensolve) on GPU - - inv_args.tol_restart = param.eigen_param.tol_restart; - //int parity = param.eigen_param.parity; int parity = qic->parity; int blockSize = param.eigen_param.blockSize; - + static Real previous_mass = -1.0; + static bool first_solve=true; + QudaEigParam qep = newQudaEigParam(); qep.block_size = blockSize; qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */ qep.spectrum = QUDA_SPECTRUM_SR_EIG; /* Smallest Real. Other options: LM, SM, LR, SR, LI, SI */ - qep.n_conv = param.eigen_param.Nvecs_in; // +2? + qep.n_conv = (param.eigen_param.Nvecs_in > param.eigen_param.Nvecs) ? param.eigen_param.Nvecs_in : param.eigen_param.Nvecs; qep.n_ev_deflate = param.eigen_param.Nvecs; qep.n_ev = qep.n_conv; - qep.n_kr = 2*qep.n_ev; + qep.n_kr = (param.eigen_param.Nkr < qep.n_ev ) ? 2*qep.n_ev : param.eigen_param.Nkr; qep.tol = param.eigen_param.tol; - qep.qr_tol = qep.tol; // change? + qep.qr_tol = qep.tol; qep.max_restarts = param.eigen_param.MaxIter; qep.require_convergence = QUDA_BOOLEAN_TRUE; - qep.check_interval = 10; // change? + qep.check_interval = 10; qep.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; qep.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; qep.use_dagger = QUDA_BOOLEAN_FALSE; @@ -167,25 +166,22 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest, strcpy( qep.vec_infile, param.ks_eigen_startfile ); strcpy( qep.vec_outfile, param.ks_eigen_savefile ); qep.io_parity_inflate = QUDA_BOOLEAN_FALSE; - - qep.compute_evals_batch_size = 16; // Default is 4 + qep.compute_evals_batch_size = 16; qep.preserve_deflation = QUDA_BOOLEAN_TRUE; - - // If mass changes, eigenvalues need to be recomputed - // but no need to recompute on the first solve - static Real previous_mass = -1.0; - static bool first_solve=true; qep.preserve_evals = ( first_solve || fabs(mass - previous_mass) < 1e-6 ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - previous_mass = mass; - first_solve = false; - - qep.batched_rotate = 20; // add to input parameters + qep.batched_rotate = param.eigen_param.batchedRotate; qep.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters? - qep.partfile = QUDA_BOOLEAN_TRUE; // add to input parameters + qep.partfile = param.eigen_param.partfile ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - inv_args.eig_param = qep; - inv_args.prec_eigensolver = QUDA_SINGLE_PRECISION; + if(param.eigen_param.eigPrec == 2)inv_args.prec_eigensolver = QUDA_DOUBLE_PRECISION; + else if(param.eigen_param.eigPrec == 1)inv_args.prec_eigensolver = QUDA_SINGLE_PRECISION; + else inv_args.prec_eigensolver = QUDA_HALF_PRECISION; + + inv_args.tol_restart = param.eigen_param.tol_restart; + + previous_mass = mass; + first_solve = false; qudaInvert(MILC_PRECISION, quda_precision, @@ -335,31 +331,30 @@ int ks_congrad_block_parity_gpu(int nsrc, su3_vector **t_src, su3_vector **t_des #endif // Setup for deflation (and eigensolve) on GPU - - inv_args.tol_restart = param.eigen_param.tol_restart; - //int parity = param.eigen_param.parity; int parity = qic->parity; int blockSize = param.eigen_param.blockSize; + static Real previous_mass = -1.0; + static bool first_solve=true; QudaEigParam qep = newQudaEigParam(); qep.block_size = blockSize; qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */ qep.spectrum = QUDA_SPECTRUM_SR_EIG; /* Smallest Real. Other options: LM, SM, LR, SR, LI, SI */ - qep.n_conv = param.eigen_param.Nvecs_in; // +2? + qep.n_conv = (param.eigen_param.Nvecs_in > param.eigen_param.Nvecs) ? param.eigen_param.Nvecs_in : param.eigen_param.Nvecs; qep.n_ev_deflate = param.eigen_param.Nvecs; qep.n_ev = qep.n_conv; - qep.n_kr = 2*qep.n_ev; + qep.n_kr = (param.eigen_param.Nkr < qep.n_ev ) ? 2*qep.n_ev : param.eigen_param.Nkr; qep.tol = param.eigen_param.tol; - qep.qr_tol = qep.tol; // change? + qep.qr_tol = qep.tol; qep.max_restarts = param.eigen_param.MaxIter; qep.require_convergence = QUDA_BOOLEAN_TRUE; - qep.check_interval = 10; // change? + qep.check_interval = 10; qep.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; qep.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; qep.use_dagger = QUDA_BOOLEAN_FALSE; qep.compute_gamma5 = QUDA_BOOLEAN_FALSE; qep.compute_svd = QUDA_BOOLEAN_FALSE; - qep.use_eigen_qr = QUDA_BOOLEAN_TRUE; // change? + qep.use_eigen_qr = QUDA_BOOLEAN_TRUE; qep.use_poly_acc = QUDA_BOOLEAN_TRUE; qep.poly_deg = param.eigen_param.poly.norder; qep.a_min = param.eigen_param.poly.minE; @@ -368,25 +363,22 @@ int ks_congrad_block_parity_gpu(int nsrc, su3_vector **t_src, su3_vector **t_des strcpy( qep.vec_infile, param.ks_eigen_startfile ); strcpy( qep.vec_outfile, param.ks_eigen_savefile ); qep.io_parity_inflate = QUDA_BOOLEAN_FALSE; - - qep.compute_evals_batch_size = 16; // Default is 4 + qep.compute_evals_batch_size = 16; qep.preserve_deflation = QUDA_BOOLEAN_TRUE; - - // If mass changes, eigenvalues need to be recomputed - // but no need to recompute on the first solve - static Real previous_mass = -1.0; - static bool first_solve=true; qep.preserve_evals = ( first_solve || fabs(mass - previous_mass) < 1e-6 ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - previous_mass = mass; - first_solve = false; - - qep.batched_rotate = 20; // add to input parameters + qep.batched_rotate = param.eigen_param.batchedRotate; qep.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters? - qep.partfile = QUDA_BOOLEAN_TRUE; // add to input parameters - + qep.partfile = param.eigen_param.partfile ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; inv_args.eig_param = qep; - inv_args.prec_eigensolver = QUDA_SINGLE_PRECISION; + if(param.eigen_param.eigPrec == 2)inv_args.prec_eigensolver = QUDA_DOUBLE_PRECISION; + else if(param.eigen_param.eigPrec == 1)inv_args.prec_eigensolver = QUDA_SINGLE_PRECISION; + else inv_args.prec_eigensolver = QUDA_HALF_PRECISION; + + inv_args.tol_restart = param.eigen_param.tol_restart; + + previous_mass = mass; + first_solve = false; qudaInvertMsrc(MILC_PRECISION, quda_precision, diff --git a/generic_ks/read_eigen_param.c b/generic_ks/read_eigen_param.c index 5370a2ff..5c39da8e 100644 --- a/generic_ks/read_eigen_param.c +++ b/generic_ks/read_eigen_param.c @@ -43,6 +43,8 @@ int read_ks_eigen_param(ks_eigen_param *eigen_param, int status, int prompt){ IF_OK status += get_f(stdin, prompt, "eigenval_tolerance", &eigen_param->tol ); IF_OK status += get_i(stdin, prompt, "Lanczos_max", &eigen_param->Nkr ); IF_OK status += get_i(stdin, prompt, "Lanczos_restart", &eigen_param->Nrestart ); + IF_OK status += get_i(stdin, prompt, "eigensolver_prec", &eigen_param->eigPrec ); + IF_OK status += get_i(stdin, prompt, "batched_rotate", &eigen_param->batchedRotate ); IF_OK status += get_f(stdin, prompt, "Chebyshev_alpha", &eigen_param->poly.minE ); IF_OK status += get_f(stdin, prompt, "Chebyshev_beta", &eigen_param->poly.maxE ); IF_OK status += get_i(stdin, prompt, "Chebyshev_order", &eigen_param->poly.norder ); diff --git a/include/imp_ferm_links.h b/include/imp_ferm_links.h index 2b92c997..03ac7c73 100644 --- a/include/imp_ferm_links.h +++ b/include/imp_ferm_links.h @@ -346,6 +346,9 @@ typedef struct { int Nkr; /* size of the Krylov subspace */ ks_eigen_poly poly; /* Preconditioning polynomial */ int blockSize; /* block size for block variant eigensolvers */ + int partfile; /* Whether to save in partfile or not */ + int eigPrec; /* Run the eigensolver in this precision */ + int batchedRotate; /* Size of the rotation space to use for solver */ int parity; double tol_restart; } ks_eigen_param; diff --git a/ks_spectrum/setup.c b/ks_spectrum/setup.c index 92b403c4..21e195c0 100644 --- a/ks_spectrum/setup.c +++ b/ks_spectrum/setup.c @@ -258,8 +258,6 @@ int readin(int prompt) { /* Additional parameters for QUDA deflation */ #if ( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) - /* allow file to have more eigenpairs than will be used for deflation */ - IF_OK status += get_i(stdin, prompt,"file_number_of_eigenpairs", ¶m.eigen_param.Nvecs_in); /* controls how often redeflation occurs during deflated inversions */ IF_OK status += get_f(stdin, prompt,"tol_restart", ¶m.eigen_param.tol_restart); #endif @@ -268,10 +266,28 @@ int readin(int prompt) { IF_OK status += ask_starting_ks_eigen(stdin, prompt, ¶m.ks_eigen_startflag, param.ks_eigen_startfile); + /* Additional parameters for QUDA deflation */ +#if ( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) + if(param.ks_eigen_startflag == RELOAD_ASCII || + param.ks_eigen_startflag == RELOAD_SERIAL || + param.ks_eigen_startflag == RELOAD_PARALLEL ){ + /* allow file to have more eigenpairs than will be used for deflation */ + IF_OK status += get_i(stdin, prompt,"file_number_of_eigenpairs", ¶m.eigen_param.Nvecs_in); + } +#endif + /* eigenvector output */ IF_OK status += ask_ending_ks_eigen(stdin, prompt, ¶m.ks_eigen_saveflag, param.ks_eigen_savefile); +#if ( defined(USE_CG_GPU) && defined(HAVE_QUDA) ) + if(param.ks_eigen_saveflag == SAVE_PARTFILE_SCIDAC){ + param.eigen_param.partfile = 1; + } else { + param.eigen_param.partfile = 0; + } +#endif + /* If we are reading in eigenpairs, we don't regenerate them */ #if EIGMODE != EIGCG From 7de9cda99e689b2cb164bfdfeeace748ae26a88a Mon Sep 17 00:00:00 2001 From: leonhostetler Date: Sat, 25 Jan 2025 11:28:16 -0600 Subject: [PATCH 7/7] Restructured to preserve backward compatibility between old MILC and new QUDA --- generic_ks/d_congrad5_fn_quda.c | 134 +++++++++++++------------------- 1 file changed, 52 insertions(+), 82 deletions(-) diff --git a/generic_ks/d_congrad5_fn_quda.c b/generic_ks/d_congrad5_fn_quda.c index eb97dc74..88352e1a 100644 --- a/generic_ks/d_congrad5_fn_quda.c +++ b/generic_ks/d_congrad5_fn_quda.c @@ -139,54 +139,39 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest, static Real previous_mass = -1.0; static bool first_solve=true; - QudaEigParam qep = newQudaEigParam(); - qep.block_size = blockSize; - qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */ - qep.spectrum = QUDA_SPECTRUM_SR_EIG; /* Smallest Real. Other options: LM, SM, LR, SR, LI, SI */ - qep.n_conv = (param.eigen_param.Nvecs_in > param.eigen_param.Nvecs) ? param.eigen_param.Nvecs_in : param.eigen_param.Nvecs; - qep.n_ev_deflate = param.eigen_param.Nvecs; - qep.n_ev = qep.n_conv; - qep.n_kr = (param.eigen_param.Nkr < qep.n_ev ) ? 2*qep.n_ev : param.eigen_param.Nkr; - qep.tol = param.eigen_param.tol; - qep.qr_tol = qep.tol; - qep.max_restarts = param.eigen_param.MaxIter; - qep.require_convergence = QUDA_BOOLEAN_TRUE; - qep.check_interval = 10; - qep.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - qep.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - qep.use_dagger = QUDA_BOOLEAN_FALSE; - qep.compute_gamma5 = QUDA_BOOLEAN_FALSE; - qep.compute_svd = QUDA_BOOLEAN_FALSE; - qep.use_eigen_qr = QUDA_BOOLEAN_TRUE; - qep.use_poly_acc = QUDA_BOOLEAN_TRUE; - qep.poly_deg = param.eigen_param.poly.norder; - qep.a_min = param.eigen_param.poly.minE; - qep.a_max = param.eigen_param.poly.maxE; - qep.arpack_check = QUDA_BOOLEAN_FALSE; - strcpy( qep.vec_infile, param.ks_eigen_startfile ); - strcpy( qep.vec_outfile, param.ks_eigen_savefile ); - qep.io_parity_inflate = QUDA_BOOLEAN_FALSE; - qep.compute_evals_batch_size = 16; - qep.preserve_deflation = QUDA_BOOLEAN_TRUE; - qep.preserve_evals = ( first_solve || fabs(mass - previous_mass) < 1e-6 ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - qep.batched_rotate = param.eigen_param.batchedRotate; - qep.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters? - qep.partfile = param.eigen_param.partfile ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - - inv_args.eig_param = qep; - if(param.eigen_param.eigPrec == 2)inv_args.prec_eigensolver = QUDA_DOUBLE_PRECISION; - else if(param.eigen_param.eigPrec == 1)inv_args.prec_eigensolver = QUDA_SINGLE_PRECISION; - else inv_args.prec_eigensolver = QUDA_HALF_PRECISION; - - inv_args.tol_restart = param.eigen_param.tol_restart; + QudaEigensolverArgs_t eig_args; + eig_args.block_size = blockSize; + eig_args.n_conv = (param.eigen_param.Nvecs_in > param.eigen_param.Nvecs) ? param.eigen_param.Nvecs_in : param.eigen_param.Nvecs; + eig_args.n_ev_deflate = param.eigen_param.Nvecs; + eig_args.n_ev = eig_args.n_conv; + eig_args.n_kr = (param.eigen_param.Nkr < eig_args.n_ev ) ? 2*eig_args.n_ev : param.eigen_param.Nkr; + eig_args.tol = param.eigen_param.tol; + eig_args.max_restarts = param.eigen_param.MaxIter; + eig_args.poly_deg = param.eigen_param.poly.norder; + eig_args.a_min = param.eigen_param.poly.minE; + eig_args.a_max = param.eigen_param.poly.maxE; + strcpy( eig_args.vec_infile, param.ks_eigen_startfile ); + strcpy( eig_args.vec_outfile, param.ks_eigen_savefile ); + eig_args.preserve_evals = ( first_solve || fabs(mass - previous_mass) < 1e-6 ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + eig_args.batched_rotate = param.eigen_param.batchedRotate; + eig_args.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters? + eig_args.partfile = param.eigen_param.partfile ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + eig_args.io_parity_inflate = QUDA_BOOLEAN_FALSE; + eig_args.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + eig_args.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + if(param.eigen_param.eigPrec == 2)eig_args.prec_eigensolver = QUDA_DOUBLE_PRECISION; + else if(param.eigen_param.eigPrec == 1)eig_args.prec_eigensolver = QUDA_SINGLE_PRECISION; + else eig_args.prec_eigensolver = QUDA_HALF_PRECISION; + eig_args.tol_restart = param.eigen_param.tol_restart; previous_mass = mass; first_solve = false; - qudaInvert(MILC_PRECISION, + qudaInvertDeflatable(MILC_PRECISION, quda_precision, mass, inv_args, + eig_args, qic->resid, qic->relresid, fatlink, @@ -336,54 +321,39 @@ int ks_congrad_block_parity_gpu(int nsrc, su3_vector **t_src, su3_vector **t_des static Real previous_mass = -1.0; static bool first_solve=true; - QudaEigParam qep = newQudaEigParam(); - qep.block_size = blockSize; - qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */ - qep.spectrum = QUDA_SPECTRUM_SR_EIG; /* Smallest Real. Other options: LM, SM, LR, SR, LI, SI */ - qep.n_conv = (param.eigen_param.Nvecs_in > param.eigen_param.Nvecs) ? param.eigen_param.Nvecs_in : param.eigen_param.Nvecs; - qep.n_ev_deflate = param.eigen_param.Nvecs; - qep.n_ev = qep.n_conv; - qep.n_kr = (param.eigen_param.Nkr < qep.n_ev ) ? 2*qep.n_ev : param.eigen_param.Nkr; - qep.tol = param.eigen_param.tol; - qep.qr_tol = qep.tol; - qep.max_restarts = param.eigen_param.MaxIter; - qep.require_convergence = QUDA_BOOLEAN_TRUE; - qep.check_interval = 10; - qep.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - qep.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - qep.use_dagger = QUDA_BOOLEAN_FALSE; - qep.compute_gamma5 = QUDA_BOOLEAN_FALSE; - qep.compute_svd = QUDA_BOOLEAN_FALSE; - qep.use_eigen_qr = QUDA_BOOLEAN_TRUE; - qep.use_poly_acc = QUDA_BOOLEAN_TRUE; - qep.poly_deg = param.eigen_param.poly.norder; - qep.a_min = param.eigen_param.poly.minE; - qep.a_max = param.eigen_param.poly.maxE; - qep.arpack_check = QUDA_BOOLEAN_FALSE; - strcpy( qep.vec_infile, param.ks_eigen_startfile ); - strcpy( qep.vec_outfile, param.ks_eigen_savefile ); - qep.io_parity_inflate = QUDA_BOOLEAN_FALSE; - qep.compute_evals_batch_size = 16; - qep.preserve_deflation = QUDA_BOOLEAN_TRUE; - qep.preserve_evals = ( first_solve || fabs(mass - previous_mass) < 1e-6 ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - qep.batched_rotate = param.eigen_param.batchedRotate; - qep.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters? - qep.partfile = param.eigen_param.partfile ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - - inv_args.eig_param = qep; - if(param.eigen_param.eigPrec == 2)inv_args.prec_eigensolver = QUDA_DOUBLE_PRECISION; - else if(param.eigen_param.eigPrec == 1)inv_args.prec_eigensolver = QUDA_SINGLE_PRECISION; - else inv_args.prec_eigensolver = QUDA_HALF_PRECISION; - - inv_args.tol_restart = param.eigen_param.tol_restart; + QudaEigensolverArgs_t eig_args; + eig_args.block_size = blockSize; + eig_args.n_conv = (param.eigen_param.Nvecs_in > param.eigen_param.Nvecs) ? param.eigen_param.Nvecs_in : param.eigen_param.Nvecs; + eig_args.n_ev_deflate = param.eigen_param.Nvecs; + eig_args.n_ev = eig_args.n_conv; + eig_args.n_kr = (param.eigen_param.Nkr < eig_args.n_ev ) ? 2*eig_args.n_ev : param.eigen_param.Nkr; + eig_args.tol = param.eigen_param.tol; + eig_args.max_restarts = param.eigen_param.MaxIter; + eig_args.poly_deg = param.eigen_param.poly.norder; + eig_args.a_min = param.eigen_param.poly.minE; + eig_args.a_max = param.eigen_param.poly.maxE; + strcpy( eig_args.vec_infile, param.ks_eigen_startfile ); + strcpy( eig_args.vec_outfile, param.ks_eigen_savefile ); + eig_args.preserve_evals = ( first_solve || fabs(mass - previous_mass) < 1e-6 ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + eig_args.batched_rotate = param.eigen_param.batchedRotate; + eig_args.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters? + eig_args.partfile = param.eigen_param.partfile ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + eig_args.io_parity_inflate = QUDA_BOOLEAN_FALSE; + eig_args.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + eig_args.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + if(param.eigen_param.eigPrec == 2)eig_args.prec_eigensolver = QUDA_DOUBLE_PRECISION; + else if(param.eigen_param.eigPrec == 1)eig_args.prec_eigensolver = QUDA_SINGLE_PRECISION; + else eig_args.prec_eigensolver = QUDA_HALF_PRECISION; + eig_args.tol_restart = param.eigen_param.tol_restart; previous_mass = mass; first_solve = false; - qudaInvertMsrc(MILC_PRECISION, + qudaInvertMsrcDeflatable(MILC_PRECISION, quda_precision, mass, inv_args, + eig_args, qic->resid, qic->relresid, fatlink,