diff --git a/source/source_hsolver/CMakeLists.txt b/source/source_hsolver/CMakeLists.txt index b115d6d4cd2..95f7e23e230 100644 --- a/source/source_hsolver/CMakeLists.txt +++ b/source/source_hsolver/CMakeLists.txt @@ -4,6 +4,7 @@ list(APPEND objects diago_david.cpp diago_dav_subspace.cpp diago_bpcg.cpp + diago_ppcg.cpp para_linear_transform.cpp hsolver_pw.cpp hsolver_lcaopw.cpp diff --git a/source/source_hsolver/diago_ppcg.cpp b/source/source_hsolver/diago_ppcg.cpp new file mode 100644 index 00000000000..adc2471ae20 --- /dev/null +++ b/source/source_hsolver/diago_ppcg.cpp @@ -0,0 +1,1358 @@ +#include "diago_ppcg.h" + +// ----------------------------------------------------------------------------- +// LAPACK Fortran bindings (CPU only) +// ----------------------------------------------------------------------------- +extern "C" +{ +void dsyevd_(const char* jobz, const char* uplo, + const int* n, double* a, const int* lda, double* w, + double* work, const int* lwork, int* iwork, + const int* liwork, int* info); + +void ssyevd_(const char* jobz, const char* uplo, + const int* n, float* a, const int* lda, float* w, + float* work, const int* lwork, int* iwork, + const int* liwork, int* info); + +void dsygvd_(const int* itype, const char* jobz, const char* uplo, + const int* n, double* a, const int* lda, double* b, + const int* ldb, double* w, double* work, const int* lwork, + int* iwork, const int* liwork, int* info); + +void ssygvd_(const int* itype, const char* jobz, const char* uplo, + const int* n, float* a, const int* lda, float* b, + const int* ldb, float* w, float* work, const int* lwork, + int* iwork, const int* liwork, int* info); + +void dpotrf_(const char* uplo, const int* n, double* a, + const int* lda, int* info); +void spotrf_(const char* uplo, const int* n, float* a, + const int* lda, int* info); + +void dtrtri_(const char* uplo, const char* diag, + const int* n, double* a, const int* lda, int* info); +void strtri_(const char* uplo, const char* diag, + const int* n, float* a, const int* lda, int* info); +} + +namespace hsolver { + +// ============================================================================= +// LAPACK wrapper (specialized per real type) +// ============================================================================= +namespace { + +template +struct Lapack; + +template <> +struct Lapack +{ + static void syevd(int n, double* a, double* w) + { + const char jobz = 'V'; + const char uplo = 'U'; + const int lda = n; + int info = 0; + int lwork = -1; + int liwork = -1; + std::vector work(1); + std::vector iwork(1); + dsyevd_(&jobz, &uplo, &n, a, &lda, w, + work.data(), &lwork, iwork.data(), &liwork, &info); + if (info != 0) + { + lwork = std::max(1, 1 + 6 * n + 2 * n * n); + liwork = std::max(1, 3 + 5 * n); + } + else + { + lwork = static_cast(work[0]); + liwork = std::max(1, iwork[0]); + } + work.assign(static_cast(lwork), 0.0); + iwork.assign(static_cast(liwork), 0); + dsyevd_(&jobz, &uplo, &n, a, &lda, w, + work.data(), &lwork, iwork.data(), &liwork, &info); + if (info != 0) + throw std::runtime_error("PPCG: dsyevd failed."); + } + + static void sygvd(int n, double* a, double* b, double* w) + { + const int itype = 1; + const char jobz = 'V'; + const char uplo = 'U'; + const int lda = n; + const int ldb = n; + int info = 0; + int lwork = -1; + int liwork = -1; + std::vector work(1); + std::vector iwork(1); + dsygvd_(&itype, &jobz, &uplo, &n, a, &lda, b, &ldb, w, + work.data(), &lwork, iwork.data(), &liwork, &info); + if (info != 0) + { + lwork = std::max(1, 1 + 18 * n + 10 * n * n); + liwork = std::max(1, 3 + 10 * n); + } + else + { + lwork = static_cast(work[0]); + liwork = std::max(1, iwork[0]); + } + work.assign(static_cast(lwork), 0.0); + iwork.assign(static_cast(liwork), 0); + dsygvd_(&itype, &jobz, &uplo, &n, a, &lda, b, &ldb, w, + work.data(), &lwork, iwork.data(), &liwork, &info); + if (info != 0) + throw std::runtime_error("PPCG: dsygvd failed."); + } + + static void potrf(int n, double* a) + { + const char uplo = 'U'; + const int lda = n; + int info = 0; + + // Save a copy so we can restore and retry with a diagonal shift. + double diag_max = 0; + for (int i = 0; i < n; ++i) + diag_max = std::max(diag_max, std::abs(a[i + i * lda])); + std::vector a0(a, a + n * lda); + + for (const double shift : {0.0, 1e-12, 1e-10, 1e-8, 1e-6, 1e-4, 1e-3, 1e-2, 1e-1, 1.0}) { + // Restore original and apply shift + std::copy(a0.begin(), a0.end(), a); + if (shift > 0) { + for (int i = 0; i < n; ++i) + a[i + i * lda] += shift * std::max(diag_max, 1.0); + } + info = 0; + dpotrf_(&uplo, &n, a, &lda, &info); + if (info == 0) return; + } + throw std::runtime_error("PPCG: dpotrf failed."); + } + + static void trtri(int n, double* a) + { + const char uplo = 'U'; + const char diag = 'N'; + const int lda = n; + int info = 0; + dtrtri_(&uplo, &diag, &n, a, &lda, &info); + if (info != 0) + throw std::runtime_error("PPCG: dtrtri failed."); + } +}; + +template <> +struct Lapack +{ + static void syevd(int n, float* a, float* w) + { + const char jobz = 'V'; + const char uplo = 'U'; + const int lda = n; + int info = 0; + int lwork = -1; + int liwork = -1; + std::vector work(1); + std::vector iwork(1); + ssyevd_(&jobz, &uplo, &n, a, &lda, w, + work.data(), &lwork, iwork.data(), &liwork, &info); + if (info != 0) + { + lwork = std::max(1, 1 + 6 * n + 2 * n * n); + liwork = std::max(1, 3 + 5 * n); + } + else + { + lwork = static_cast(work[0]); + liwork = std::max(1, iwork[0]); + } + work.assign(static_cast(lwork), 0.0f); + iwork.assign(static_cast(liwork), 0); + ssyevd_(&jobz, &uplo, &n, a, &lda, w, + work.data(), &lwork, iwork.data(), &liwork, &info); + if (info != 0) + throw std::runtime_error("PPCG: ssyevd failed."); + } + + static void sygvd(int n, float* a, float* b, float* w) + { + const int itype = 1; + const char jobz = 'V'; + const char uplo = 'U'; + const int lda = n; + const int ldb = n; + int info = 0; + int lwork = -1; + int liwork = -1; + std::vector work(1); + std::vector iwork(1); + ssygvd_(&itype, &jobz, &uplo, &n, a, &lda, b, &ldb, w, + work.data(), &lwork, iwork.data(), &liwork, &info); + if (info != 0) + { + lwork = std::max(1, 1 + 18 * n + 10 * n * n); + liwork = std::max(1, 3 + 10 * n); + } + else + { + lwork = static_cast(work[0]); + liwork = std::max(1, iwork[0]); + } + work.assign(static_cast(lwork), 0.0f); + iwork.assign(static_cast(liwork), 0); + ssygvd_(&itype, &jobz, &uplo, &n, a, &lda, b, &ldb, w, + work.data(), &lwork, iwork.data(), &liwork, &info); + if (info != 0) + throw std::runtime_error("PPCG: ssygvd failed."); + } + + static void potrf(int n, float* a) + { + const char uplo = 'U'; + const int lda = n; + int info = 0; + + float diag_max = 0; + for (int i = 0; i < n; ++i) + diag_max = std::max(diag_max, std::abs(a[i + i * lda])); + std::vector a0(a, a + n * lda); + + for (const float shift : {0.0f, 1e-12f, 1e-10f, 1e-8f, 1e-6f, 1e-4f, 1e-3f, 1e-2f, 1e-1f, 1.0f}) { + std::copy(a0.begin(), a0.end(), a); + if (shift > 0) { + for (int i = 0; i < n; ++i) + a[i + i * lda] += shift * std::max(diag_max, 1.0f); + } + info = 0; + spotrf_(&uplo, &n, a, &lda, &info); + if (info == 0) return; + } + throw std::runtime_error("PPCG: spotrf failed."); + } + + static void trtri(int n, float* a) + { + const char uplo = 'U'; + const char diag = 'N'; + const int lda = n; + int info = 0; + strtri_(&uplo, &diag, &n, a, &lda, &info); + if (info != 0) + throw std::runtime_error("PPCG: strtri failed."); + } +}; + +template +inline void set_zero(std::vector& x) +{ + std::fill(x.begin(), x.end(), T(0)); +} + +} // anonymous namespace + +// ============================================================================= +// Constructor +// ============================================================================= +template +DiagoPPCG::DiagoPPCG(const Real& diag_thr, + const int& diag_iter_max, + const int& sbsize, + const int& rr_step, + const bool gamma_g0_real, + const PpcgStrategy strategy) + : maxiter_(diag_iter_max), + sbsize_(std::max(1, sbsize)), + rr_step_(std::max(1, rr_step)), + diag_thr_(std::max(diag_thr, static_cast(1.0e-14))), + gamma_g0_real_(gamma_g0_real), + strategy_(strategy) +{ +} + +// ============================================================================= +// Input validation +// ============================================================================= +template +void DiagoPPCG::validate_input( + const T* psi_in, + const Real* eigenvalue_in, + const std::vector& ethr_band, + const Real* prec) const +{ + if (psi_in == nullptr || eigenvalue_in == nullptr) + throw std::invalid_argument("PPCG: psi/eigenvalue pointer is null."); + if (prec == nullptr) + throw std::invalid_argument("PPCG: preconditioner pointer is null."); + if (ld_psi_ <= 0 || n_band_ <= 0 || n_dim_ <= 0) + throw std::invalid_argument("PPCG: invalid dimensions."); + if (n_dim_ > ld_psi_) + throw std::invalid_argument("PPCG: dim must not exceed ld_psi."); + if (ethr_band.size() < static_cast(n_band_)) + throw std::invalid_argument("PPCG: ethr_band size is smaller than nband."); +} + +// ============================================================================= +// Gamma-point symmetry: enforce real-valued first element +// ============================================================================= +template +void DiagoPPCG::force_g0_real(T* x, int ncol) const +{ + if (!gamma_g0_real_ || n_dim_ <= 0) + return; + for (int j = 0; j < ncol; ++j) + x[idx(0, j, ld_psi_)] = T(std::real(x[idx(0, j, ld_psi_)]), 0.0); +} + +// ============================================================================= +// Operator application +// ============================================================================= +template +void DiagoPPCG::apply_h(const HPsiFunc& hpsi_func, + T* psi_in, T* hpsi_out, + int ncol) const +{ + hpsi_func(psi_in, hpsi_out, ld_psi_, ncol); +} + +template +void DiagoPPCG::apply_s(const SPsiFunc& spsi_func, + T* psi_in, T* spsi_out, + int ncol) const +{ + if (spsi_func) + spsi_func(psi_in, spsi_out, ld_psi_, ncol); + else + for (int j = 0; j < ncol; ++j) + std::copy(psi_in + j * ld_psi_, psi_in + (j + 1) * ld_psi_, + spsi_out + j * ld_psi_); +} + +template +void DiagoPPCG::apply_s_current(T* psi_in, T* spsi_out, + int ncol) const +{ + apply_s(spsi_func_, psi_in, spsi_out, ncol); +} + +// ============================================================================= +// Inner product (real part only, for Hermitian operators) +// ============================================================================= +template +typename DiagoPPCG::Real +DiagoPPCG::gamma_dot(const T* x, const T* y) const +{ + Real acc = 0; + for (int i = 0; i < n_dim_; ++i) + acc += static_cast(std::real(std::conj(x[i]) * y[i])); + return acc; +} + +// ============================================================================= +// Gram matrix: out[i, j] = +// ============================================================================= +template +void DiagoPPCG::gram(const T* a, const T* b, + int ncol_a, int ncol_b, + std::vector& out, + int ld_out) const +{ + out.assign(ld_out * ncol_b, static_cast(0)); + for (int jb = 0; jb < ncol_b; ++jb) + for (int ia = 0; ia < ncol_a; ++ia) + out[ia + jb * ld_out] = gamma_dot(a + ia * ld_psi_, + b + jb * ld_psi_); +} + +// ============================================================================= +// Column gather: extract selected columns into contiguous storage +// ============================================================================= +template +void DiagoPPCG::copy_cols(const T* src, + const std::vector& cols, + std::vector& dst) const +{ + dst.assign(ld_psi_ * cols.size(), T(0)); + for (int j = 0; j < static_cast(cols.size()); ++j) + { + const int c = cols[j]; + std::copy(src + c * ld_psi_, src + c * ld_psi_ + ld_psi_, + dst.begin() + j * ld_psi_); + } +} + +// ============================================================================= +// Column scatter: write contiguous storage back into selected columns +// ============================================================================= +template +void DiagoPPCG::scatter_cols( + T* dst, + const std::vector& cols, + const std::vector& src) const +{ + for (int j = 0; j < static_cast(cols.size()); ++j) + { + const int c = cols[j]; + std::copy(src.begin() + j * ld_psi_, + src.begin() + (j + 1) * ld_psi_, + dst + c * ld_psi_); + } +} + +// ============================================================================= +// Project x onto vectors orthogonal to S-orthonormal basis +// ============================================================================= +template +void DiagoPPCG::project_against( + const T* basis, const T* sbasis, + const std::vector& basis_cols, + std::vector& x, std::vector& sx, + const std::vector& x_cols) const +{ + if (basis_cols.empty() || x_cols.empty()) + return; + + for (const int c : x_cols) + { + for (const int bc : basis_cols) + { + const Real coeff = gamma_dot(basis + bc * ld_psi_, + sx.data() + c * ld_psi_); + if (std::abs(coeff) <= std::numeric_limits::epsilon()) + continue; + for (int ig = 0; ig < n_dim_; ++ig) + { + x[ idx(ig, c, ld_psi_)] -= basis[ idx(ig, bc, ld_psi_)] * coeff; + sx[idx(ig, c, ld_psi_)] -= sbasis[idx(ig, bc, ld_psi_)] * coeff; + } + } + } +} + +// ============================================================================= +// Preconditioner: x[c] /= max(prec, eps) for each active column c +// ============================================================================= +template +void DiagoPPCG::divide_by_preconditioner( + const std::vector& active_cols, + const Real* prec, + std::vector& x) const +{ + for (const int c : active_cols) + for (int ig = 0; ig < n_dim_; ++ig) + x[idx(ig, c, ld_psi_)] /= + std::max(prec[ig], static_cast(1.0e-12)); +} + +//============================================================================== +// BLOCK_SUBSPACE STRATEGY +//============================================================================== + +// --------------------------------------------------------------------------- +// Lock converged eigenpairs: columns with residual below threshold +// --------------------------------------------------------------------------- +template +void DiagoPPCG::lock_epairs( + const std::vector& residual, + const std::vector& ethr_band, + std::vector& active_cols) const +{ + active_cols.clear(); + for (int j = 0; j < n_band_; ++j) + { + Real nrm2 = 0; + for (int ig = 0; ig < n_dim_; ++ig) + nrm2 += static_cast(std::norm(residual[idx(ig, j, ld_psi_)])); + const Real rnrm = std::sqrt(std::max(nrm2, static_cast(0))); + const Real thr = std::max(static_cast(ethr_band[j]), diag_thr_); + if (rnrm > thr) + active_cols.push_back(j); + } +} + +// --------------------------------------------------------------------------- +// Build K = V^H H V and M = V^H S V where V = [psi, w, p] +// --------------------------------------------------------------------------- +template +void DiagoPPCG::build_small_subspace( + const T* psi, + const std::vector& cols, + bool use_p, + SmallSubspace& subspace) const +{ + const int l = static_cast(cols.size()); + const int nblk = use_p ? 3 : 2; + const int dim = nblk * l; + subspace.k.assign(dim * dim, static_cast(0)); + subspace.m.assign(dim * dim, static_cast(0)); + subspace.eval.assign(dim, static_cast(0)); + + std::vector psi_l, spsi_l, hpsi_l; + std::vector w_l, sw_l, hw_l; + std::vector p_l, sp_l, hp_l; + copy_cols(psi, cols, psi_l); + copy_cols(spsi_.data(), cols, spsi_l); + copy_cols(hpsi_.data(), cols, hpsi_l); + copy_cols(w_.data(), cols, w_l); + copy_cols(sw_.data(), cols, sw_l); + copy_cols(hw_.data(), cols, hw_l); + if (use_p) + { + copy_cols(p_.data(), cols, p_l); + copy_cols(sp_.data(), cols, sp_l); + copy_cols(hp_.data(), cols, hp_l); + } + + auto fill_sym = [&](const std::vector& a, const std::vector& b, + int r0, int c0, std::vector& mat) + { + std::vector g; + gram(a.data(), b.data(), l, l, g, l); + for (int j = 0; j < l; ++j) + for (int i = 0; i < l; ++i) + { + mat[(r0 + i) + (c0 + j) * dim] = g[i + j * l]; + mat[(c0 + j) + (r0 + i) * dim] = g[i + j * l]; + } + }; + + fill_sym(psi_l, hpsi_l, 0, 0, subspace.k); + fill_sym(psi_l, spsi_l, 0, 0, subspace.m); + fill_sym(w_l, hw_l, l, l, subspace.k); + fill_sym(w_l, sw_l, l, l, subspace.m); + fill_sym(psi_l, hw_l, 0, l, subspace.k); + fill_sym(psi_l, sw_l, 0, l, subspace.m); + + if (use_p) + { + fill_sym(p_l, hp_l, 2*l, 2*l, subspace.k); + fill_sym(p_l, sp_l, 2*l, 2*l, subspace.m); + fill_sym(psi_l, hp_l, 0, 2*l, subspace.k); + fill_sym(psi_l, sp_l, 0, 2*l, subspace.m); + fill_sym(w_l, hp_l, l, 2*l, subspace.k); + fill_sym(w_l, sp_l, l, 2*l, subspace.m); + } +} + +// --------------------------------------------------------------------------- +// Solve K v = λ M v (small generalized eigenvalue problem) +// --------------------------------------------------------------------------- +template +void DiagoPPCG::solve_small_generalized( + int dim, SmallSubspace& subspace) const +{ + // Try with increasing diagonal shifts; fall back to identity (no update) + // if the subspace is too ill-conditioned. + // Save original M; dsygvd modifies it in-place before it may fail. + const std::vector m0 = subspace.m; + const Real shifts[] = {static_cast(1e-10), + static_cast(1e-8), + static_cast(1e-6)}; + for (int attempt = 0; attempt < 3; ++attempt) + { + try + { + Lapack::sygvd(dim, subspace.k.data(), subspace.m.data(), + subspace.eval.data()); + return; + } + catch (const std::runtime_error&) + { + subspace.m = m0; + for (int i = 0; i < dim; ++i) + subspace.m[i + i * dim] += shifts[attempt]; + } + } + // All attempts failed — set eigenvectors to identity (no update). + std::fill(subspace.k.begin(), subspace.k.end(), static_cast(0)); + for (int i = 0; i < dim; ++i) + subspace.k[i + i * dim] = static_cast(1); + std::fill(subspace.eval.begin(), subspace.eval.end(), static_cast(0)); +} + +// --------------------------------------------------------------------------- +// Update wavefunctions from small subspace eigenvectors +// --------------------------------------------------------------------------- +template +void DiagoPPCG::update_one_block( + T* psi, + const std::vector& cols, + int l, + bool use_p, + const SmallSubspace& subspace) +{ + const int dim = (use_p ? 3 : 2) * l; + const Real* eigvec = subspace.k.data(); + + std::vector psi_l, spsi_l, hpsi_l; + std::vector w_l, sw_l, hw_l; + std::vector p_l, sp_l, hp_l; + copy_cols(psi, cols, psi_l); + copy_cols(spsi_.data(), cols, spsi_l); + copy_cols(hpsi_.data(), cols, hpsi_l); + copy_cols(w_.data(), cols, w_l); + copy_cols(sw_.data(), cols, sw_l); + copy_cols(hw_.data(), cols, hw_l); + if (use_p) + { + copy_cols(p_.data(), cols, p_l); + copy_cols(sp_.data(), cols, sp_l); + copy_cols(hp_.data(), cols, hp_l); + } + + std::vector psi_new(ld_psi_ * l, T(0)); + std::vector spsi_new(ld_psi_ * l, T(0)); + std::vector hpsi_new(ld_psi_ * l, T(0)); + std::vector p_new(ld_psi_ * l, T(0)); + std::vector sp_new(ld_psi_ * l, T(0)); + std::vector hp_new(ld_psi_ * l, T(0)); + + for (int j = 0; j < l; ++j) + { + for (int i = 0; i < l; ++i) + { + const Real cpsi = eigvec[i + j * dim]; + const Real cw = eigvec[(l + i) + j * dim]; + + for (int ig = 0; ig < n_dim_; ++ig) + { + psi_new[idx(ig, j, ld_psi_)] += psi_l[idx(ig, i, ld_psi_)] * cpsi + + w_l[ idx(ig, i, ld_psi_)] * cw; + spsi_new[idx(ig, j, ld_psi_)] += spsi_l[idx(ig, i, ld_psi_)] * cpsi + + sw_l[ idx(ig, i, ld_psi_)] * cw; + hpsi_new[idx(ig, j, ld_psi_)] += hpsi_l[idx(ig, i, ld_psi_)] * cpsi + + hw_l[ idx(ig, i, ld_psi_)] * cw; + p_new[idx(ig, j, ld_psi_)] += w_l[ idx(ig, i, ld_psi_)] * cw; + sp_new[idx(ig, j, ld_psi_)] += sw_l[ idx(ig, i, ld_psi_)] * cw; + hp_new[idx(ig, j, ld_psi_)] += hw_l[ idx(ig, i, ld_psi_)] * cw; + } + + if (use_p) + { + const Real cp = eigvec[(2*l + i) + j * dim]; + for (int ig = 0; ig < n_dim_; ++ig) + { + psi_new[idx(ig, j, ld_psi_)] += p_l[ idx(ig, i, ld_psi_)] * cp; + spsi_new[idx(ig, j, ld_psi_)] += sp_l[idx(ig, i, ld_psi_)] * cp; + hpsi_new[idx(ig, j, ld_psi_)] += hp_l[idx(ig, i, ld_psi_)] * cp; + p_new[idx(ig, j, ld_psi_)] += p_l[ idx(ig, i, ld_psi_)] * cp; + sp_new[idx(ig, j, ld_psi_)] += sp_l[idx(ig, i, ld_psi_)] * cp; + hp_new[idx(ig, j, ld_psi_)] += hp_l[idx(ig, i, ld_psi_)] * cp; + } + } + } + } + + scatter_cols(psi, cols, psi_new); + scatter_cols(spsi_.data(), cols, spsi_new); + scatter_cols(hpsi_.data(), cols, hpsi_new); + scatter_cols(p_.data(), cols, p_new); + scatter_cols(sp_.data(), cols, sp_new); + scatter_cols(hp_.data(), cols, hp_new); +} + +// --------------------------------------------------------------------------- +// Back-substitute with upper triangular Cholesky factor: X *= R^{-1} +// --------------------------------------------------------------------------- +template +void DiagoPPCG::right_solve_upper_real( + const std::vector& r, int n, std::vector& x) const +{ + std::vector b = x; + for (int row = 0; row < n_dim_; ++row) + { + for (int j = 0; j < n; ++j) + { + T v = b[idx(row, j, ld_psi_)]; + for (int k = 0; k < j; ++k) + v -= x[idx(row, k, ld_psi_)] * r[k + j * n]; + x[idx(row, j, ld_psi_)] = v / r[j + j * n]; + } + } +} + +// --------------------------------------------------------------------------- +// Cholesky QR: S-orthonormalize active columns via Cholesky on S-gram +// --------------------------------------------------------------------------- +template +void DiagoPPCG::chol_qr_active( + T* psi, const std::vector& active_cols) +{ + if (active_cols.empty()) + return; + + const int nact = static_cast(active_cols.size()); + std::vector psi_a, spsi_a, hpsi_a; + copy_cols(psi, active_cols, psi_a); + copy_cols(spsi_.data(), active_cols, spsi_a); + copy_cols(hpsi_.data(), active_cols, hpsi_a); + + std::vector s(nact * nact, static_cast(0)); + gram(psi_a.data(), spsi_a.data(), nact, nact, s, nact); + + Lapack::potrf(nact, s.data()); + right_solve_upper_real(s, nact, psi_a); + right_solve_upper_real(s, nact, spsi_a); + right_solve_upper_real(s, nact, hpsi_a); + + scatter_cols(psi, active_cols, psi_a); + scatter_cols(spsi_.data(), active_cols, spsi_a); + scatter_cols(hpsi_.data(), active_cols, hpsi_a); +} + +// --------------------------------------------------------------------------- +// Rayleigh-Ritz: full subspace diagonalization + residual computation +// --------------------------------------------------------------------------- +template +void DiagoPPCG::rayleigh_ritz( + T* psi, Real* eigenvalue, + std::vector& active_cols, + const std::vector& ethr_band) +{ + std::vector hsub(n_band_ * n_band_, static_cast(0)); + std::vector ssub(n_band_ * n_band_, static_cast(0)); + gram(psi, hpsi_.data(), n_band_, n_band_, hsub, n_band_); + gram(psi, spsi_.data(), n_band_, n_band_, ssub, n_band_); + + std::vector eval(n_band_, static_cast(0)); + Lapack::sygvd(n_band_, hsub.data(), ssub.data(), eval.data()); + + std::vector psi_old(psi, psi + ld_psi_ * n_band_); + std::vector spsi_old = spsi_; + std::vector hpsi_old = hpsi_; + + std::fill(psi, psi + ld_psi_ * n_band_, T(0)); + set_zero(spsi_); + set_zero(hpsi_); + + for (int j = 0; j < n_band_; ++j) + { + for (int i = 0; i < n_band_; ++i) + { + const Real c = hsub[i + j * n_band_]; + for (int ig = 0; ig < n_dim_; ++ig) + { + psi[ idx(ig, j, ld_psi_)] += psi_old[ idx(ig, i, ld_psi_)] * c; + spsi_[idx(ig, j, ld_psi_)] += spsi_old[idx(ig, i, ld_psi_)] * c; + hpsi_[idx(ig, j, ld_psi_)] += hpsi_old[idx(ig, i, ld_psi_)] * c; + } + } + eigenvalue[j] = eval[j]; + } + + // Compute residual: w_i = H|psi_i> - eps_i * S|psi_i> + set_zero(w_); + for (int j = 0; j < n_band_; ++j) + for (int ig = 0; ig < n_dim_; ++ig) + w_[idx(ig, j, ld_psi_)] = hpsi_[idx(ig, j, ld_psi_)] + - spsi_[idx(ig, j, ld_psi_)] * eigenvalue[j]; + + lock_epairs(w_, ethr_band, active_cols); +} + +// --------------------------------------------------------------------------- +// Trace of H|psi> within active columns +// --------------------------------------------------------------------------- +template +typename DiagoPPCG::Real +DiagoPPCG::trace_of_active_projected( + const T* psi, const std::vector& active_cols) const +{ + if (active_cols.empty()) + return static_cast(0); + + std::vector psi_a, hpsi_a; + copy_cols(psi, active_cols, psi_a); + copy_cols(hpsi_.data(), active_cols, hpsi_a); + + const int nact = static_cast(active_cols.size()); + std::vector g(nact * nact, static_cast(0)); + gram(psi_a.data(), hpsi_a.data(), nact, nact, g, nact); + + Real tr = 0; + for (int i = 0; i < nact; ++i) + tr += g[i + i * nact]; + return tr; +} + +//============================================================================== +// CONJUGATE_GRADIENT STRATEGY +//============================================================================== + +// --------------------------------------------------------------------------- +// Compute gradient: grad_i = H|psi_i> - eps_i * S|psi_i> +// --------------------------------------------------------------------------- +template +void DiagoPPCG::calc_gradient( + const Real* /*prec*/, + const T* hpsi, + const T* spsi, + const T* /*psi*/, + const Real* eigenvalue, + std::vector& grad) const +{ + grad.assign(ld_psi_ * n_band_, T(0)); + for (int j = 0; j < n_band_; ++j) + { + const Real ej = eigenvalue[j]; + for (int ig = 0; ig < n_dim_; ++ig) + grad[idx(ig, j, ld_psi_)] = hpsi[idx(ig, j, ld_psi_)] + - spsi[idx(ig, j, ld_psi_)] * ej; + } +} + +// --------------------------------------------------------------------------- +// Orthogonalize gradient: grad_j -= sum_i * S|psi_i> +// --------------------------------------------------------------------------- +template +void DiagoPPCG::orth_gradient( + const T* psi, const T* spsi, + std::vector& grad) const +{ + for (int j = 0; j < n_band_; ++j) + { + for (int i = 0; i < n_band_; ++i) + { + const Real coeff = gamma_dot(psi + i * ld_psi_, + grad.data() + j * ld_psi_); + if (std::abs(coeff) <= std::numeric_limits::epsilon()) + continue; + for (int ig = 0; ig < n_dim_; ++ig) + grad[idx(ig, j, ld_psi_)] -= spsi[idx(ig, i, ld_psi_)] * coeff; + } + } +} + +// --------------------------------------------------------------------------- +// Polak-Ribiere conjugate gradient update with preconditioning: +// z_new = -P^{-1} * r_new +// beta = max(0, / ) +// d_new = z_new + beta * d_old +// --------------------------------------------------------------------------- +template +void DiagoPPCG::update_polak_ribiere( + const std::vector& grad, + std::vector& p, + std::vector& grad_old, + std::vector& z_old, + std::vector& beta_denom, + const Real* prec) const +{ + const bool first_iter = p.empty(); + if (first_iter) + { + p.assign(ld_psi_ * n_band_, T(0)); + z_old.assign(ld_psi_ * n_band_, T(0)); + beta_denom.assign(n_band_, std::numeric_limits::infinity()); + } + + std::vector z_new(ld_psi_ * n_band_, T(0)); + + for (int j = 0; j < n_band_; ++j) + { + const T* g = grad.data() + j * ld_psi_; + T* pj = p.data() + j * ld_psi_; + T* zn = z_new.data() + j * ld_psi_; + T* zo = z_old.data() + j * ld_psi_; + + Real beta_num_zr = 0; + Real beta_num_zo = 0; + + for (int ig = 0; ig < n_dim_; ++ig) + { + // z_new = -P^{-1} * grad + T z = -g[ig] / std::max(prec[ig], static_cast(1.0e-12)); + zn[ig] = z; + + // r_old = -P * z_old (recover old raw residual) + T r_old = -prec[ig] * zo[ig]; + + beta_num_zr += static_cast(std::real(z * std::conj(g[ig]))); + beta_num_zo += static_cast(std::real(z * std::conj(r_old))); + } + + Real beta = 0; + const Real denom = beta_denom[j]; + if (denom > static_cast(1.0e-30)) + { + beta = (beta_num_zr - beta_num_zo) / denom; + if (beta < 0) + beta = 0; + } + + // d_new = z_new + beta * d_old + for (int ig = 0; ig < n_dim_; ++ig) + pj[ig] = zn[ig] + beta * pj[ig]; + + // Save as denominator for next iteration. + beta_denom[j] = beta_num_zr + static_cast(1.0e-30); + } + + // Persist state for next iteration. + z_old.swap(z_new); + grad_old = grad; +} + +// --------------------------------------------------------------------------- +// Line minimization along search direction: +// For each band j: find optimal step α by minimizing the Rayleigh quotient +// in the 2D subspace spanned by |psi_j> and |p_j>. +// +// The optimal α satisfies: +// α = (h_ii * s_ip - h_ip * s_ii) / (h_pp * s_ii - h_ii * s_pp) +// +// Update: |psi> += α |p> +// H|psi> += α H|p> +// S|psi> += α S|p> +// --------------------------------------------------------------------------- +template +void DiagoPPCG::line_minimize( + T* psi, T* hpsi, T* spsi, + const T* p, const T* hp, const T* sp, + int ncol) const +{ + for (int j = 0; j < ncol; ++j) + { + const int off = j * ld_psi_; + T* pj = psi + off; + T* hj = hpsi + off; + T* sj = spsi + off; + const T* pp = p + off; + const T* hpp = hp + off; + const T* spp = sp + off; + + Real h_ii = gamma_dot(pj, hj); + Real s_ii = gamma_dot(pj, sj); + Real h_ip = gamma_dot(pj, hpp); + Real s_ip = gamma_dot(pj, spp); + Real h_pp = gamma_dot(pp, hpp); + Real s_pp = gamma_dot(pp, spp); + + Real alpha = 0; + Real denom = h_pp * s_ii - h_ii * s_pp; + if (std::abs(denom) > static_cast(1.0e-12)) + alpha = (h_ii * s_ip - h_ip * s_ii) / denom; + + for (int ig = 0; ig < n_dim_; ++ig) + { + pj[ig] += alpha * pp[ig]; + hj[ig] += alpha * hpp[ig]; + sj[ig] += alpha * spp[ig]; + } + } +} + +// --------------------------------------------------------------------------- +// Cholesky orthonormalization (S-orthonormal): +// 1. Form S-gram matrix J = psi^H * S * psi +// 2. Cholesky: J = U^T * U (upper) +// 3. Invert U: U^{-1} +// 4. psi *= U^{-1}, Hpsi *= U^{-1}, Spsi *= U^{-1} +// --------------------------------------------------------------------------- +template +void DiagoPPCG::orth_cholesky( + T* psi, T* hpsi, T* spsi, int ncol) const +{ + // Gram matrix of S-orthonormality: J_{ij} = + std::vector gram_s(ncol * ncol, static_cast(0)); + for (int j = 0; j < ncol; ++j) + for (int i = 0; i < ncol; ++i) + gram_s[i + j * ncol] = gamma_dot(psi + i * ld_psi_, + spsi + j * ld_psi_); + + // Cholesky factorization: gram_s = U^T U (U upper) + Lapack::potrf(ncol, gram_s.data()); + + // In-place triangular inverse: gram_s now holds U^{-1} + Lapack::trtri(ncol, gram_s.data()); + + // Right-multiply: result = input * U^{-1} + std::vector tmp(ld_psi_ * ncol, T(0)); + for (int j = 0; j < ncol; ++j) + { + for (int i = 0; i < ncol; ++i) + { + const Real uinv = gram_s[i + j * ncol]; + for (int ig = 0; ig < n_dim_; ++ig) + { + tmp[idx(ig, j, ld_psi_)] += psi[ idx(ig, i, ld_psi_)] * uinv; + } + } + } + std::copy(tmp.begin(), tmp.end(), psi); + + set_zero(tmp); + for (int j = 0; j < ncol; ++j) + { + for (int i = 0; i < ncol; ++i) + { + const Real uinv = gram_s[i + j * ncol]; + for (int ig = 0; ig < n_dim_; ++ig) + { + tmp[idx(ig, j, ld_psi_)] += hpsi[idx(ig, i, ld_psi_)] * uinv; + } + } + } + std::copy(tmp.begin(), tmp.end(), hpsi); + + set_zero(tmp); + for (int j = 0; j < ncol; ++j) + { + for (int i = 0; i < ncol; ++i) + { + const Real uinv = gram_s[i + j * ncol]; + for (int ig = 0; ig < n_dim_; ++ig) + { + tmp[idx(ig, j, ld_psi_)] += spsi[idx(ig, i, ld_psi_)] * uinv; + } + } + } + std::copy(tmp.begin(), tmp.end(), spsi); +} + +//============================================================================== +// MAIN DIAGONALIZATION ROUTINE +//============================================================================== +template +double DiagoPPCG::diag(const HPsiFunc& hpsi_func, + const SPsiFunc& spsi_func, + int ld_psi, + int nband, + int dim, + T* psi_in, + Real* eigenvalue_in, + const std::vector& ethr_band, + const Real* prec) +{ + ld_psi_ = ld_psi; + n_band_ = nband; + n_dim_ = dim; + + validate_input(psi_in, eigenvalue_in, ethr_band, prec); + spsi_func_ = spsi_func; + + // Allocate working storage. + const int ncol = n_band_; + const int sz = ld_psi_ * ncol; + + hpsi_.assign(sz, T(0)); + spsi_.assign(sz, T(0)); + w_.assign(sz, T(0)); + sw_.assign(sz, T(0)); + hw_.assign(sz, T(0)); + p_.assign(sz, T(0)); + sp_.assign(sz, T(0)); + hp_.assign(sz, T(0)); + + std::vector all_cols(ncol); + std::iota(all_cols.begin(), all_cols.end(), 0); + + force_g0_real(psi_in, ncol); + apply_h(hpsi_func, psi_in, hpsi_.data(), ncol); + apply_s_current(psi_in, spsi_.data(), ncol); + + double avg_iter = 1.0; + int iter = 1; + std::vector active_cols; + + // --------------------------------------------------------------------------- + // Strategy dispatch + // --------------------------------------------------------------------------- + if (strategy_ == PpcgStrategy::BLOCK_SUBSPACE) + { + // Initialize with Rayleigh-Ritz. + rayleigh_ritz(psi_in, eigenvalue_in, active_cols, ethr_band); + // Recompute to keep hpsi/spi consistent with rotated psi. + apply_h(hpsi_func, psi_in, hpsi_.data(), ncol); + apply_s_current(psi_in, spsi_.data(), ncol); + + Real trG = trace_of_active_projected(psi_in, active_cols); + Real trdif = static_cast(-1); + + while (!active_cols.empty() && iter <= maxiter_) + { + const int nact = static_cast(active_cols.size()); + const int nsb = std::max(1, (nact + sbsize_ - 1) / sbsize_); + const Real trtol = diag_thr_ * std::sqrt(static_cast(nact)); + + // Precondition the residual. + divide_by_preconditioner(active_cols, prec, w_); + apply_s_current(w_.data(), sw_.data(), ncol); + project_against(psi_in, spsi_.data(), all_cols, w_, sw_, active_cols); + + // Apply H to the search direction. + std::vector w_active; + copy_cols(w_.data(), active_cols, w_active); + force_g0_real(w_active.data(), nact); + std::vector hw_active(ld_psi_ * nact, T(0)); + scatter_cols(w_.data(), active_cols, w_active); + apply_h(hpsi_func, w_active.data(), hw_active.data(), nact); + scatter_cols(hw_.data(), active_cols, hw_active); + apply_s_current(w_.data(), sw_.data(), ncol); + + avg_iter += static_cast(nact) / static_cast(ncol); + + bool use_p = (iter != 1); + if (use_p) + { + apply_s_current(p_.data(), sp_.data(), ncol); + project_against(psi_in, spsi_.data(), all_cols, p_, sp_, active_cols); + + // For small nband with S=I, p can be nearly collinear + // with w (p gets initialized as a scalar multiple of w + // in update_one_block). This makes the 3-vector subspace + // [psi,w,p] nearly rank-2, causing sygvd to produce + // huge/negative eigenvalues -> NaN. + // + // When detected, replace p with H·w (a second-order + // Krylov direction) which is genuinely independent of w. + bool p_bad = false; + for (const int c : active_cols) + { + Real p_nrm2 = 0, w_nrm2 = 0, pw_re = 0; + for (int ig = 0; ig < n_dim_; ++ig) + { + p_nrm2 += static_cast(std::norm(p_[idx(ig, c, ld_psi_)])); + w_nrm2 += static_cast(std::norm(w_[idx(ig, c, ld_psi_)])); + pw_re += static_cast( + std::real(std::conj(p_[idx(ig, c, ld_psi_)]) + * w_[idx(ig, c, ld_psi_)])); + } + // p near-zero or p nearly collinear with w: + // both make the [w,p] block of the Gram matrix nearly + // singular, poisoning the 3x3 generalized eigenproblem. + const Real denom = p_nrm2 * w_nrm2; + Real cos2 = -1; + if (denom > Real(1e-60)) + cos2 = (pw_re * pw_re) / denom; + if (p_nrm2 <= Real(1e-30) || + (denom > Real(1e-60) && cos2 > Real(0.99))) + { + p_bad = true; + break; + } + } + if (p_bad) + { + // Replace p with H·w for active columns (Krylov direction). + for (const int c : active_cols) + { + T* pc = p_.data() + c * ld_psi_; + const T* hwc = hw_.data() + c * ld_psi_; + for (int ig = 0; ig < n_dim_; ++ig) + pc[ig] = hwc[ig]; + } + // Recompute S·p and H·p for the new direction. + apply_s_current(p_.data(), sp_.data(), ncol); + { + std::vector p_act; + copy_cols(p_.data(), active_cols, p_act); + std::vector hp_act(ld_psi_ * static_cast(active_cols.size()), T(0)); + apply_h(hpsi_func, p_act.data(), hp_act.data(), + static_cast(active_cols.size())); + scatter_cols(hp_.data(), active_cols, hp_act); + } + // Re-project against psi. + project_against(psi_in, spsi_.data(), all_cols, + p_, sp_, active_cols); + } + } + + // Block subspace solve. + for (int isb = 0; isb < nsb; ++isb) + { + const int i0 = isb * sbsize_; + const int l = std::min(sbsize_, nact - i0); + std::vector cols(active_cols.begin() + i0, + active_cols.begin() + i0 + l); + + SmallSubspace subspace; + build_small_subspace(psi_in, cols, use_p, subspace); + solve_small_generalized((use_p ? 3 : 2) * l, subspace); + update_one_block(psi_in, cols, l, use_p, subspace); + } + + // Re-orthonormalize and recompute after psi modification. + chol_qr_active(psi_in, active_cols); + apply_h(hpsi_func, psi_in, hpsi_.data(), ncol); + apply_s_current(psi_in, spsi_.data(), ncol); + + // Periodic Rayleigh-Ritz. + if (iter % rr_step_ == 0) + { + rayleigh_ritz(psi_in, eigenvalue_in, active_cols, ethr_band); + apply_h(hpsi_func, psi_in, hpsi_.data(), ncol); + apply_s_current(psi_in, spsi_.data(), ncol); + trdif = static_cast(-1); + trG = 0; + for (const int c : active_cols) + trG += eigenvalue_in[c]; + } + else + { + chol_qr_active(psi_in, active_cols); + + // Compute updated eigenvalues and residuals. + std::vector psi_a, hpsi_a; + copy_cols(psi_in, active_cols, psi_a); + copy_cols(hpsi_.data(), active_cols, hpsi_a); + + const int na = static_cast(active_cols.size()); + std::vector ga(ncol * na, static_cast(0)); + gram(psi_in, hpsi_a.data(), ncol, na, ga, ncol); + + set_zero(w_); + for (int ja = 0; ja < na; ++ja) + { + for (int ig = 0; ig < n_dim_; ++ig) + { + T sum = T(0); + for (int ia = 0; ia < ncol; ++ia) + sum += spsi_[idx(ig, ia, ld_psi_)] * ga[ia + ja * ncol]; + w_[idx(ig, active_cols[ja], ld_psi_)] = + hpsi_a[idx(ig, ja, ld_psi_)] - sum; + } + eigenvalue_in[active_cols[ja]] = ga[active_cols[ja] + ja * ncol]; + } + + Real trG1 = 0; + for (int ja = 0; ja < na; ++ja) + trG1 += ga[active_cols[ja] + ja * ncol]; + + trdif = std::abs(trG1 - trG); + trG = trG1; + + lock_epairs(w_, ethr_band, active_cols); + if (trdif >= 0 && trdif <= trtol) + { + rayleigh_ritz(psi_in, eigenvalue_in, active_cols, ethr_band); + apply_h(hpsi_func, psi_in, hpsi_.data(), ncol); + apply_s_current(psi_in, spsi_.data(), ncol); + trdif = static_cast(-1); + } + } + + ++iter; + } + + if ((iter - 1) % rr_step_ != 0) + rayleigh_ritz(psi_in, eigenvalue_in, active_cols, ethr_band); + // Final consistency: ensure hpsi/spi match the converged psi. + apply_h(hpsi_func, psi_in, hpsi_.data(), ncol); + apply_s_current(psi_in, spsi_.data(), ncol); + } + else // CONJUGATE_GRADIENT + { + // Initial eigenvalues from current subspace. + for (int i = 0; i < ncol; ++i) + eigenvalue_in[i] = gamma_dot(psi_in + i * ld_psi_, + hpsi_.data() + i * ld_psi_) + / gamma_dot(psi_in + i * ld_psi_, + spsi_.data() + i * ld_psi_); + + std::vector grad; + calc_gradient(prec, hpsi_.data(), spsi_.data(), psi_in, + eigenvalue_in, grad); + orth_gradient(psi_in, spsi_.data(), grad); + + std::vector p; + grad_old_.clear(); + z_old_.clear(); + beta_denom_.clear(); + update_polak_ribiere(grad, p, grad_old_, z_old_, beta_denom_, prec); + + // CG iteration loop. + while (iter <= maxiter_) + { + // Apply H and S to search direction. + std::vector hp(ld_psi_ * ncol, T(0)); + std::vector sp(ld_psi_ * ncol, T(0)); + apply_h(hpsi_func, p.data(), hp.data(), ncol); + apply_s_current(p.data(), sp.data(), ncol); + + // Line minimization. + line_minimize(psi_in, hpsi_.data(), spsi_.data(), + p.data(), hp.data(), sp.data(), ncol); + + // Periodic Rayleigh-Ritz: full subspace diagonalization + // corrects band ordering and gives accurate eigenvalues. + if (iter % rr_step_ == 0) + { + orth_cholesky(psi_in, hpsi_.data(), spsi_.data(), ncol); + apply_h(hpsi_func, psi_in, hpsi_.data(), ncol); + apply_s_current(psi_in, spsi_.data(), ncol); + + std::vector dummy_active; + rayleigh_ritz(psi_in, eigenvalue_in, dummy_active, ethr_band); + apply_h(hpsi_func, psi_in, hpsi_.data(), ncol); + apply_s_current(psi_in, spsi_.data(), ncol); + + // Reset PR state: the rotation changes the basis, + // so old gradients / search directions are invalid. + p.clear(); + grad_old_.clear(); + z_old_.clear(); + beta_denom_.clear(); + } + else + { + // Cholesky orthonormalization. + orth_cholesky(psi_in, hpsi_.data(), spsi_.data(), ncol); + + // Update eigenvalues. + for (int i = 0; i < ncol; ++i) + eigenvalue_in[i] = gamma_dot(psi_in + i * ld_psi_, + hpsi_.data() + i * ld_psi_) + / gamma_dot(psi_in + i * ld_psi_, + spsi_.data() + i * ld_psi_); + } + + // Compute new gradient. + calc_gradient(prec, hpsi_.data(), spsi_.data(), psi_in, + eigenvalue_in, grad); + orth_gradient(psi_in, spsi_.data(), grad); + + // Polak-Ribiere update. + update_polak_ribiere(grad, p, grad_old_, z_old_, beta_denom_, prec); + + // Convergence check. + bool all_converged = true; + for (int i = 0; i < ncol; ++i) + { + Real nrm2 = 0; + for (int ig = 0; ig < n_dim_; ++ig) + nrm2 += static_cast( + std::norm(grad[idx(ig, i, ld_psi_)])); + if (std::sqrt(nrm2) > std::max(static_cast(ethr_band[i]), + diag_thr_)) + { + all_converged = false; + break; + } + } + if (all_converged) + break; + + ++iter; + } + + avg_iter = static_cast(iter); + } + + return avg_iter; +} + +// ============================================================================= +// Explicit template instantiation (CPU only; extend for GPU as needed) +// ============================================================================= +template class DiagoPPCG, base_device::DEVICE_CPU>; +template class DiagoPPCG, base_device::DEVICE_CPU>; + +} // namespace hsolver diff --git a/source/source_hsolver/diago_ppcg.h b/source/source_hsolver/diago_ppcg.h new file mode 100644 index 00000000000..9cb0c0914d0 --- /dev/null +++ b/source/source_hsolver/diago_ppcg.h @@ -0,0 +1,223 @@ +#ifndef DIAGO_PPCG_H +#define DIAGO_PPCG_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace hsolver { + +// ----------------------------------------------------------------------------- +// DiagoPPCG: Projection Preconditioned Conjugate Gradient solver +// ----------------------------------------------------------------------------- +// +// Supports two algorithmic strategies: +// BLOCK_SUBSPACE — block subspace diagonalization (File 1 approach). +// CONJUGATE_GRADIENT — band-by-band Polak-Ribiere CG with line minimization +// (File 2 approach). +// +// The block-subspace strategy tends to be more robust near convergence; +// conjugate-gradient is more memory efficient for large systems. +// ----------------------------------------------------------------------------- + +enum class PpcgStrategy { BLOCK_SUBSPACE, CONJUGATE_GRADIENT }; + +// Device tags (extensible for GPU backends). +namespace base_device { + struct DEVICE_CPU {}; + struct DEVICE_GPU {}; +} + +template +class DiagoPPCG +{ +public: + // ------------------------------------------------------------------------- + // Type aliases + // ------------------------------------------------------------------------- + using Real = typename std::conditional< + std::is_same>::value, double, + float>::type; + using HPsiFunc = std::function; + using SPsiFunc = std::function; + + // ------------------------------------------------------------------------- + // Constructor + // ------------------------------------------------------------------------- + DiagoPPCG(const Real& diag_thr, + const int& diag_iter_max, + const int& sbsize, + const int& rr_step, + const bool gamma_g0_real, + const PpcgStrategy strategy = PpcgStrategy::BLOCK_SUBSPACE); + + // ------------------------------------------------------------------------- + // Main entry point + // + // Returns average number of subspace iterations per band. + // ------------------------------------------------------------------------- + double diag(const HPsiFunc& hpsi_func, + const SPsiFunc& spsi_func, + int ld_psi, + int nband, + int dim, + T* psi_in, + Real* eigenvalue_in, + const std::vector& ethr_band, + const Real* prec); + +private: + // ------------------------------------------------------------------------- + // Data members + // ------------------------------------------------------------------------- + int maxiter_; + int sbsize_; + int rr_step_; + Real diag_thr_; + bool gamma_g0_real_; + PpcgStrategy strategy_; + + // Problem dimensions (set in diag()) + int ld_psi_ = 0; + int n_band_ = 0; + int n_dim_ = 0; + + // Cached S-operator (null if identity). + SPsiFunc spsi_func_; + + // Working storage (column-major: ld_psi_ rows, n_band_ columns). + std::vector hpsi_; + std::vector spsi_; + std::vector w_; // residual / preconditioned residual + std::vector sw_; // S * w + std::vector hw_; // H * w + std::vector p_; // previous search direction (for block subspace) + std::vector sp_; // S * p + std::vector hp_; // H * p + + // Polak-Ribiere state (CONJUGATE_GRADIENT strategy) + std::vector grad_old_; // previous gradient + std::vector z_old_; // previous preconditioned residual + std::vector beta_denom_; + + // ------------------------------------------------------------------------- + // Internal helpers + // ------------------------------------------------------------------------- + static inline int idx(int row, int col, int ld) + { + return row + col * ld; + } + + void validate_input(const T* psi_in, const Real* eigenvalue_in, + const std::vector& ethr_band, + const Real* prec) const; + + void force_g0_real(T* x, int ncol) const; + + // S-application (identity fallback if spsi_func is null). + void apply_h(const HPsiFunc& hpsi_func, T* psi_in, T* hpsi_out, + int ncol) const; + void apply_s(const SPsiFunc& spsi_func, T* psi_in, T* spsi_out, + int ncol) const; + void apply_s_current(T* psi_in, T* spsi_out, int ncol) const; + + // Inner product (real part only). + Real gamma_dot(const T* x, const T* y) const; + + // Gram matrix: out[i, j] = . + void gram(const T* a, const T* b, + int ncol_a, int ncol_b, + std::vector& out, int ld_out) const; + + // Gather / scatter columns. + void copy_cols(const T* src, const std::vector& cols, + std::vector& dst) const; + void scatter_cols(T* dst, const std::vector& cols, + const std::vector& src) const; + + // Project x onto vectors orthogonal to the S-orthonormal basis. + void project_against(const T* basis, const T* sbasis, + const std::vector& basis_cols, + std::vector& x, std::vector& sx, + const std::vector& x_cols) const; + + // x[c] /= max(prec, eps) for each active column c. + void divide_by_preconditioner(const std::vector& active_cols, + const Real* prec, + std::vector& x) const; + + // ------------------------------------------------------------------------- + // Block-subspace strategy helpers (File 1 style) + // ------------------------------------------------------------------------- + struct SmallSubspace + { + std::vector k; // K matrix (projected H) + std::vector m; // M matrix (projected S) + std::vector eval; // eigenvalues + }; + + void lock_epairs(const std::vector& residual, + const std::vector& ethr_band, + std::vector& active_cols) const; + + void build_small_subspace(const T* psi, + const std::vector& cols, + bool use_p, + SmallSubspace& subspace) const; + + void solve_small_generalized(int dim, SmallSubspace& subspace) const; + + void update_one_block(T* psi, + const std::vector& cols, + int l, + bool use_p, + const SmallSubspace& subspace); + + void right_solve_upper_real(const std::vector& r, + int n, + std::vector& x) const; + + void chol_qr_active(T* psi, const std::vector& active_cols); + + void rayleigh_ritz(T* psi, Real* eigenvalue, + std::vector& active_cols, + const std::vector& ethr_band); + + Real trace_of_active_projected(const T* psi, + const std::vector& active_cols) const; + + // ------------------------------------------------------------------------- + // Conjugate-gradient strategy helpers (File 2 style) + // ------------------------------------------------------------------------- + void calc_gradient(const Real* prec, + const T* hpsi, + const T* spsi, + const T* psi, + const Real* eigenvalue, + std::vector& grad) const; + + void orth_gradient(const T* psi, const T* spsi, + std::vector& grad) const; + + void update_polak_ribiere(const std::vector& grad, + std::vector& p, + std::vector& grad_old, + std::vector& z_old, + std::vector& beta_denom, + const Real* prec) const; + + void line_minimize(T* psi, T* hpsi, T* spsi, + const T* p, const T* hp, const T* sp, + int ncol) const; + + void orth_cholesky(T* psi, T* hpsi, T* spsi, int ncol) const; +}; + +} // namespace hsolver + +#endif // DIAGO_PPCG_H diff --git a/source/source_hsolver/test/CMakeLists.txt b/source/source_hsolver/test/CMakeLists.txt index 1b1529adb4a..d3571c8257b 100644 --- a/source/source_hsolver/test/CMakeLists.txt +++ b/source/source_hsolver/test/CMakeLists.txt @@ -121,6 +121,12 @@ if (ENABLE_MPI) target_compile_definitions(MODULE_HSOLVER_LCAO_cusolver PRIVATE __CUDA) endif() endif() +AddTest( + TARGET MODULE_HSOLVER_ppcg + LIBS ${math_libs} + SOURCES diago_ppcg_test.cpp ../diago_ppcg.cpp +) + install(FILES H-KPoints-Si2.dat DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) install(FILES H-GammaOnly-Si2.dat DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) install(FILES S-KPoints-Si2.dat DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/source/source_hsolver/test/diago_ppcg_test.cpp b/source/source_hsolver/test/diago_ppcg_test.cpp new file mode 100644 index 00000000000..ceae4d9d711 --- /dev/null +++ b/source/source_hsolver/test/diago_ppcg_test.cpp @@ -0,0 +1,197 @@ +/** + * diago_ppcg_test.cpp — unit test for DiagoPPCG solver + * + * Solves the 1D particle-in-a-box problem (tridiagonal H) with S = I, + * and compares computed eigenvalues against exact analytic values. + * Both BLOCK_SUBSPACE and CONJUGATE_GRADIENT strategies are tested. + */ + +#include "../diago_ppcg.h" + +#include +#include +#include +#include +#include + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +using T = std::complex; +using Real = double; + +// ----------------------------------------------------------------------------- +// Helper: dense H-matrix times a set of column vectors +// H is stored column-major: H(row, col) = H_data[row + col * n_dim] +// ----------------------------------------------------------------------------- +static void dense_h_multiply(const T* H_data, int n_dim, + const T* in, T* out, int ld, int ncol) +{ + for (int j = 0; j < ncol; ++j) { + for (int i = 0; i < n_dim; ++i) { + T sum = 0; + for (int k = 0; k < n_dim; ++k) + sum += H_data[i + k * n_dim] * in[k + j * ld]; + out[i + j * ld] = sum; + } + } +} + +// ----------------------------------------------------------------------------- +// Test fixture: 1D particle-in-a-box (tridiagonal Laplacian) +// ----------------------------------------------------------------------------- +class DiagoPPCGTest : public ::testing::Test +{ +protected: + void SetUp() override + { + n_dim = 10; + nband = 3; + ld = n_dim; + + // Build tridiagonal H: H[i,i] = 2, H[i,i±1] = -1 + // Exact λ_k = 2 - 2·cos(k·π / (n_dim+1)), k = 1, 2, ... + H_mat.assign(n_dim * n_dim, T(0)); + for (int i = 0; i < n_dim; ++i) { + H_mat[i + i * n_dim] = T(2.0, 0); + if (i > 0) H_mat[i + (i - 1) * n_dim] = T(-1.0, 0); + if (i < n_dim - 1) H_mat[i + (i + 1) * n_dim] = T(-1.0, 0); + } + + // Preconditioner — diagonal of H (all 2.0) + prec.assign(n_dim, 2.0); + + // Exact reference eigenvalues (lowest nband) + exact.resize(nband); + for (int k = 0; k < nband; ++k) + exact[k] = 2.0 - 2.0 * std::cos(static_cast(k + 1) + * M_PI / static_cast(n_dim + 1)); + + // Convergence thresholds + ethr.assign(nband, 1e-10); + + // Generate initial guess wavefunctions (fixed seed for reproducibility) + std::mt19937 rng(42); + std::uniform_real_distribution dist(-1.0, 1.0); + + psi.assign(ld * nband, T(0)); + for (int j = 0; j < nband; ++j) + for (int i = 0; i < n_dim; ++i) + psi[i + j * ld] = T(dist(rng), dist(rng)); + + // Gram-Schmidt orthonormalisation (S = I) + for (int j = 0; j < nband; ++j) { + for (int k = 0; k < j; ++k) { + T dot = 0; + for (int i = 0; i < n_dim; ++i) + dot += std::conj(psi[i + k * ld]) * psi[i + j * ld]; + for (int i = 0; i < n_dim; ++i) + psi[i + j * ld] -= dot * psi[i + k * ld]; + } + Real nrm = 0; + for (int i = 0; i < n_dim; ++i) + nrm += std::norm(psi[i + j * ld]); + nrm = std::sqrt(nrm); + for (int i = 0; i < n_dim; ++i) + psi[i + j * ld] /= nrm; + } + } + + int n_dim, nband, ld; + std::vector H_mat; + std::vector prec; + std::vector exact; + std::vector ethr; + std::vector psi; +}; + +// ----------------------------------------------------------------------------- +// Test BLOCK_SUBSPACE strategy +// ----------------------------------------------------------------------------- +TEST_F(DiagoPPCGTest, BlockSubspaceStrategy) +{ + std::vector psi_run = psi; + std::vector eval(nband, 0.0); + + hsolver::DiagoPPCG solver( + /* diag_thr = */ 1e-12, + /* max_iter = */ 100, + /* sbsize = */ 4, + /* rr_step = */ 4, + /* gamma_g0 = */ false, + hsolver::PpcgStrategy::BLOCK_SUBSPACE + ); + + auto h_op = [this](T* in, T* out, int ld_in, int ncol) { + dense_h_multiply(H_mat.data(), n_dim, in, out, ld_in, ncol); + }; + + double avg_iter = solver.diag( + h_op, + /* spsi_func = */ nullptr, // S = I + ld, nband, n_dim, + psi_run.data(), + eval.data(), + ethr, + prec.data() + ); + + // Check eigenvalues against exact solution + for (int i = 0; i < nband; ++i) { + EXPECT_NEAR(eval[i], exact[i], 1e-8) + << "BLOCK_SUBSPACE: eigenvalue[" << i << "] mismatch"; + } + + // Should converge within reasonable iterations + EXPECT_LE(avg_iter, static_cast(100)) + << "BLOCK_SUBSPACE: too many iterations"; +} + +// ----------------------------------------------------------------------------- +// Test CONJUGATE_GRADIENT strategy +// ----------------------------------------------------------------------------- +TEST_F(DiagoPPCGTest, ConjugateGradientStrategy) +{ + std::vector psi_run = psi; + std::vector eval(nband, 0.0); + + hsolver::DiagoPPCG solver( + /* diag_thr = */ 1e-12, + /* max_iter = */ 100, + /* sbsize = */ 4, + /* rr_step = */ 4, + /* gamma_g0 = */ false, + hsolver::PpcgStrategy::CONJUGATE_GRADIENT + ); + + auto h_op = [this](T* in, T* out, int ld_in, int ncol) { + dense_h_multiply(H_mat.data(), n_dim, in, out, ld_in, ncol); + }; + + double avg_iter = solver.diag( + h_op, + /* spsi_func = */ nullptr, // S = I + ld, nband, n_dim, + psi_run.data(), + eval.data(), + ethr, + prec.data() + ); + + // Check eigenvalues against exact solution + for (int i = 0; i < nband; ++i) { + EXPECT_NEAR(eval[i], exact[i], 1e-8) + << "CONJUGATE_GRADIENT: eigenvalue[" << i << "] mismatch"; + } + + // Should converge within reasonable iterations + EXPECT_LE(avg_iter, static_cast(100)) + << "CONJUGATE_GRADIENT: too many iterations"; +} + +int main(int argc, char** argv) +{ + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/source/source_pw/module_stodft/sto_wf.cpp b/source/source_pw/module_stodft/sto_wf.cpp index 2fca88e4824..b9b67e13722 100644 --- a/source/source_pw/module_stodft/sto_wf.cpp +++ b/source/source_pw/module_stodft/sto_wf.cpp @@ -63,7 +63,7 @@ void Stochastic_WF::clean_chiallorder() template void Stochastic_WF::init_sto_orbitals(const int seed_in) { - const unsigned int rank_seed_offset = 10000; + static const unsigned int rank_seed_offset = 10000; if (seed_in == 0 || seed_in == -1) { srand(static_cast(time(nullptr)) + GlobalV::MY_RANK * rank_seed_offset); // GlobalV global variables are reserved