first version of parallelized CG method

e6f2e40e · Tom Fischer · 9d64276f · e6f2e40e · e6f2e40e · e6f2e40e
Commit e6f2e40e authored 13 years ago by Tom Fischer
--- a/MathLib/CMakeLists.txt
+++ b/MathLib/CMakeLists.txt
@@ -38,6 +38,7 @@ SET( SOURCES
        LinAlg/Sparse/amuxCRS.cpp
        LinAlg/Solvers/BiCGStab.cpp
        LinAlg/Solvers/CG.cpp
+        LinAlg/Solvers/CGParallel.cpp
        LinAlg/Solvers/GMRes.cpp
 	LinAlg/Solvers/GaussAlgorithm.cpp
        LinAlg/Solvers/TriangularSolve.cpp

--- a/MathLib/LinAlg/Solvers/CG.cpp
+++ b/MathLib/LinAlg/Solvers/CG.cpp
@@ -7,8 +7,6 @@

 #include <limits>

-#include <omp.h>
-
 #include "MathTools.h"
 #include "blas.h"
 #include "../Sparse/CRSMatrix.h"
@@ -30,7 +28,7 @@
 namespace MathLib {

 unsigned CG(CRSMatrix<double,unsigned> const * mat, double const * const b,
-		double* const x, double& eps, unsigned& nsteps, unsigned num_threads)
+		double* const x, double& eps, unsigned& nsteps)
 {
 	unsigned N = mat->getNRows();
 	double *p, *q, *r, *rhat, rho, rho1 = 0.0;
@@ -64,11 +62,9 @@ unsigned CG(CRSMatrix<double,unsigned> const * mat, double const * const b,
 	}

 	for (unsigned l = 1; l <= nsteps; ++l) {
-
 #ifndef NDEBUG
 		std::cout << "Step " << l << ", resid=" << resid / nrmb << std::endl;
 #endif
-
 		// r^ = C r
 		blas::copy(N, r, rhat);
 		mat->precondApply(rhat);
@@ -80,8 +76,6 @@ unsigned CG(CRSMatrix<double,unsigned> const * mat, double const * const b,
 			double beta = rho / rho1;
 			// p = r^ + beta * p
 			unsigned k;
-			omp_set_num_threads(num_threads);
-//#pragma omp parallel for
 			for (k = 0; k < N; k++) {
 				p[k] = rhat[k] + beta * p[k];
 			}

--- a/MathLib/LinAlg/Solvers/CG.h
+++ b/MathLib/LinAlg/Solvers/CG.h
@@ -14,7 +14,12 @@ namespace MathLib {
 template <typename PF_TYPE, typename IDX_TYPE> class CRSMatrix;

 unsigned CG(CRSMatrix<double,unsigned> const * mat, double const * const b,
+		double* const x, double& eps, unsigned& nsteps);
+
+#ifdef _OPENMP
+unsigned CGParallel(CRSMatrix<double,unsigned> const * mat, double const * const b,
 		double* const x, double& eps, unsigned& nsteps, unsigned num_threads = 1);
+#endif

 } // end namespace MathLib


--- a/MathLib/LinAlg/Solvers/CGParallel.cpp
+++ b/MathLib/LinAlg/Solvers/CGParallel.cpp
+/*
+ * CGParallel.cpp
+ *
+ *  Created on: Dec 2, 2011
+ *      Author: TF
+ */
+
+#include <limits>
+
+#include <omp.h>
+
+#include "MathTools.h"
+#include "blas.h"
+#include "../Sparse/CRSMatrix.h"
+#include "../Sparse/CRSMatrixDiagPrecond.h"
+
+// CG solves the symmetric positive definite linear
+// system Ax=b using the Conjugate Gradient method.
+//
+// The return value indicates convergence within max_iter (input)
+// iterations (0), or no convergence within max_iter iterations (1).
+//
+// Upon successful return, output arguments have the following values:
+//
+//      x  --  approximate solution to Ax = b
+// nsteps  --  the number of iterations performed before the
+//             tolerance was reached
+//    eps  --  the residual after the final iteration
+
+namespace MathLib {
+
+#ifdef _OPENMP
+unsigned CGParallel(CRSMatrix<double,unsigned> const * mat, double const * const b,
+		double* const x, double& eps, unsigned& nsteps, unsigned num_threads)
+{
+	omp_set_num_threads(num_threads);
+	unsigned N = mat->getNRows();
+	double *p, *q, *r, *rhat, rho, rho1 = 0.0;
+
+	p = new double[4* N];
+	q = p + N;
+	r = q + N;
+	rhat = r + N;
+
+	double nrmb = sqrt(scpr(b, b, N));
+	if (nrmb < std::numeric_limits<double>::epsilon()) {
+		blas::setzero(N, x);
+		eps = 0.0;
+		nsteps = 0;
+		delete[] p;
+		return 0;
+	}
+
+	// r0 = b - Ax0
+	mat->amux(D_MONE, x, r);
+	for (unsigned k(0); k < N; k++) {
+		r[k] = b[k] - r[k];
+	}
+
+	double resid = blas::nrm2(N, r);
+	if (resid <= eps * nrmb) {
+		eps = resid / nrmb;
+		nsteps = 0;
+		delete[] p;
+		return 0;
+	}
+
+	for (unsigned l = 1; l <= nsteps; ++l) {
+#ifndef NDEBUG
+		std::cout << "Step " << l << ", resid=" << resid / nrmb << std::endl;
+#endif
+		// r^ = C r
+		blas::copy(N, r, rhat);
+		mat->precondApply(rhat);
+
+		// rho = r * r^;
+		rho = scpr(r, rhat, N, num_threads);
+
+		unsigned k;
+		if (l > 1) {
+			double beta = rho / rho1;
+			// p = r^ + beta * p
+			#pragma omp parallel for
+			for (k = 0; k < N; k++) {
+				p[k] = rhat[k] + beta * p[k];
+			}
+		} else blas::copy(N, rhat, p);
+
+		// q = Ap
+		blas::setzero(N, q);
+		mat->amux(D_ONE, p, q);
+
+		// alpha = rho / p*q
+		double alpha = rho / scpr(p, q, N, num_threads);
+
+		// x += alpha * p
+		#pragma omp parallel for
+		for (k = 0; k < N; k++) {
+			x[k] += alpha * p[k];
+		}
+
+		// r -= alpha * q
+		blas::axpy(N, -alpha, q, r);
+		#pragma omp parallel for
+		for (k = 0; k < N; k++) {
+			r[k] -= alpha * q[k];
+		}
+
+		resid = sqrt(scpr(r, r, N, num_threads));
+
+		if (resid <= eps * nrmb) {
+			eps = resid / nrmb;
+			nsteps = l;
+			delete[] p;
+			return 0;
+		}
+
+		rho1 = rho;
+	}
+	eps = resid / nrmb;
+	delete[] p;
+	return 1;
+}
+#endif
+
+} // end of namespace MathLib
--- a/MathLib/MathTools.cpp
+++ b/MathLib/MathTools.cpp
@@ -9,6 +9,26 @@

 namespace MathLib {

+#ifdef _OPENMP
+double scpr(double const * const v, double const * const w, unsigned n,
+        unsigned num_of_threads)
+{
+        long double res (0.0);
+        unsigned k;
+        omp_set_num_threads (num_of_threads);
+        #pragma omp parallel
+        {
+                #pragma omp parallel for reduction (+:res)
+                for (k = 0; k<n; k++) {
+                        res += v[k] * w[k];
+                }
+        }
+
+        return res;
+}
+#endif
+
+
 void crossProd(const double u[3], const double v[3], double r[3])
 {
 	r[0] = u[1] * v[2] - u[2] * v[1];

--- a/MathLib/MathTools.h
+++ b/MathLib/MathTools.h
@@ -2,7 +2,7 @@
 * \file MathTools.h
 *
 *  Created on: Jan 13, 2010
- *      Author: fischeth
+ *      Author: TF
 */

 #ifndef MATHTOOLS_H_
@@ -11,8 +11,12 @@
 #include <vector>
 #include <cmath>
 #include <limits>
+
+#include <omp.h>
+
 #include "Point.h"

+
 namespace MathLib {

 /**
@@ -22,13 +26,19 @@ namespace MathLib {
 * \param n the size of the array
 * */
 template<class T> inline
-double scpr(const T* v0, const T* v1, size_t n) {
+double scpr(const T* v0, const T* v1, size_t n)
+{
 	long double res(0.0);
 	for (size_t k(0); k<n; k++)
 		res += v0[k] * v1[k];
 	return (double) res;
 }

+#ifdef _OPENMP
+double scpr(double const * const v, double const * const w, unsigned n,
+        unsigned num_of_threads);
+#endif
+
 /**
 * computes the cross (or vector) product of the 3d vectors u and v
 * the result is given in the vector r

--- a/SimpleTests/SolverTests/ConjugateGradientDiagonalPreconditioned.cpp
+++ b/SimpleTests/SolverTests/ConjugateGradientDiagonalPreconditioned.cpp
@@ -61,7 +61,11 @@ int main(int argc, char *argv[])
 	run_timer.start();
 	cpu_timer.start();

-	MathLib::CG (mat, b, x, eps, steps, num_omp_threads);
+	if (num_omp_threads == 1) {
+		MathLib::CG(mat, b, x, eps, steps);
+	} else {
+		MathLib::CGParallel(mat, b, x, eps, steps, num_omp_threads);
+	}

 	cpu_timer.stop();
 	run_timer.stop();

--- a/SimpleTests/SolverTests/ConjugateGradientUnpreconditioned.cpp
+++ b/SimpleTests/SolverTests/ConjugateGradientUnpreconditioned.cpp
@@ -55,7 +55,7 @@ int main(int argc, char *argv[])
 //	double cg_time (cputime(0.0));
 	double eps (1.0e-6);
 	unsigned steps (4000);
-	CG (mat, b, x, eps, steps, 1);
+	CG (mat, b, x, eps, steps);
 //	cg_time = cputime(cg_time);
 	time(&end_time);
 	std::cout << " in " << steps << " iterations (residuum is " << eps << ") took " << /*cg_time <<*/ " sec time and " << (end_time-start_time) << " sec" << std::endl;