diff --git a/MathLib/CMakeLists.txt b/MathLib/CMakeLists.txt
index 7f64f5550870ba9983b0fd8260c67af090194ee2..9dfbbeaeb47d70763f13318d993b46f7e695a25e 100644
--- a/MathLib/CMakeLists.txt
+++ b/MathLib/CMakeLists.txt
@@ -38,6 +38,7 @@ SET( SOURCES
         LinAlg/Sparse/amuxCRS.cpp
         LinAlg/Solvers/BiCGStab.cpp
         LinAlg/Solvers/CG.cpp
+        LinAlg/Solvers/CGParallel.cpp
         LinAlg/Solvers/GMRes.cpp
 	LinAlg/Solvers/GaussAlgorithm.cpp
         LinAlg/Solvers/TriangularSolve.cpp
diff --git a/MathLib/LinAlg/Solvers/CG.cpp b/MathLib/LinAlg/Solvers/CG.cpp
index c5438a2261eeb42c7e075b819542f7613f49b5ae..93da2db1a53dd7e483af5346f121c39eed639911 100644
--- a/MathLib/LinAlg/Solvers/CG.cpp
+++ b/MathLib/LinAlg/Solvers/CG.cpp
@@ -7,8 +7,6 @@
 
 #include <limits>
 
-#include <omp.h>
-
 #include "MathTools.h"
 #include "blas.h"
 #include "../Sparse/CRSMatrix.h"
@@ -30,7 +28,7 @@
 namespace MathLib {
 
 unsigned CG(CRSMatrix<double,unsigned> const * mat, double const * const b,
-		double* const x, double& eps, unsigned& nsteps, unsigned num_threads)
+		double* const x, double& eps, unsigned& nsteps)
 {
 	unsigned N = mat->getNRows();
 	double *p, *q, *r, *rhat, rho, rho1 = 0.0;
@@ -64,11 +62,9 @@ unsigned CG(CRSMatrix<double,unsigned> const * mat, double const * const b,
 	}
 
 	for (unsigned l = 1; l <= nsteps; ++l) {
-
 #ifndef NDEBUG
 		std::cout << "Step " << l << ", resid=" << resid / nrmb << std::endl;
 #endif
-
 		// r^ = C r
 		blas::copy(N, r, rhat);
 		mat->precondApply(rhat);
@@ -80,8 +76,6 @@ unsigned CG(CRSMatrix<double,unsigned> const * mat, double const * const b,
 			double beta = rho / rho1;
 			// p = r^ + beta * p
 			unsigned k;
-			omp_set_num_threads(num_threads);
-//#pragma omp parallel for
 			for (k = 0; k < N; k++) {
 				p[k] = rhat[k] + beta * p[k];
 			}
diff --git a/MathLib/LinAlg/Solvers/CG.h b/MathLib/LinAlg/Solvers/CG.h
index 4b07debdf6a01ac5293e0e8ceb9d8810245bde76..60961d003b5adead7b4bbdd6f2e303cee365ad5c 100644
--- a/MathLib/LinAlg/Solvers/CG.h
+++ b/MathLib/LinAlg/Solvers/CG.h
@@ -14,7 +14,12 @@ namespace MathLib {
 template <typename PF_TYPE, typename IDX_TYPE> class CRSMatrix;
 
 unsigned CG(CRSMatrix<double,unsigned> const * mat, double const * const b,
+		double* const x, double& eps, unsigned& nsteps);
+
+#ifdef _OPENMP
+unsigned CGParallel(CRSMatrix<double,unsigned> const * mat, double const * const b,
 		double* const x, double& eps, unsigned& nsteps, unsigned num_threads = 1);
+#endif
 
 } // end namespace MathLib
 
diff --git a/MathLib/LinAlg/Solvers/CGParallel.cpp b/MathLib/LinAlg/Solvers/CGParallel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a30db79885c99707ddcdc041ffb5eeff1dac03f1
--- /dev/null
+++ b/MathLib/LinAlg/Solvers/CGParallel.cpp
@@ -0,0 +1,126 @@
+/*
+ * CGParallel.cpp
+ *
+ *  Created on: Dec 2, 2011
+ *      Author: TF
+ */
+
+#include <limits>
+
+#include <omp.h>
+
+#include "MathTools.h"
+#include "blas.h"
+#include "../Sparse/CRSMatrix.h"
+#include "../Sparse/CRSMatrixDiagPrecond.h"
+
+// CG solves the symmetric positive definite linear
+// system Ax=b using the Conjugate Gradient method.
+//
+// The return value indicates convergence within max_iter (input)
+// iterations (0), or no convergence within max_iter iterations (1).
+//
+// Upon successful return, output arguments have the following values:
+//
+//      x  --  approximate solution to Ax = b
+// nsteps  --  the number of iterations performed before the
+//             tolerance was reached
+//    eps  --  the residual after the final iteration
+
+namespace MathLib {
+
+#ifdef _OPENMP
+unsigned CGParallel(CRSMatrix<double,unsigned> const * mat, double const * const b,
+		double* const x, double& eps, unsigned& nsteps, unsigned num_threads)
+{
+	omp_set_num_threads(num_threads);
+	unsigned N = mat->getNRows();
+	double *p, *q, *r, *rhat, rho, rho1 = 0.0;
+
+	p = new double[4* N];
+	q = p + N;
+	r = q + N;
+	rhat = r + N;
+
+	double nrmb = sqrt(scpr(b, b, N));
+	if (nrmb < std::numeric_limits<double>::epsilon()) {
+		blas::setzero(N, x);
+		eps = 0.0;
+		nsteps = 0;
+		delete[] p;
+		return 0;
+	}
+
+	// r0 = b - Ax0
+	mat->amux(D_MONE, x, r);
+	for (unsigned k(0); k < N; k++) {
+		r[k] = b[k] - r[k];
+	}
+
+	double resid = blas::nrm2(N, r);
+	if (resid <= eps * nrmb) {
+		eps = resid / nrmb;
+		nsteps = 0;
+		delete[] p;
+		return 0;
+	}
+
+	for (unsigned l = 1; l <= nsteps; ++l) {
+#ifndef NDEBUG
+		std::cout << "Step " << l << ", resid=" << resid / nrmb << std::endl;
+#endif
+		// r^ = C r
+		blas::copy(N, r, rhat);
+		mat->precondApply(rhat);
+
+		// rho = r * r^;
+		rho = scpr(r, rhat, N, num_threads);
+
+		unsigned k;
+		if (l > 1) {
+			double beta = rho / rho1;
+			// p = r^ + beta * p
+			#pragma omp parallel for
+			for (k = 0; k < N; k++) {
+				p[k] = rhat[k] + beta * p[k];
+			}
+		} else blas::copy(N, rhat, p);
+
+		// q = Ap
+		blas::setzero(N, q);
+		mat->amux(D_ONE, p, q);
+
+		// alpha = rho / p*q
+		double alpha = rho / scpr(p, q, N, num_threads);
+
+		// x += alpha * p
+		#pragma omp parallel for
+		for (k = 0; k < N; k++) {
+			x[k] += alpha * p[k];
+		}
+
+		// r -= alpha * q
+		blas::axpy(N, -alpha, q, r);
+		#pragma omp parallel for
+		for (k = 0; k < N; k++) {
+			r[k] -= alpha * q[k];
+		}
+
+		resid = sqrt(scpr(r, r, N, num_threads));
+
+		if (resid <= eps * nrmb) {
+			eps = resid / nrmb;
+			nsteps = l;
+			delete[] p;
+			return 0;
+		}
+
+		rho1 = rho;
+	}
+	eps = resid / nrmb;
+	delete[] p;
+	return 1;
+}
+#endif
+
+} // end of namespace MathLib
diff --git a/MathLib/MathTools.cpp b/MathLib/MathTools.cpp
index b332447976cceb1908d620716915b9ef66aeb8f4..06b01cd6d0a88e248b92f4d078f4d7a7f84c418b 100644
--- a/MathLib/MathTools.cpp
+++ b/MathLib/MathTools.cpp
@@ -9,6 +9,26 @@
 
 namespace MathLib {
 
+#ifdef _OPENMP
+double scpr(double const * const v, double const * const w, unsigned n,
+        unsigned num_of_threads)
+{
+        long double res (0.0);
+        unsigned k;
+        omp_set_num_threads (num_of_threads);
+        #pragma omp parallel
+        {
+                #pragma omp parallel for reduction (+:res)
+                for (k = 0; k<n; k++) {
+                        res += v[k] * w[k];
+                }
+        }
+
+        return res;
+}
+#endif
+
+
 void crossProd(const double u[3], const double v[3], double r[3])
 {
 	r[0] = u[1] * v[2] - u[2] * v[1];
diff --git a/MathLib/MathTools.h b/MathLib/MathTools.h
index 71746cbafd1a7f15b7422c8da2f29ec8f6fbec7c..1d88785af6d6bbe991890b0e1349f0759d996870 100644
--- a/MathLib/MathTools.h
+++ b/MathLib/MathTools.h
@@ -2,7 +2,7 @@
  * \file MathTools.h
  *
  *  Created on: Jan 13, 2010
- *      Author: fischeth
+ *      Author: TF
  */
 
 #ifndef MATHTOOLS_H_
@@ -11,8 +11,12 @@
 #include <vector>
 #include <cmath>
 #include <limits>
+
+#include <omp.h>
+
 #include "Point.h"
 
+
 namespace MathLib {
 
 /**
@@ -22,13 +26,19 @@ namespace MathLib {
  * \param n the size of the array
  * */
 template<class T> inline
-double scpr(const T* v0, const T* v1, size_t n) {
+double scpr(const T* v0, const T* v1, size_t n)
+{
 	long double res(0.0);
 	for (size_t k(0); k<n; k++)
 		res += v0[k] * v1[k];
 	return (double) res;
 }
 
+#ifdef _OPENMP
+double scpr(double const * const v, double const * const w, unsigned n,
+        unsigned num_of_threads);
+#endif
+
 /**
  * computes the cross (or vector) product of the 3d vectors u and v
  * the result is given in the vector r
diff --git a/SimpleTests/SolverTests/ConjugateGradientDiagonalPreconditioned.cpp b/SimpleTests/SolverTests/ConjugateGradientDiagonalPreconditioned.cpp
index 043afa1fce8f63f801f4860ed1d06bb5ec1d8204..44855d1109847006bbc95b2991fe8a9f90be30ae 100644
--- a/SimpleTests/SolverTests/ConjugateGradientDiagonalPreconditioned.cpp
+++ b/SimpleTests/SolverTests/ConjugateGradientDiagonalPreconditioned.cpp
@@ -61,7 +61,11 @@ int main(int argc, char *argv[])
 	run_timer.start();
 	cpu_timer.start();
 
-	MathLib::CG (mat, b, x, eps, steps, num_omp_threads);
+	if (num_omp_threads == 1) {
+		MathLib::CG(mat, b, x, eps, steps);
+	} else {
+		MathLib::CGParallel(mat, b, x, eps, steps, num_omp_threads);
+	}
 
 	cpu_timer.stop();
 	run_timer.stop();
diff --git a/SimpleTests/SolverTests/ConjugateGradientUnpreconditioned.cpp b/SimpleTests/SolverTests/ConjugateGradientUnpreconditioned.cpp
index 258fde9616bbc4e15ad553c626bdd86a496f48f3..1f0c4cb92b1fc5ea10b796b4d998ca64e60ac861 100644
--- a/SimpleTests/SolverTests/ConjugateGradientUnpreconditioned.cpp
+++ b/SimpleTests/SolverTests/ConjugateGradientUnpreconditioned.cpp
@@ -55,7 +55,7 @@ int main(int argc, char *argv[])
 //	double cg_time (cputime(0.0));
 	double eps (1.0e-6);
 	unsigned steps (4000);
-	CG (mat, b, x, eps, steps, 1);
+	CG (mat, b, x, eps, steps);
 //	cg_time = cputime(cg_time);
 	time(&end_time);
 	std::cout << " in " << steps << " iterations (residuum is " << eps << ") took " << /*cg_time <<*/ " sec time and " << (end_time-start_time) << " sec" << std::endl;