diff --git a/MaterialLib/MPL/Properties/Function.cpp b/MaterialLib/MPL/Properties/Function.cpp
index 65897c1c11cc9d0d967393e773b8c3b7321c35f5..2c256f7935c71a15ecdc88abf1707fa886f1004c 100644
--- a/MaterialLib/MPL/Properties/Function.cpp
+++ b/MaterialLib/MPL/Properties/Function.cpp
@@ -77,13 +77,18 @@ static void updateVariableValues(
 static PropertyDataType evaluateExpressions(
     std::vector<std::pair<Variable, double*>> const& symbol_values,
     VariableArray const& variable_array,
-    std::vector<exprtk::expression<double>> const& expressions)
+    std::vector<exprtk::expression<double>> const& expressions,
+    std::mutex& mutex)
 {
-    updateVariableValues(symbol_values, variable_array);
-
     std::vector<double> result(expressions.size());
-    std::transform(begin(expressions), end(expressions), begin(result),
-                   [](auto const& e) { return e.value(); });
+
+    {
+        std::lock_guard lock_guard(mutex);
+        updateVariableValues(symbol_values, variable_array);
+
+        std::transform(begin(expressions), end(expressions), begin(result),
+                       [](auto const& e) { return e.value(); });
+    }
 
     switch (result.size())
     {
@@ -187,7 +192,7 @@ PropertyDataType Function::value(VariableArray const& variable_array,
                                  double const /*t*/, double const /*dt*/) const
 {
     return evaluateExpressions(symbol_values_, variable_array,
-                               value_expressions_);
+                               value_expressions_, mutex_);
 }
 
 PropertyDataType Function::dValue(VariableArray const& variable_array,
@@ -208,7 +213,8 @@ PropertyDataType Function::dValue(VariableArray const& variable_array,
             variable_enum_to_string[static_cast<int>(variable)], name_);
     }
 
-    return evaluateExpressions(symbol_values_, variable_array, it->second);
+    return evaluateExpressions(symbol_values_, variable_array, it->second,
+                               mutex_);
 }
 
 }  // namespace MaterialPropertyLib
diff --git a/MaterialLib/MPL/Properties/Function.h b/MaterialLib/MPL/Properties/Function.h
index 49263d67987fe5b1c2322c955ec26a7ecfb600d1..f7ec49e35be47dc19235429a5760a648f3e5a745 100644
--- a/MaterialLib/MPL/Properties/Function.h
+++ b/MaterialLib/MPL/Properties/Function.h
@@ -55,5 +55,7 @@ private:
     /// Multiple expressions are representing vector-valued functions.
     std::vector<std::pair<Variable, std::vector<Expression>>>
         dvalue_expressions_;
+
+    mutable std::mutex mutex_;
 };
 }  // namespace MaterialPropertyLib
diff --git a/MeshLib/IO/MPI_IO/PropertyVectorMetaData.h b/MeshLib/IO/MPI_IO/PropertyVectorMetaData.h
index d050b2f51b91d8d13ac848201a7ea8d4e0ecf452..a47bd5611eb8e00e630aa01a83a2fb9799dd008d 100644
--- a/MeshLib/IO/MPI_IO/PropertyVectorMetaData.h
+++ b/MeshLib/IO/MPI_IO/PropertyVectorMetaData.h
@@ -10,9 +10,13 @@
 
 #pragma once
 
+#include <istream>
 #include <optional>
+#include <ostream>
 #include <string>
 
+#include "BaseLib/Logging.h"
+
 namespace MeshLib
 {
 namespace IO
diff --git a/ParameterLib/FunctionParameter.h b/ParameterLib/FunctionParameter.h
index e919d56d42139fd30eb7ef5678a9f528d084873d..6cf9a720bf56ffe898f4f6e7a7a92341f27a053a 100644
--- a/ParameterLib/FunctionParameter.h
+++ b/ParameterLib/FunctionParameter.h
@@ -121,14 +121,20 @@ struct FunctionParameter final : public Parameter<T>
                 "coordinates.");
         }
         auto const coords = pos.getCoordinates().value();
-        x = coords[0];
-        y = coords[1];
-        z = coords[2];
-        time = t;
 
-        for (unsigned i = 0; i < _vec_expression.size(); i++)
         {
-            cache[i] = _vec_expression[i].value();
+            std::lock_guard lock_guard(_mutex);
+            x = coords[0];
+            y = coords[1];
+            z = coords[2];
+            time = t;
+
+            {
+                for (unsigned i = 0; i < _vec_expression.size(); i++)
+                {
+                    cache[i] = _vec_expression[i].value();
+                }
+            }
         }
 
         if (!this->_coordinate_system)
@@ -143,6 +149,7 @@ private:
     symbol_table_t _symbol_table;
     std::vector<expression_t> _vec_expression;
     std::vector<std::pair<std::string, CurveWrapper>> _curves;
+    mutable std::mutex _mutex;
 };
 
 std::unique_ptr<ParameterBase> createFunctionParameter(
diff --git a/ProcessLib/AbstractJacobianAssembler.h b/ProcessLib/AbstractJacobianAssembler.h
index b589df0aff89b41e72e6614a077bf5551e396999..d10ed2f4bc1c35354e069278ca75756b5a6a223b 100644
--- a/ProcessLib/AbstractJacobianAssembler.h
+++ b/ProcessLib/AbstractJacobianAssembler.h
@@ -50,6 +50,8 @@ public:
         OGS_FATAL("not implemented.");
     }
 
+    virtual std::unique_ptr<AbstractJacobianAssembler> copy() const = 0;
+
     virtual ~AbstractJacobianAssembler() = default;
 };
 
diff --git a/ProcessLib/AnalyticalJacobianAssembler.cpp b/ProcessLib/AnalyticalJacobianAssembler.cpp
index cc62365c52960f78feb0dbb1d15afaa34fe81441..61a1f91d12a21075f659471a9dd862f6bd705609 100644
--- a/ProcessLib/AnalyticalJacobianAssembler.cpp
+++ b/ProcessLib/AnalyticalJacobianAssembler.cpp
@@ -38,4 +38,10 @@ void AnalyticalJacobianAssembler::assembleWithJacobianForStaggeredScheme(
         local_b_data, local_Jac_data);
 }
 
+std::unique_ptr<AbstractJacobianAssembler> AnalyticalJacobianAssembler::copy()
+    const
+{
+    return std::make_unique<AnalyticalJacobianAssembler>(*this);
+}
+
 }  // namespace ProcessLib
diff --git a/ProcessLib/AnalyticalJacobianAssembler.h b/ProcessLib/AnalyticalJacobianAssembler.h
index a5b9f092f994b6544acb70c9a035a5beb8d905ac..b0204790f77bf4f11236414741d41f61b68fefd4 100644
--- a/ProcessLib/AnalyticalJacobianAssembler.h
+++ b/ProcessLib/AnalyticalJacobianAssembler.h
@@ -47,6 +47,8 @@ public:
         std::vector<double>& local_M_data, std::vector<double>& local_K_data,
         std::vector<double>& local_b_data,
         std::vector<double>& local_Jac_data) override;
+
+    std::unique_ptr<AbstractJacobianAssembler> copy() const override;
 };
 
 }  // namespace ProcessLib
diff --git a/ProcessLib/Assembly/MatrixAssemblyStats.h b/ProcessLib/Assembly/MatrixAssemblyStats.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ebc066effd64d388e8172686734237a38c2149d
--- /dev/null
+++ b/ProcessLib/Assembly/MatrixAssemblyStats.h
@@ -0,0 +1,135 @@
+/**
+ * \file
+ * \copyright
+ * Copyright (c) 2012-2023, OpenGeoSys Community (http://www.opengeosys.org)
+ *            Distributed under a Modified BSD License.
+ *              See accompanying file LICENSE.txt or
+ *              http://www.opengeosys.org/project/license
+ *
+ */
+
+#pragma once
+
+#include <memory>
+
+#include "BaseLib/Logging.h"
+
+namespace ProcessLib::Assembly
+{
+struct Stats
+{
+    std::size_t count = 0;
+    std::size_t count_nonzero = 0;
+    std::size_t count_global = 0;
+
+    Stats& operator+=(Stats const& other)
+    {
+        count += other.count;
+        count_nonzero += other.count_nonzero;
+        count_global += other.count_global;
+
+        return *this;
+    }
+
+    void print(std::string const& matrix_or_vector_name) const
+    {
+        DBUG("Stats [{}]: {} elements added to the matrix cache.",
+             matrix_or_vector_name,
+             count);
+        DBUG("Stats [{}]: {} nonzero elements added to the matrix cache.",
+             matrix_or_vector_name,
+             count_nonzero);
+        DBUG("Stats [{}]: {} elements added to the global matrix.",
+             matrix_or_vector_name,
+             count_global);
+    }
+};
+
+struct MultiStats
+{
+    Stats M;
+    Stats K;
+    Stats b;
+    Stats Jac;
+
+    MultiStats& operator+=(MultiStats const& other)
+    {
+        M += other.M;
+        K += other.K;
+        b += other.b;
+        Jac += other.Jac;
+
+        return *this;
+    }
+
+    void print() const
+    {
+        M.print("M");
+        K.print("K");
+        b.print("b");
+        Jac.print("J");
+    }
+};
+
+template <typename Data>
+class CumulativeStats
+    : public std::enable_shared_from_this<CumulativeStats<Data>>
+{
+    using Base = std::enable_shared_from_this<CumulativeStats<Data>>;
+
+public:
+    Data data;
+
+    static std::shared_ptr<CumulativeStats<Data>> create()
+    {
+        return std::shared_ptr<CumulativeStats<Data>>(
+            new CumulativeStats<Data>());
+    }
+
+    // Could return unique_ptr, but shared_ptr is more consistent with the
+    // create() method.
+    std::shared_ptr<CumulativeStats<Data>> clone()
+    {
+        return std::make_shared<CumulativeStats<Data>>(*this);
+    }
+
+    CumulativeStats(CumulativeStats<Data> const& other) = delete;
+
+    CumulativeStats(CumulativeStats<Data>& other)
+        : Base{other},
+          data{},
+          parent_{other.parent_ ? other.parent_ : other.shared_from_this()},
+          parent_mutex_{other.parent_mutex_}
+    {
+    }
+
+    CumulativeStats(CumulativeStats<Data>&& other)
+        : parent_{std::move(other.parent_)},
+          parent_mutex_{std::move(other.parent_mutex_)}
+    {
+        std::swap(data, other.data);
+    }
+
+    ~CumulativeStats()
+    {
+        if (!parent_)
+        {
+            return;
+        }
+
+        std::lock_guard<std::mutex> const lock(*parent_mutex_);
+
+        DBUG("Adding cumulative stats to parent.");
+
+        parent_->data += data;
+    }
+
+    void print() const { data.print(); }
+
+private:
+    CumulativeStats() : parent_mutex_{std::make_shared<std::mutex>()} {}
+
+    std::shared_ptr<CumulativeStats<Data>> parent_;
+    std::shared_ptr<std::mutex> parent_mutex_;
+};
+}  // namespace ProcessLib::Assembly
diff --git a/ProcessLib/Assembly/MatrixElementCache.h b/ProcessLib/Assembly/MatrixElementCache.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca2e2cb1d68d797fb9bcf3ce8fa90ad09797e19c
--- /dev/null
+++ b/ProcessLib/Assembly/MatrixElementCache.h
@@ -0,0 +1,308 @@
+/**
+ * \file
+ * \copyright
+ * Copyright (c) 2012-2023, OpenGeoSys Community (http://www.opengeosys.org)
+ *            Distributed under a Modified BSD License.
+ *              See accompanying file LICENSE.txt or
+ *              http://www.opengeosys.org/project/license
+ *
+ */
+
+#pragma once
+
+#include <algorithm>
+
+#include "MathLib/LinAlg/Eigen/EigenMapTools.h"
+#include "MathLib/LinAlg/GlobalMatrixVectorTypes.h"
+#include "ProcessLib/Assembly/MatrixAssemblyStats.h"
+
+namespace ProcessLib::Assembly
+{
+namespace detail
+{
+#ifdef USE_PETSC
+inline GlobalIndexType transformToNonGhostIndex(GlobalIndexType const i,
+                                                GlobalIndexType const n)
+{
+    if (i == -n)
+    {
+        return 0;
+    }
+    else
+    {
+        return std::abs(i);
+    }
+}
+#else
+inline GlobalIndexType transformToNonGhostIndex(GlobalIndexType const i,
+                                                GlobalIndexType const /*n*/)
+{
+    return i;
+}
+#endif
+}  // namespace detail
+
+template <std::size_t Dim>
+struct MatrixElementCacheEntry
+{
+    MatrixElementCacheEntry() = default;
+    MatrixElementCacheEntry(MatrixElementCacheEntry const&) = default;
+    MatrixElementCacheEntry(MatrixElementCacheEntry&&) = default;
+
+    MatrixElementCacheEntry(std::array<GlobalIndexType, Dim> const& indices,
+                            double value)
+        : indices{indices}, value{value}
+    {
+    }
+
+    MatrixElementCacheEntry& operator=(MatrixElementCacheEntry const&) =
+        default;
+    MatrixElementCacheEntry& operator=(MatrixElementCacheEntry&&) = default;
+
+    static bool compareIndices(MatrixElementCacheEntry<Dim> const& a,
+                               MatrixElementCacheEntry<Dim> const& b)
+    {
+        auto const& ia = a.indices;
+        auto const& ib = b.indices;
+
+        return std::lexicographical_compare(ia.begin(), ia.end(), ib.begin(),
+                                            ib.end());
+    }
+
+    bool indicesEqualTo(MatrixElementCacheEntry<Dim> const& other) const
+    {
+        return std::equal(indices.begin(), indices.end(),
+                          other.indices.begin());
+    }
+
+    std::array<GlobalIndexType, Dim> indices;
+    double value;
+};
+
+template <std::size_t Dim>
+class ConcurrentMatrixView
+{
+    static_assert(Dim == 1 || Dim == 2);
+    using GlobalMatOrVec =
+        std::conditional_t<Dim == 1, GlobalVector, GlobalMatrix>;
+
+public:
+    explicit ConcurrentMatrixView(GlobalMatOrVec& mat_or_vec)
+        : mat_or_vec_{mat_or_vec}
+    {
+    }
+
+    void add(std::vector<MatrixElementCacheEntry<Dim>> const& entries)
+    {
+        std::lock_guard<std::mutex> const lock(mutex_);
+
+        if constexpr (Dim == 2)
+        {
+            auto const n_cols = mat_or_vec_.getNumberOfColumns();
+
+            // TODO would be more efficient if our global matrix and vector
+            // implementations supported batch addition of matrix elements with
+            // arbitrary indices (not restricted to (n x m) shaped submatrices).
+            for (auto const [rc, value] : entries)
+            {
+                auto const [r, c] = rc;
+
+                assert(r >= 0);  // rows must not be ghost indices
+                auto const c_no_ghost =
+                    detail::transformToNonGhostIndex(c, n_cols);
+
+                mat_or_vec_.add(r, c_no_ghost, value);
+            }
+        }
+        else
+        {
+            // TODO batch addition would be more efficient. That needs the
+            // refactoring of the matrix element cache.
+            for (auto const [r, value] : entries)
+            {
+                mat_or_vec_.add(r.front(), value);
+            }
+        }
+    }
+
+private:
+    std::mutex mutex_;
+    GlobalMatOrVec& mat_or_vec_;
+};
+
+explicit ConcurrentMatrixView(GlobalVector&)->ConcurrentMatrixView<1>;
+explicit ConcurrentMatrixView(GlobalMatrix&)->ConcurrentMatrixView<2>;
+
+template <std::size_t Dim>
+class MatrixElementCache final
+{
+    static_assert(Dim == 1 || Dim == 2);
+    using GlobalMatView = ConcurrentMatrixView<Dim>;
+
+    static constexpr std::size_t cache_capacity = 1'000'000;
+
+public:
+    MatrixElementCache(GlobalMatView& mat_or_vec, Stats& stats)
+        : mat_or_vec_(mat_or_vec), stats_(stats)
+    {
+        cache_.reserve(cache_capacity);
+    }
+
+    void add(std::vector<double> const& local_data,
+             std::vector<GlobalIndexType> const& indices)
+    {
+        addToCache(local_data, indices);
+    }
+
+    ~MatrixElementCache() { addToGlobal(); }
+
+private:
+    void addToCache(std::vector<double> const& values,
+                    std::vector<GlobalIndexType> const& indices)
+    {
+        if (values.empty())
+        {
+            return;
+        }
+
+        ensureEnoughSpace(values.size());
+
+        addToCacheImpl(values, indices,
+                       std::integral_constant<std::size_t, Dim>{});
+    }
+
+    // Overload for vectors.
+    void addToCacheImpl(std::vector<double> const& values,
+                        std::vector<GlobalIndexType> const& indices,
+                        std::integral_constant<std::size_t, 1>)
+    {
+        auto const num_r_c = indices.size();
+
+        for (std::size_t r_local = 0; r_local < num_r_c; ++r_local)
+        {
+            ++stats_.count;
+            auto const value = values[r_local];
+
+            if (value == 0)
+            {
+                continue;
+            }
+            else
+            {
+                ++stats_.count_nonzero;
+            }
+
+            auto const r_global = indices[r_local];
+            cache_.emplace_back(std::array{r_global}, value);
+        }
+    }
+
+    // Overload for matrices.
+    void addToCacheImpl(std::vector<double> const& values,
+                        std::vector<GlobalIndexType> const& indices,
+                        std::integral_constant<std::size_t, 2>)
+    {
+        auto const num_r_c = indices.size();
+
+        // Note: There is an implicit storage order assumption, here!
+        auto const local_mat = MathLib::toMatrix(values, num_r_c, num_r_c);
+
+        for (std::size_t r_local = 0; r_local < num_r_c; ++r_local)
+        {
+            auto const r_global = indices[r_local];
+
+            for (std::size_t c_local = 0; c_local < num_r_c; ++c_local)
+            {
+                ++stats_.count;
+                auto const value = local_mat(r_local, c_local);
+
+                // TODO skipping zero values sometimes does not work together
+                // with the Eigen SparseLU linear solver. See also
+                // https://gitlab.opengeosys.org/ogs/ogs/-/merge_requests/4556#note_125561
+#if 0
+                if (value == 0)
+                {
+                    continue;
+                }
+#endif
+                ++stats_.count_nonzero;
+
+                auto const c_global = indices[c_local];
+                cache_.emplace_back(std::array{r_global, c_global}, value);
+            }
+        }
+    }
+
+    void ensureEnoughSpace(std::size_t const space_needed)
+    {
+        auto const size_initial = cache_.size();
+        auto const cap_initial = cache_.capacity();
+
+        if (size_initial + space_needed <= cap_initial)
+        {
+            return;
+        }
+
+        addToGlobal();
+
+        // ensure again that there is enough capacity (corner case, if initial
+        // capacity is too small because of whatever arcane reason)
+        auto const size_after = cache_.size();
+        auto const cap_after = cache_.capacity();
+
+        if (size_after + space_needed > cap_after)
+        {
+            cache_.reserve(size_after + 2 * space_needed);
+        }
+    }
+
+    void addToGlobal()
+    {
+        std::sort(cache_.begin(), cache_.end(),
+                  MatrixElementCacheEntry<Dim>::compareIndices);
+
+        mat_or_vec_.add(cache_);
+        stats_.count_global += cache_.size();
+        cache_.clear();
+    }
+
+    std::vector<MatrixElementCacheEntry<Dim>> cache_;
+    GlobalMatView& mat_or_vec_;
+    Stats& stats_;
+};
+
+class MultiMatrixElementCache final
+{
+    using GlobalMatrixView = ConcurrentMatrixView<2>;
+    using GlobalVectorView = ConcurrentMatrixView<1>;
+
+public:
+    MultiMatrixElementCache(GlobalMatrixView& M, GlobalMatrixView& K,
+                            GlobalVectorView& b, GlobalMatrixView& Jac,
+                            MultiStats& stats)
+        : cache_M_(M, stats.M),
+          cache_K_(K, stats.K),
+          cache_b_(b, stats.b),
+          cache_Jac_(Jac, stats.Jac)
+    {
+    }
+
+    void add(std::vector<double> const& local_M_data,
+             std::vector<double> const& local_K_data,
+             std::vector<double> const& local_b_data,
+             std::vector<double> const& local_Jac_data,
+             std::vector<GlobalIndexType> const& indices)
+    {
+        cache_M_.add(local_M_data, indices);
+        cache_K_.add(local_K_data, indices);
+        cache_b_.add(local_b_data, indices);
+        cache_Jac_.add(local_Jac_data, indices);
+    }
+
+private:
+    MatrixElementCache<2> cache_M_;
+    MatrixElementCache<2> cache_K_;
+    MatrixElementCache<1> cache_b_;
+    MatrixElementCache<2> cache_Jac_;
+};
+}  // namespace ProcessLib::Assembly
diff --git a/ProcessLib/Assembly/ParallelVectorMatrixAssembler.cpp b/ProcessLib/Assembly/ParallelVectorMatrixAssembler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e255fef352e252e4c63cc7613f08ee1a3ecc1ee0
--- /dev/null
+++ b/ProcessLib/Assembly/ParallelVectorMatrixAssembler.cpp
@@ -0,0 +1,237 @@
+/**
+ * \file
+ * \copyright
+ * Copyright (c) 2012-2023, OpenGeoSys Community (http://www.opengeosys.org)
+ *            Distributed under a Modified BSD License.
+ *              See accompanying file LICENSE.txt or
+ *              http://www.opengeosys.org/project/license
+ *
+ */
+
+#include "ParallelVectorMatrixAssembler.h"
+
+#include <cstdlib>
+#include <fstream>
+
+#include "BaseLib/StringTools.h"
+#include "NumLib/DOF/DOFTableUtil.h"
+#include "ProcessLib/Assembly/MatrixAssemblyStats.h"
+#include "ProcessLib/Assembly/MatrixElementCache.h"
+#include "ProcessLib/Assembly/MatrixOutput.h"
+
+namespace
+{
+void assembleWithJacobianOneElement(
+    const std::size_t mesh_item_id,
+    ProcessLib::LocalAssemblerInterface& local_assembler,
+    const NumLib::LocalToGlobalIndexMap& dof_table, const double t,
+    const double dt, const GlobalVector& x, const GlobalVector& xdot,
+    std::vector<double>& local_M_data, std::vector<double>& local_K_data,
+    std::vector<double>& local_b_data, std::vector<double>& local_Jac_data,
+    std::vector<GlobalIndexType>& indices,
+    ProcessLib::AbstractJacobianAssembler& jacobian_assembler,
+    ProcessLib::Assembly::MultiMatrixElementCache& cache)
+{
+    indices = NumLib::getIndices(mesh_item_id, dof_table);
+
+    local_M_data.clear();
+    local_K_data.clear();
+    local_b_data.clear();
+    local_Jac_data.clear();
+
+    auto const local_x = x.get(indices);
+    auto const local_xdot = xdot.get(indices);
+    jacobian_assembler.assembleWithJacobian(
+        local_assembler, t, dt, local_x, local_xdot, local_M_data, local_K_data,
+        local_b_data, local_Jac_data);
+
+    if (local_Jac_data.empty())
+    {
+        OGS_FATAL(
+            "No Jacobian has been assembled! This might be due to "
+            "programming errors in the local assembler of the "
+            "current process.");
+    }
+
+    cache.add(local_M_data, local_K_data, local_b_data, local_Jac_data,
+              indices);
+}
+
+int getNumberOfThreads()
+{
+    char const* const num_threads_env = std::getenv("OGS_ASM_THREADS");
+
+    if (!num_threads_env)
+    {
+        return 1;
+    }
+
+    if (std::strlen(num_threads_env) == 0)
+    {
+        OGS_FATAL("The environment variable OGS_ASM_THREADS is set but empty.");
+    }
+
+    std::string num_threads_str{num_threads_env};
+    BaseLib::trim(num_threads_str);
+
+    std::istringstream num_threads_iss{num_threads_str};
+    int num_threads = -1;
+
+    num_threads_iss >> num_threads;
+
+    if (!num_threads_iss)
+    {
+        OGS_FATAL("Error parsing OGS_ASM_THREADS (= \"{}\").", num_threads_env);
+    }
+
+    if (!num_threads_iss.eof())
+    {
+        OGS_FATAL(
+            "Error parsing OGS_ASM_THREADS (= \"{}\"): not read entirely, the "
+            "remainder is \"{}\"",
+            num_threads_env,
+            num_threads_iss.str().substr(num_threads_iss.tellg()));
+    }
+
+    if (num_threads < 1)
+    {
+        OGS_FATAL(
+            "You asked (via OGS_ASM_THREADS) to assemble with {} < 1 thread.",
+            num_threads);
+    }
+
+    return num_threads;
+}
+}  // namespace
+
+namespace ProcessLib::Assembly
+{
+ParallelVectorMatrixAssembler::ParallelVectorMatrixAssembler(
+    AbstractJacobianAssembler& jacobian_assembler)
+    : jacobian_assembler_{jacobian_assembler},
+      num_threads_(getNumberOfThreads())
+{
+}
+
+void ParallelVectorMatrixAssembler::assembleWithJacobian(
+    BaseLib::PolymorphicRandomAccessContainerView<
+        LocalAssemblerInterface> const& local_assemblers,
+    std::vector<std::size_t> const& active_elements,
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>> const&
+        dof_tables,
+    const double t, double const dt, std::vector<GlobalVector*> const& xs,
+    std::vector<GlobalVector*> const& xdots, int const process_id,
+    GlobalMatrix& M, GlobalMatrix& K, GlobalVector& b, GlobalMatrix& Jac)
+{
+    // checks //////////////////////////////////////////////////////////////////
+    if (process_id != 0)
+    {
+        OGS_FATAL("Process id is not 0 but {}", process_id);
+    }
+
+    if (dof_tables.size() != 1)
+    {
+        OGS_FATAL("More than 1 dof table");
+    }
+    auto const& dof_table = dof_tables.front().get();
+
+    if (xs.size() != 1)
+    {
+        OGS_FATAL("More than 1 solution vector");
+    }
+    auto const& x = *xs.front();
+
+    if (xdots.size() != 1)
+    {
+        OGS_FATAL("More than 1 xdot vector");
+    }
+    auto const& xdot = *xdots.front();
+
+    // algorithm ///////////////////////////////////////////////////////////////
+
+    auto stats = CumulativeStats<MultiStats>::create();
+    ConcurrentMatrixView M_view(M);
+    ConcurrentMatrixView K_view(K);
+    ConcurrentMatrixView b_view(b);
+    ConcurrentMatrixView Jac_view(Jac);
+
+#pragma omp parallel num_threads(num_threads_)
+    {
+#ifdef _OPENMP
+#pragma omp single nowait
+        {
+            INFO("Number of threads: {}", omp_get_num_threads());
+        }
+#endif
+
+        // temporary data only stored here in order to avoid frequent memory
+        // reallocations.
+        std::vector<double> local_M_data;
+        std::vector<double> local_K_data;
+        std::vector<double> local_b_data;
+        std::vector<double> local_Jac_data;
+        std::vector<GlobalIndexType> indices;
+
+        // copy to avoid concurrent access
+        auto const jac_asm = jacobian_assembler_.copy();
+        auto stats_this_thread = stats->clone();
+
+        MultiMatrixElementCache cache{M_view, K_view, b_view, Jac_view,
+                                      stats_this_thread->data};
+
+        // TODO corner case: what if all elements on a submesh are deactivated?
+        if (active_elements.empty())
+        {
+            // due to MSVC++ error:
+            // error C3016: 'element_id': index variable in OpenMP 'for'
+            // statement must have signed integral type
+            std::ptrdiff_t const n_loc_asm =
+                static_cast<std::ptrdiff_t>(local_assemblers.size());
+
+#pragma omp for
+            for (std::ptrdiff_t element_id = 0; element_id < n_loc_asm;
+                 ++element_id)
+            {
+                auto& loc_asm = local_assemblers[element_id];
+
+                assembleWithJacobianOneElement(
+                    element_id, loc_asm, dof_table, t, dt, x, xdot,
+                    local_M_data, local_K_data, local_b_data, local_Jac_data,
+                    indices, *jac_asm, cache);
+
+                local_matrix_output_(t, process_id, element_id, local_M_data,
+                                     local_K_data, local_b_data,
+                                     &local_Jac_data);
+            }
+        }
+        else
+        {
+            // due to MSVC++ error:
+            // error C3016: 'i': index variable in OpenMP 'for' statement must
+            // have signed integral type
+            std::ptrdiff_t const n_act_elem =
+                static_cast<std::ptrdiff_t>(active_elements.size());
+
+#pragma omp for
+            for (std::ptrdiff_t i = 0; i < n_act_elem; ++i)
+            {
+                auto const element_id = active_elements[i];
+                auto& loc_asm = local_assemblers[element_id];
+
+                assembleWithJacobianOneElement(
+                    element_id, loc_asm, dof_table, t, dt, x, xdot,
+                    local_M_data, local_K_data, local_b_data, local_Jac_data,
+                    indices, *jac_asm, cache);
+
+                local_matrix_output_(t, process_id, element_id, local_M_data,
+                                     local_K_data, local_b_data,
+                                     &local_Jac_data);
+            }
+        }
+    }  // OpenMP parallel section
+
+    stats->print();
+
+    global_matrix_output_(t, process_id, M, K, b, &Jac);
+}
+}  // namespace ProcessLib::Assembly
diff --git a/ProcessLib/Assembly/ParallelVectorMatrixAssembler.h b/ProcessLib/Assembly/ParallelVectorMatrixAssembler.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f87c1f55c5d30d364dbee2a9449d67adfa476a6
--- /dev/null
+++ b/ProcessLib/Assembly/ParallelVectorMatrixAssembler.h
@@ -0,0 +1,45 @@
+/**
+ * \file
+ * \copyright
+ * Copyright (c) 2012-2023, OpenGeoSys Community (http://www.opengeosys.org)
+ *            Distributed under a Modified BSD License.
+ *              See accompanying file LICENSE.txt or
+ *              http://www.opengeosys.org/project/license
+ *
+ */
+
+#pragma once
+
+#include "BaseLib/ContainerTools.h"
+#include "NumLib/DOF/LocalToGlobalIndexMap.h"
+#include "ProcessLib/AbstractJacobianAssembler.h"
+#include "ProcessLib/Assembly/MatrixOutput.h"
+
+namespace ProcessLib::Assembly
+{
+class ParallelVectorMatrixAssembler
+{
+public:
+    explicit ParallelVectorMatrixAssembler(
+        AbstractJacobianAssembler& jacobian_assembler);
+
+    void assembleWithJacobian(
+        BaseLib::PolymorphicRandomAccessContainerView<
+            LocalAssemblerInterface> const& local_assemblers,
+        std::vector<std::size_t> const& active_elements,
+        std::vector<
+            std::reference_wrapper<NumLib::LocalToGlobalIndexMap>> const&
+            dof_tables,
+        const double t, double const dt, std::vector<GlobalVector*> const& xs,
+        std::vector<GlobalVector*> const& xdots, int const process_id,
+        GlobalMatrix& M, GlobalMatrix& K, GlobalVector& b, GlobalMatrix& Jac);
+
+private:
+    AbstractJacobianAssembler& jacobian_assembler_;
+    LocalMatrixOutput local_matrix_output_;
+    GlobalMatrixOutput global_matrix_output_;
+
+    int const num_threads_;
+};
+
+}  // namespace ProcessLib::Assembly
diff --git a/ProcessLib/AssemblyMixin.h b/ProcessLib/AssemblyMixin.h
index 911e92027ee6230a01bbcc8bf2ee6b199e62eb21..fc17e2ba97d2f576f41a66df835bab3fea9e6103 100644
--- a/ProcessLib/AssemblyMixin.h
+++ b/ProcessLib/AssemblyMixin.h
@@ -12,7 +12,7 @@
 
 #include "MathLib/LinAlg/LinAlg.h"
 #include "NumLib/DOF/GlobalMatrixProviders.h"
-#include "NumLib/NumericsConfig.h"
+#include "ProcessLib/Assembly/ParallelVectorMatrixAssembler.h"
 #include "ProcessLib/ProcessVariable.h"
 #include "VectorMatrixAssembler.h"
 
@@ -46,6 +46,9 @@ class AssemblyMixinBase
     };
 
 protected:
+    explicit AssemblyMixinBase(AbstractJacobianAssembler& jacobian_assembler)
+        : pvma_{jacobian_assembler} {};
+
     void initializeAssemblyOnSubmeshes(
         const int process_id,
         MeshLib::Mesh& bulk_mesh,
@@ -74,8 +77,11 @@ protected:
     std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>
         residuum_vectors_bulk_;
 
+    /// ID of the b vector on submeshes, cf. NumLib::VectorProvider.
     std::size_t b_submesh_id_ = 0;
 
+    Assembly::ParallelVectorMatrixAssembler pvma_;
+
 private:
     ActiveElementIDsState ids_state_ = ActiveElementIDsState::UNINITIALIZED;
 };
@@ -90,7 +96,7 @@ class AssemblyMixin : private AssemblyMixinBase
 {
     // Enforce correct use of CRTP, i.e., that Process is derived from
     // AssemblyMixin<Process>.
-    AssemblyMixin() = default;
+    using AssemblyMixinBase::AssemblyMixinBase;
     friend Process;
 
 public:
@@ -135,16 +141,21 @@ public:
         AssemblyMixinBase::updateActiveElements(pv);
     }
 
-    void assemble(const double t, double const dt,
-                  std::vector<GlobalVector*> const& x,
-                  std::vector<GlobalVector*> const& xdot, int const process_id,
-                  GlobalMatrix& M, GlobalMatrix& K, GlobalVector& b)
+    // cppcheck-suppress functionStatic
+    void assemble(const double /*t*/, double const /*dt*/,
+                  std::vector<GlobalVector*> const& /*x*/,
+                  std::vector<GlobalVector*> const& /*xdot*/,
+                  int const /*process_id*/, GlobalMatrix& /*M*/,
+                  GlobalMatrix& /*K*/, GlobalVector& /*b*/)
     {
+        /*
         DBUG("AssemblyMixin assemble(t={}, dt={}, process_id={}).", t, dt,
              process_id);
 
-        assembleGeneric(&VectorMatrixAssembler::assemble, t, dt, x, xdot,
-                        process_id, M, K, b);
+        assembleGeneric(&Assembly::ParallelVectorMatrixAssembler::assemble, t,
+        dt, x, xdot, process_id, M, K, b);
+        */
+        OGS_FATAL("Not yet implemented.");
     }
 
     void assembleWithJacobian(const double t, double const dt,
@@ -157,8 +168,9 @@ public:
         DBUG("AssemblyMixin assembleWithJacobian(t={}, dt={}, process_id={}).",
              t, dt, process_id);
 
-        assembleGeneric(&VectorMatrixAssembler::assembleWithJacobian, t, dt, x,
-                        xdot, process_id, M, K, b, Jac);
+        assembleGeneric(
+            &Assembly::ParallelVectorMatrixAssembler::assembleWithJacobian, t,
+            dt, x, xdot, process_id, M, K, b, Jac);
     }
 
 private:
@@ -181,6 +193,8 @@ private:
         std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>> const
             dof_tables{*derived()._local_to_global_index_map};
 
+        auto const& loc_asms = derived().local_assemblers_;
+
         if (!submesh_assembly_data_.empty())
         {
             auto& b_submesh = NumLib::GlobalVectorProvider::provider.getVector(
@@ -190,11 +204,9 @@ private:
             {
                 b_submesh.setZero();
 
-                GlobalExecutor::executeSelectedMemberDereferenced(
-                    derived()._global_assembler, global_assembler_method,
-                    derived().local_assemblers_, sad.active_element_ids,
-                    dof_tables, t, dt, x, xdot, process_id, M, K, b_submesh,
-                    jac_or_not_jac...);
+                (pvma_.*global_assembler_method)(
+                    loc_asms, sad.active_element_ids, dof_tables, t, dt, x,
+                    xdot, process_id, M, K, b_submesh, jac_or_not_jac...);
 
                 MathLib::LinAlg::axpy(b, 1.0, b_submesh);
 
@@ -206,16 +218,14 @@ private:
         }
         else
         {
-            // convention: process variable 0 governs where assembly takes place
-            // (active element IDs)
+            // convention: process variable 0 governs where assembly takes
+            // place (active element IDs)
             ProcessLib::ProcessVariable const& pv =
                 derived().getProcessVariables(process_id)[0];
 
-            GlobalExecutor::executeSelectedMemberDereferenced(
-                derived()._global_assembler, global_assembler_method,
-                derived().local_assemblers_, pv.getActiveElementIDs(),
-                dof_tables, t, dt, x, xdot, process_id, M, K, b,
-                jac_or_not_jac...);
+            (pvma_.*global_assembler_method)(
+                loc_asms, pv.getActiveElementIDs(), dof_tables, t, dt, x, xdot,
+                process_id, M, K, b, jac_or_not_jac...);
         }
 
         AssemblyMixinBase::copyResiduumVectorsToBulkMesh(
diff --git a/ProcessLib/CentralDifferencesJacobianAssembler.cpp b/ProcessLib/CentralDifferencesJacobianAssembler.cpp
index 82eb141334c79e2b7b0c2bc769888d56af52b3a5..4796bf5d2dc57794ce1b833946dbecad6729b4d5 100644
--- a/ProcessLib/CentralDifferencesJacobianAssembler.cpp
+++ b/ProcessLib/CentralDifferencesJacobianAssembler.cpp
@@ -221,4 +221,10 @@ createCentralDifferencesJacobianAssembler(BaseLib::ConfigTree const& config)
         std::move(abs_eps));
 }
 
+std::unique_ptr<AbstractJacobianAssembler>
+CentralDifferencesJacobianAssembler::copy() const
+{
+    return std::make_unique<CentralDifferencesJacobianAssembler>(*this);
+}
+
 }  // namespace ProcessLib
diff --git a/ProcessLib/CentralDifferencesJacobianAssembler.h b/ProcessLib/CentralDifferencesJacobianAssembler.h
index 2b1bec489fabb18076b4c40169fb19d80216dceb..320abf6908465053eb51a68301e74652063eb718 100644
--- a/ProcessLib/CentralDifferencesJacobianAssembler.h
+++ b/ProcessLib/CentralDifferencesJacobianAssembler.h
@@ -11,12 +11,13 @@
 #pragma once
 
 #include <memory>
+
 #include "AbstractJacobianAssembler.h"
 
 namespace BaseLib
 {
 class ConfigTree;
-}  // BaseLib
+}  // namespace BaseLib
 
 namespace ProcessLib
 {
@@ -59,6 +60,8 @@ public:
                               std::vector<double>& local_b_data,
                               std::vector<double>& local_Jac_data) override;
 
+    std::unique_ptr<AbstractJacobianAssembler> copy() const override;
+
 private:
     std::vector<double> const _absolute_epsilons;
 
diff --git a/ProcessLib/CompareJacobiansJacobianAssembler.cpp b/ProcessLib/CompareJacobiansJacobianAssembler.cpp
index 45afa470eeebf2378beb2e75c9fa6267fe9e3b60..0c8388b9dbac4fde29cbc3cee2e922aa15c7f77a 100644
--- a/ProcessLib/CompareJacobiansJacobianAssembler.cpp
+++ b/ProcessLib/CompareJacobiansJacobianAssembler.cpp
@@ -396,6 +396,15 @@ void CompareJacobiansJacobianAssembler::assembleWithJacobian(
     }
 }
 
+std::unique_ptr<AbstractJacobianAssembler>
+CompareJacobiansJacobianAssembler::copy() const
+{
+    OGS_FATAL(
+        "CompareJacobiansJacobianAssembler should not be copied. This class "
+        "logs to a file, which would most certainly break after copying "
+        "(concurrent file access) with the current implementation.");
+}
+
 std::unique_ptr<CompareJacobiansJacobianAssembler>
 createCompareJacobiansJacobianAssembler(BaseLib::ConfigTree const& config)
 {
@@ -427,5 +436,4 @@ createCompareJacobiansJacobianAssembler(BaseLib::ConfigTree const& config)
         std::move(asm1), std::move(asm2), abs_tol, rel_tol, fail_on_error,
         log_file);
 }
-
 }  // namespace ProcessLib
diff --git a/ProcessLib/CompareJacobiansJacobianAssembler.h b/ProcessLib/CompareJacobiansJacobianAssembler.h
index 29579a7d59577ea0170238bbadebe67fdaedaba4..74b85599fa8550d19320e64bf83d9ac7f105f53c 100644
--- a/ProcessLib/CompareJacobiansJacobianAssembler.h
+++ b/ProcessLib/CompareJacobiansJacobianAssembler.h
@@ -13,6 +13,7 @@
 #include <fstream>
 #include <limits>
 #include <memory>
+
 #include "AbstractJacobianAssembler.h"
 
 namespace BaseLib
@@ -60,6 +61,8 @@ public:
                               std::vector<double>& local_b_data,
                               std::vector<double>& local_Jac_data) override;
 
+    std::unique_ptr<AbstractJacobianAssembler> copy() const override;
+
 private:
     std::unique_ptr<AbstractJacobianAssembler> _asm1;
     std::unique_ptr<AbstractJacobianAssembler> _asm2;
diff --git a/ProcessLib/ForwardDifferencesJacobianAssembler.cpp b/ProcessLib/ForwardDifferencesJacobianAssembler.cpp
index 067b71fb7bb1685c4d920b3fffec304f0d7dcff4..2a22c0644543b3d4f66b565faf56df2928bd7d1f 100644
--- a/ProcessLib/ForwardDifferencesJacobianAssembler.cpp
+++ b/ProcessLib/ForwardDifferencesJacobianAssembler.cpp
@@ -153,4 +153,10 @@ void ForwardDifferencesJacobianAssembler::assembleWithJacobian(
     }
 }
 
+std::unique_ptr<AbstractJacobianAssembler>
+ForwardDifferencesJacobianAssembler::copy() const
+{
+    return std::make_unique<ForwardDifferencesJacobianAssembler>(*this);
+}
+
 }  // namespace ProcessLib
diff --git a/ProcessLib/ForwardDifferencesJacobianAssembler.h b/ProcessLib/ForwardDifferencesJacobianAssembler.h
index a7045aa6c39796cf233d043a72a6e9cd08f9fe71..c8e695702dadb71c72c6ccd47a801309ffc6fc18 100644
--- a/ProcessLib/ForwardDifferencesJacobianAssembler.h
+++ b/ProcessLib/ForwardDifferencesJacobianAssembler.h
@@ -52,6 +52,8 @@ public:
                               std::vector<double>& local_b_data,
                               std::vector<double>& local_Jac_data) override;
 
+    std::unique_ptr<AbstractJacobianAssembler> copy() const override;
+
 private:
     std::vector<double> const _absolute_epsilons;
 
diff --git a/ProcessLib/Process.cpp b/ProcessLib/Process.cpp
index 850d3948d72ee54cfaadf794ad6e0b49f9796e79..0319ed445afbaecd46538aff120f5052c7dfb95c 100644
--- a/ProcessLib/Process.cpp
+++ b/ProcessLib/Process.cpp
@@ -34,7 +34,8 @@ Process::Process(
     : name(std::move(name_)),
       _mesh(mesh),
       _secondary_variables(std::move(secondary_variables)),
-      _global_assembler(std::move(jacobian_assembler)),
+      _jacobian_assembler(std::move(jacobian_assembler)),
+      _global_assembler(*_jacobian_assembler),
       _use_monolithic_scheme(use_monolithic_scheme),
       _coupled_solutions(nullptr),
       _integration_order(integration_order),
diff --git a/ProcessLib/Process.h b/ProcessLib/Process.h
index 693cf75b92be793c9c187e75b74370e4f23c7830..3832505cc3f680df8cbbd83d9b02d6ce2b87064d 100644
--- a/ProcessLib/Process.h
+++ b/ProcessLib/Process.h
@@ -339,6 +339,9 @@ protected:
 
     SecondaryVariableCollection _secondary_variables;
 
+    // TODO (CL) switch to the parallel vector matrix assembler here, once
+    // feature complete, or use the assembly mixin and remove these two members.
+    std::unique_ptr<ProcessLib::AbstractJacobianAssembler> _jacobian_assembler;
     VectorMatrixAssembler _global_assembler;
 
     const bool _use_monolithic_scheme;
diff --git a/ProcessLib/TH2M/TH2MFEM-impl.h b/ProcessLib/TH2M/TH2MFEM-impl.h
index 2d6bcf372e8c60ad54c79bdd684c991f2c8b44bf..c85f53f6d30f6ed76726c94fbf4d9bda99421b8a 100644
--- a/ProcessLib/TH2M/TH2MFEM-impl.h
+++ b/ProcessLib/TH2M/TH2MFEM-impl.h
@@ -124,6 +124,8 @@ std::vector<ConstitutiveVariables<DisplacementDim>> TH2MLocalAssembler<
     std::vector<ConstitutiveVariables<DisplacementDim>>
         ip_constitutive_variables(n_integration_points);
 
+    PhaseTransitionModelVariables ptmv;
+
     for (unsigned ip = 0; ip < n_integration_points; ip++)
     {
         auto& ip_data = _ip_data[ip];
@@ -338,8 +340,9 @@ std::vector<ConstitutiveVariables<DisplacementDim>> TH2MLocalAssembler<
 
         // constitutive model object as specified in process creation
         auto& ptm = *_process_data.phase_transition_model_;
-        ptm.computeConstitutiveVariables(&medium, vars, pos, t, dt);
-        auto& c = ptm.cv;
+        ptmv = ptm.updateConstitutiveVariables(ptmv, &medium, vars, pos, t, dt);
+        auto const& c = ptmv;
+
         auto const phi_L = ip_data.s_L * ip_data.phi;
         auto const phi_G = (1. - ip_data.s_L) * ip_data.phi;
 
diff --git a/ProcessLib/TH2M/TH2MProcess.cpp b/ProcessLib/TH2M/TH2MProcess.cpp
index a53373046b3e0d3eaa5ad5db5820f4e55b40a5ce..537d011d8a4ea5c367515a52398a846ac92a8a3e 100644
--- a/ProcessLib/TH2M/TH2MProcess.cpp
+++ b/ProcessLib/TH2M/TH2MProcess.cpp
@@ -41,6 +41,7 @@ TH2MProcess<DisplacementDim>::TH2MProcess(
     : Process(std::move(name), mesh, std::move(jacobian_assembler), parameters,
               integration_order, std::move(process_variables),
               std::move(secondary_variables), use_monolithic_scheme),
+      AssemblyMixin<TH2MProcess<DisplacementDim>>{*_jacobian_assembler},
       _process_data(std::move(process_data))
 {
     // TODO (naumov) remove ip suffix. Probably needs modification of the mesh
diff --git a/ProcessLib/ThermoRichardsFlow/ThermoRichardsFlowProcess.cpp b/ProcessLib/ThermoRichardsFlow/ThermoRichardsFlowProcess.cpp
index e4d15a92b0cad829932595b8374d3f97800fded6..ea855b43a2f748b68c0f07c30641630ea40267e5 100644
--- a/ProcessLib/ThermoRichardsFlow/ThermoRichardsFlowProcess.cpp
+++ b/ProcessLib/ThermoRichardsFlow/ThermoRichardsFlowProcess.cpp
@@ -157,10 +157,9 @@ void ThermoRichardsFlowProcess::assembleWithJacobianConcreteProcess(
 
     ProcessLib::ProcessVariable const& pv = getProcessVariables(process_id)[0];
 
-    GlobalExecutor::executeSelectedMemberDereferenced(
-        _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, pv.getActiveElementIDs(), dof_tables, t, dt, x, xdot,
-        process_id, M, K, b, Jac);
+    _pvma.assembleWithJacobian(_local_assemblers, pv.getActiveElementIDs(),
+                               dof_tables, t, dt, x, xdot, process_id, M, K, b,
+                               Jac);
 
     auto copyRhs = [&](int const variable_id, auto& output_vector)
     {
diff --git a/ProcessLib/ThermoRichardsFlow/ThermoRichardsFlowProcess.h b/ProcessLib/ThermoRichardsFlow/ThermoRichardsFlowProcess.h
index c93f2ee3091f9246a12c147b7d07ce9d9c25f6b6..8b1eb63d4d464eb80781866cc20749d7f7f45199 100644
--- a/ProcessLib/ThermoRichardsFlow/ThermoRichardsFlowProcess.h
+++ b/ProcessLib/ThermoRichardsFlow/ThermoRichardsFlowProcess.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include "LocalAssemblerInterface.h"
+#include "ProcessLib/Assembly/ParallelVectorMatrixAssembler.h"
 #include "ProcessLib/Process.h"
 #include "ThermoRichardsFlowProcessData.h"
 
@@ -67,13 +68,11 @@ namespace ThermoRichardsFlow
  *  \f$\beta\f$ is a composite coefficient by the liquid compressibility and
  * solid compressibility, \f$\alpha_B\f$ is the Biot's constant,
  * \f$\alpha_T^s\f$ is the linear thermal  expansivity of solid, \f$Q_H\f$
- * is the point source or sink term,  \f$H(S-1)\f$ is the Heaviside function, and
- * \f$ \mathbf u\f$ is the displacement. While this process does not contain a fully
- * mechanical coupling, simplfied expressions can be given to approximate the latter
- * term under certain stress conditions.
- * The liquid velocity \f$\mathbf{v}^l\f$ is
- * described by the Darcy's law as
- * \f[
+ * is the point source or sink term,  \f$H(S-1)\f$ is the Heaviside function,
+ * and \f$ \mathbf u\f$ is the displacement. While this process does not contain
+ * a fully mechanical coupling, simplfied expressions can be given to
+ * approximate the latter term under certain stress conditions. The liquid
+ * velocity \f$\mathbf{v}^l\f$ is described by the Darcy's law as \f[
  * \mathbf{v}^l=-\frac{{\mathbf k} k_{ref}}{\mu} (\nabla p - \rho^l \mathbf g)
  * \f]
  * with \f${\mathbf k}\f$ the intrinsic permeability, \f$k_{ref}\f$ the relative
@@ -136,6 +135,8 @@ private:
     std::unique_ptr<MeshLib::MeshSubset const> _mesh_subset_base_nodes;
     ThermoRichardsFlowProcessData _process_data;
 
+    Assembly::ParallelVectorMatrixAssembler _pvma{*_jacobian_assembler};
+
     std::vector<std::unique_ptr<LocalAssemblerIF>> _local_assemblers;
 
     /// Sparsity pattern for the flow equation, and it is initialized only if
diff --git a/ProcessLib/ThermoRichardsMechanics/Tests.cmake b/ProcessLib/ThermoRichardsMechanics/Tests.cmake
index cb8f33a6e59fa021939929fb7e2f19262194ac19..69ae3a3b7cd9aa82184a6401714d3a23104bbc0a 100644
--- a/ProcessLib/ThermoRichardsMechanics/Tests.cmake
+++ b/ProcessLib/ThermoRichardsMechanics/Tests.cmake
@@ -219,7 +219,7 @@ AddTest(
     PointHeatSource_quarter_002_2nd_r_lt_2_ts_10_t_50000.000000.vtu PointHeatSource_quarter_002_2nd_r_lt_2_ts_10_t_50000.000000.vtu sigma sigma 1.1e-5 0
     PointHeatSource_quarter_002_2nd_r_lt_2_ts_10_t_50000.000000.vtu PointHeatSource_quarter_002_2nd_r_lt_2_ts_10_t_50000.000000.vtu HeatFlowRate HeatFlowRate 6.1e-12 0
     PointHeatSource_quarter_002_2nd_r_lt_2_ts_10_t_50000.000000.vtu PointHeatSource_quarter_002_2nd_r_lt_2_ts_10_t_50000.000000.vtu MassFlowRate MassFlowRate 1e-15 0
-    PointHeatSource_quarter_002_2nd_r_lt_2_ts_10_t_50000.000000.vtu PointHeatSource_quarter_002_2nd_r_lt_2_ts_10_t_50000.000000.vtu NodalForces NodalForces 2e-8 0
+    PointHeatSource_quarter_002_2nd_r_lt_2_ts_10_t_50000.000000.vtu PointHeatSource_quarter_002_2nd_r_lt_2_ts_10_t_50000.000000.vtu NodalForces NodalForces 2.1e-8 0
 )
 
 AddTest(
diff --git a/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.cpp b/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.cpp
index acdf602bfa9b85cb998d37d48c10c6a5a70587b4..c4204f086ac487268d7cbefd04fa68743a8dfefa 100644
--- a/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.cpp
+++ b/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.cpp
@@ -44,6 +44,9 @@ ThermoRichardsMechanicsProcess<DisplacementDim, ConstitutiveTraits>::
     : Process(std::move(name), mesh, std::move(jacobian_assembler), parameters,
               integration_order, std::move(process_variables),
               std::move(secondary_variables), use_monolithic_scheme),
+      AssemblyMixin<
+          ThermoRichardsMechanicsProcess<DisplacementDim, ConstitutiveTraits>>{
+          *_jacobian_assembler},
       process_data_(std::move(process_data))
 {
     ProcessLib::Reflection::addReflectedIntegrationPointWriters<
diff --git a/ProcessLib/VectorMatrixAssembler.cpp b/ProcessLib/VectorMatrixAssembler.cpp
index a6afd7f2a8b925f4959b52a6dff56a77e5eff272..2998ba40c985ecb0ea8fcde5560e397b02805b60 100644
--- a/ProcessLib/VectorMatrixAssembler.cpp
+++ b/ProcessLib/VectorMatrixAssembler.cpp
@@ -21,8 +21,8 @@
 namespace ProcessLib
 {
 VectorMatrixAssembler::VectorMatrixAssembler(
-    std::unique_ptr<AbstractJacobianAssembler>&& jacobian_assembler)
-    : _jacobian_assembler(std::move(jacobian_assembler))
+    AbstractJacobianAssembler& jacobian_assembler)
+    : _jacobian_assembler(jacobian_assembler)
 {
 }
 
@@ -133,7 +133,7 @@ void VectorMatrixAssembler::assembleWithJacobian(
     {
         auto const local_x = x[process_id]->get(indices);
         auto const local_xdot = xdot[process_id]->get(indices);
-        _jacobian_assembler->assembleWithJacobian(
+        _jacobian_assembler.assembleWithJacobian(
             local_assembler, t, dt, local_x, local_xdot, _local_M_data,
             _local_K_data, _local_b_data, _local_Jac_data);
     }
@@ -147,7 +147,7 @@ void VectorMatrixAssembler::assembleWithJacobian(
             getCoupledLocalSolutions(xdot, indices_of_processes);
         auto const local_xdot = MathLib::toVector(local_coupled_xdots);
 
-        _jacobian_assembler->assembleWithJacobianForStaggeredScheme(
+        _jacobian_assembler.assembleWithJacobianForStaggeredScheme(
             local_assembler, t, dt, local_x, local_xdot, process_id,
             _local_M_data, _local_K_data, _local_b_data, _local_Jac_data);
     }
diff --git a/ProcessLib/VectorMatrixAssembler.h b/ProcessLib/VectorMatrixAssembler.h
index a9621887cf655896f11b9d08d05a8a82b88cc399..e6a871a06da87684bf5eacd1cadc64b8ce14f056 100644
--- a/ProcessLib/VectorMatrixAssembler.h
+++ b/ProcessLib/VectorMatrixAssembler.h
@@ -34,7 +34,7 @@ class VectorMatrixAssembler final
 {
 public:
     explicit VectorMatrixAssembler(
-        std::unique_ptr<AbstractJacobianAssembler>&& jacobian_assembler);
+        AbstractJacobianAssembler& jacobian_assembler);
 
     void preAssemble(const std::size_t mesh_item_id,
                      LocalAssemblerInterface& local_assembler,
@@ -73,7 +73,7 @@ private:
     std::vector<double> _local_Jac_data;
 
     //! Used to assemble the Jacobian.
-    std::unique_ptr<AbstractJacobianAssembler> _jacobian_assembler;
+    AbstractJacobianAssembler& _jacobian_assembler;
 
     Assembly::LocalMatrixOutput _local_output;
 };
diff --git a/Tests/Data/TH2M/submesh_residuum_assembly/p_G.xml b/Tests/Data/TH2M/submesh_residuum_assembly/p_G.xml
index c9c204aab9b1fc165894dfd03d1f462ea68a85bc..cd8022ca8472db6fc8d5b7571523998ac5bc019e 100644
--- a/Tests/Data/TH2M/submesh_residuum_assembly/p_G.xml
+++ b/Tests/Data/TH2M/submesh_residuum_assembly/p_G.xml
@@ -108,7 +108,7 @@
                 <absolute_tolerance>1e-15</absolute_tolerance>
                 <relative_tolerance>0</relative_tolerance>
             </vtkdiff>
-            
+
             <vtkdiff>
                 <regex>p_G.*_ts_[0-9]*_.*\.vtu</regex>
                 <field>gas_pressure_interpolated</field>
@@ -121,11 +121,11 @@
                 <absolute_tolerance>1e-15</absolute_tolerance>
                 <relative_tolerance>0</relative_tolerance>
             </vtkdiff>
-            
+
             <vtkdiff>
                 <regex>p_G.*_ts_[0-9]*_.*\.vtu</regex>
                 <field>capillary_pressure_interpolated</field>
-                <absolute_tolerance>5e-10</absolute_tolerance>
+                <absolute_tolerance>5.3e-10</absolute_tolerance>
                 <relative_tolerance>0</relative_tolerance>
             </vtkdiff>
             <vtkdiff>
@@ -134,7 +134,7 @@
                 <absolute_tolerance>1e-15</absolute_tolerance>
                 <relative_tolerance>0</relative_tolerance>
             </vtkdiff>
-            
+
             <vtkdiff>
                 <regex>p_G.*_ts_[0-9]*_.*\.vtu</regex>
                 <field>displacement</field>
diff --git a/Tests/Data/TH2M/submesh_residuum_assembly/u.xml b/Tests/Data/TH2M/submesh_residuum_assembly/u.xml
index 1c98385ec8c288ad3b96d1060b6b9120b4fbc4ce..cc7ef6a6c77aa3ee2af9e9c2f3519b9087d980bb 100644
--- a/Tests/Data/TH2M/submesh_residuum_assembly/u.xml
+++ b/Tests/Data/TH2M/submesh_residuum_assembly/u.xml
@@ -94,11 +94,11 @@
                 <absolute_tolerance>1e-15</absolute_tolerance>
                 <relative_tolerance>0</relative_tolerance>
             </vtkdiff>
-            
+
             <vtkdiff>
                 <regex>u.*_ts_[0-9]*_.*\.vtu</regex>
                 <field>gas_pressure_interpolated</field>
-                <absolute_tolerance>4.5e-13</absolute_tolerance>
+                <absolute_tolerance>4.6e-13</absolute_tolerance>
                 <relative_tolerance>0</relative_tolerance>
             </vtkdiff>
             <vtkdiff>
@@ -107,11 +107,11 @@
                 <absolute_tolerance>1e-15</absolute_tolerance>
                 <relative_tolerance>0</relative_tolerance>
             </vtkdiff>
-            
+
             <vtkdiff>
                 <regex>u.*_ts_[0-9]*_.*\.vtu</regex>
                 <field>capillary_pressure_interpolated</field>
-                <absolute_tolerance>4.5e-8</absolute_tolerance>
+                <absolute_tolerance>4.6e-8</absolute_tolerance>
                 <relative_tolerance>0</relative_tolerance>
             </vtkdiff>
             <vtkdiff>
@@ -120,7 +120,7 @@
                 <absolute_tolerance>1e-15</absolute_tolerance>
                 <relative_tolerance>0</relative_tolerance>
             </vtkdiff>
-            
+
             <vtkdiff>
                 <regex>u.*_ts_[0-9]*_.*\.vtu</regex>
                 <field>displacement</field>
diff --git a/Tests/Data/ThermoRichardsMechanics/MFront/A2/A2.xml b/Tests/Data/ThermoRichardsMechanics/MFront/A2/A2.xml
index b36cd7a69dabbcbc4b9bee987081aa8ae52729c6..9d05717fdb6cbd4100376dcfa0ac4d7fc55655ba 100644
--- a/Tests/Data/ThermoRichardsMechanics/MFront/A2/A2.xml
+++ b/Tests/Data/ThermoRichardsMechanics/MFront/A2/A2.xml
@@ -109,6 +109,6 @@
 
     <!-- Relax the tolerance a bit -->
     <replace sel="/*/test_definition/vtkdiff[field=&quot;NodalForces&quot;]/absolute_tolerance/text()">
-        6e-10
+        7e-10
     </replace>
 </OpenGeoSysProjectDiff>
diff --git a/Tests/Python/test_ogs_asm_threads.py b/Tests/Python/test_ogs_asm_threads.py
new file mode 100644
index 0000000000000000000000000000000000000000..253902d6aca2494332e7afd6ac0229feb144a5fb
--- /dev/null
+++ b/Tests/Python/test_ogs_asm_threads.py
@@ -0,0 +1,88 @@
+import tempfile
+import os
+import platform
+
+import pytest
+import ogs.simulator as sim
+
+
+def run(prjpath, outdir, expect_successful):
+    arguments = ["ogs", prjpath, "-o", outdir]
+
+    try:
+        while True:
+            print("Python OpenGeoSys.init ...")
+            status = sim.initialize(arguments)
+
+            if status != 0:
+                break
+
+            print("Python OpenGeoSys.executeSimulation ...")
+            status = sim.executeSimulation()
+
+            break
+    finally:
+        print("Python OpenGeoSys.finalize() ...")
+        sim.finalize()
+
+    status_expected = 0 if expect_successful else 1
+    assert status == status_expected
+
+
+def check_simulation_results_exist(outdir):
+    assert os.path.exists(
+        os.path.join(outdir, "anisotropic_thermal_expansion_ts_1_t_1000000.000000.vtu")
+    )
+
+
+@pytest.mark.parametrize(
+    "asm_threads_parameter",
+    [
+        # first entry: how to set OGS_ASM_THREADS
+        # second entry: should OGS run successfully?
+        (False, True),  # do not set env var
+        ("1", True),  # positive integer
+        ("1  ", True),  # positive integer with trailing spaces
+        ("   1", True),  # positive integer with leading spaces
+        ("    1   ", True),  # positive integer surrounded by spaces
+        #
+        ("", False),  # set but empty
+        (" ", False),  # blank string
+        ("1.1", False),  # floating point number
+        ("0", False),  # zero
+        ("-1", False),  # minus one (has a special meaning sometimes)
+        ("-13", False),  # another negative number
+        ("4 5", False),  # list of integers
+        ("4x", False),  # integer and garbage
+        ("zxyf", False),  # garbage
+        ("!23", False),  # garbage and integer
+    ],
+)
+def test_ogs_asm_threads_env_var(monkeypatch, asm_threads_parameter):
+    srcdir = os.path.join(os.path.dirname(__file__), "..", "..")
+    prjpath = os.path.join(
+        srcdir,
+        # fast running model with TRM process (OpenMP parallelized)
+        "Tests/Data/ThermoRichardsMechanics/anisotropic_thermal_expansion/aniso_expansion.prj",
+    )
+
+    asm_threads_setting, expect_ogs_success = asm_threads_parameter
+
+    if platform.system() == "Windows" and asm_threads_setting == "":
+        # Empty env var not supported on Windows!
+        return
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # https://docs.pytest.org/en/6.2.x/reference.html#pytest.MonkeyPatch
+        with monkeypatch.context() as ctx:
+            # prepare environment
+            if asm_threads_setting != False:
+                ctx.setenv("OGS_ASM_THREADS", asm_threads_setting)
+
+            ctx.chdir(tmpdirname)
+
+            # run and test
+            run(prjpath, tmpdirname, expect_ogs_success)
+
+            if expect_ogs_success:
+                check_simulation_results_exist(tmpdirname)
diff --git a/scripts/cmake/test/AddTest.cmake b/scripts/cmake/test/AddTest.cmake
index f250004f96ff29b5e013c702a597baf65d4d54f5..b006b4ac41050e8ed86fce846354c62c5963d414 100644
--- a/scripts/cmake/test/AddTest.cmake
+++ b/scripts/cmake/test/AddTest.cmake
@@ -84,8 +84,6 @@ function(AddTest)
 
     set(AddTest_SOURCE_PATH "${Data_SOURCE_DIR}/${AddTest_PATH}")
     set(AddTest_BINARY_PATH "${Data_BINARY_DIR}/${AddTest_PATH}")
-    file(MAKE_DIRECTORY ${AddTest_BINARY_PATH})
-    file(TO_NATIVE_PATH "${AddTest_BINARY_PATH}" AddTest_BINARY_PATH_NATIVE)
     set(AddTest_STDOUT_FILE_PATH
         "${AddTest_BINARY_PATH}/${AddTest_NAME}_stdout.txt"
     )
@@ -108,9 +106,6 @@ function(AddTest)
     endif()
 
     if("${AddTest_EXECUTABLE}" STREQUAL "ogs")
-        set(AddTest_EXECUTABLE_ARGS -o ${AddTest_BINARY_PATH_NATIVE}
-                                    ${AddTest_EXECUTABLE_ARGS}
-        )
         set(AddTest_WORKING_DIRECTORY ${AddTest_SOURCE_PATH})
     endif()
 
@@ -220,6 +215,192 @@ function(AddTest)
         set(SELECTED_DIFF_TOOL_PATH $<TARGET_FILE:xdmfdiff>)
     endif()
 
+    # -----------
+    if(TARGET ${AddTest_EXECUTABLE})
+        set(AddTest_EXECUTABLE_PARSED $<TARGET_FILE:${AddTest_EXECUTABLE}>)
+    else()
+        set(AddTest_EXECUTABLE_PARSED ${AddTest_EXECUTABLE})
+    endif()
+
+    # Run the wrapper
+    if(DEFINED AddTest_WRAPPER)
+        set(AddTest_WRAPPER_STRING "-${AddTest_WRAPPER}")
+    endif()
+    set(TEST_NAME
+        "${AddTest_EXECUTABLE}-${AddTest_NAME}${AddTest_WRAPPER_STRING}"
+    )
+
+    # Process placeholders <PATH> <SOURCE_PATH> and <BUILD_PATH>
+    string(REPLACE "<PATH>" "${AddTest_PATH}" AddTest_WORKING_DIRECTORY
+                   "${AddTest_WORKING_DIRECTORY}"
+    )
+    string(REPLACE "<PATH>" "${AddTest_PATH}" AddTest_EXECUTABLE_ARGS
+                   "${AddTest_EXECUTABLE_ARGS}"
+    )
+
+    string(REPLACE "<SOURCE_PATH>" "${Data_SOURCE_DIR}/${AddTest_PATH}"
+                   AddTest_WORKING_DIRECTORY "${AddTest_WORKING_DIRECTORY}"
+    )
+    string(REPLACE "<SOURCE_PATH>" "${Data_SOURCE_DIR}/${AddTest_PATH}"
+                   AddTest_EXECUTABLE_ARGS "${AddTest_EXECUTABLE_ARGS}"
+    )
+    string(REPLACE "<BUILD_PATH>" "${Data_BINARY_DIR}/${AddTest_PATH}"
+                   AddTest_WORKING_DIRECTORY "${AddTest_WORKING_DIRECTORY}"
+    )
+    string(REPLACE "<BUILD_PATH>" "${Data_BINARY_DIR}/${AddTest_PATH}"
+                   AddTest_EXECUTABLE_ARGS "${AddTest_EXECUTABLE_ARGS}"
+    )
+
+    current_dir_as_list(ProcessLib labels)
+    if(${AddTest_RUNTIME} LESS_EQUAL ${ogs.ctest.large_runtime})
+        list(APPEND labels default)
+    else()
+        list(APPEND labels large)
+    endif()
+
+    _add_test(${TEST_NAME})
+
+    # OpenMP tests for specific processes only. TODO (CL) Once all processes can
+    # be assembled OpenMP parallel, the condition should be removed.
+    if("${labels}" MATCHES "TH2M|ThermoRichards")
+        _add_test(${TEST_NAME}-omp)
+        _set_omp_test_properties()
+    endif()
+
+    if(AddTest_PYTHON_PACKAGES)
+        list(APPEND labels python_modules)
+        if(OGS_USE_PIP)
+            # Info has to be passed by global property because it is not
+            # possible to set cache variables from inside a function.
+            set_property(
+                GLOBAL APPEND PROPERTY AddTest_PYTHON_PACKAGES
+                                       ${AddTest_PYTHON_PACKAGES}
+            )
+        else()
+            message(
+                STATUS
+                    "Warning: Benchmark ${AddTest_NAME} requires these "
+                    "Python packages: ${AddTest_PYTHON_PACKAGES}!\n Make sure to "
+                    "have them installed in your current Python environment OR "
+                    "set OGS_USE_PIP=ON!"
+            )
+        endif()
+    endif()
+
+    add_dependencies(ctest ${AddTest_EXECUTABLE})
+    add_dependencies(ctest-large ${AddTest_EXECUTABLE})
+
+    if(OGS_WRITE_BENCHMARK_COMMANDS)
+        string(
+            REPLACE
+                ";"
+                " "
+                _cmd
+                "${AddTest_WRAPPER} ${AddTest_WRAPPER_ARGS} ${AddTest_EXECUTABLE} ${AddTest_EXECUTABLE_ARGS}"
+        )
+        string(REPLACE "${Data_BINARY_DIR}/" "" _cmd "${_cmd}")
+        string(REPLACE "-o" "" _cmd "${_cmd}")
+        # string(STRIP "${_cmd}" _cmd)
+        set(_benchmark_run_commands
+            "${_benchmark_run_commands} ${AddTest_EXECUTABLE} ; ${_cmd} ; ${TEST_NAME}\n"
+            CACHE INTERNAL ""
+        )
+    endif()
+
+    if(NOT AddTest_TESTER OR OGS_COVERAGE)
+        return()
+    endif()
+
+    # Run the tester
+    _add_test_tester(${TEST_NAME})
+    if("${labels}" MATCHES "TH2M|ThermoRichards")
+        _add_test_tester(${TEST_NAME}-omp)
+    endif()
+
+endfunction()
+
+# Add a ctest and sets properties
+macro(_add_test TEST_NAME)
+
+    set(_binary_path ${AddTest_BINARY_PATH})
+    if("${TEST_NAME}" MATCHES "-omp")
+        set(_binary_path ${_binary_path}-omp)
+    endif()
+
+    file(TO_NATIVE_PATH "${_binary_path}" AddTest_BINARY_PATH_NATIVE)
+    file(MAKE_DIRECTORY ${_binary_path})
+    set(_exe_args ${AddTest_EXECUTABLE_ARGS})
+    if("${AddTest_EXECUTABLE}" STREQUAL "ogs")
+        set(_exe_args -o ${AddTest_BINARY_PATH_NATIVE}
+                      ${AddTest_EXECUTABLE_ARGS}
+        )
+    endif()
+
+    add_test(
+        NAME ${TEST_NAME}
+        COMMAND
+            ${CMAKE_COMMAND} -DEXECUTABLE=${AddTest_EXECUTABLE_PARSED}
+            "-DEXECUTABLE_ARGS=${_exe_args}" # Quoted because
+            # passed as list see https://stackoverflow.com/a/33248574/80480
+            -DBINARY_PATH=${_binary_path} -DWRAPPER_COMMAND=${WRAPPER_COMMAND}
+            "-DWRAPPER_ARGS=${AddTest_WRAPPER_ARGS}"
+            "-DFILES_TO_DELETE=${FILES_TO_DELETE}"
+            -DWORKING_DIRECTORY=${AddTest_WORKING_DIRECTORY}
+            -DLOG_FILE=${PROJECT_BINARY_DIR}/logs/${TEST_NAME}.txt -P
+            ${PROJECT_SOURCE_DIR}/scripts/cmake/test/AddTestWrapper.cmake
+    )
+
+    if(DEFINED AddTest_DEPENDS)
+        if(NOT (TEST ${AddTest_DEPENDS} OR TARGET ${AddTest_DEPENDS}))
+            message(
+                FATAL_ERROR
+                    "AddTest ${TEST_NAME}: dependency ${AddTest_DEPENDS} does not exist!"
+            )
+        endif()
+        set_tests_properties(${TEST_NAME} PROPERTIES DEPENDS ${AddTest_DEPENDS})
+    endif()
+    if(DEFINED MPI_PROCESSORS)
+        set_tests_properties(
+            ${TEST_NAME} PROPERTIES PROCESSORS ${MPI_PROCESSORS}
+        )
+    endif()
+
+    set_tests_properties(
+        ${TEST_NAME}
+        PROPERTIES ${AddTest_PROPERTIES}
+                   COST
+                   ${AddTest_RUNTIME}
+                   DISABLED
+                   ${AddTest_DISABLED}
+                   LABELS
+                   "${labels}"
+    )
+endmacro()
+
+# Sets number of threads, adds label 'omp' and adds a dependency to the
+# non-OpenMP test because both write to the same output files.
+macro(_set_omp_test_properties)
+    get_test_property(${TEST_NAME}-omp ENVIRONMENT _environment)
+    if(NOT _environment)
+        set(_environment "")
+    endif()
+    set_tests_properties(
+        ${TEST_NAME}-omp
+        PROPERTIES ENVIRONMENT "OGS_ASM_THREADS=4;${_environment}" PROCESSORS 4
+                   LABELS "${labels};omp"
+    )
+endmacro()
+
+# Adds subsequent tester ctest.
+macro(_add_test_tester TEST_NAME)
+
+    set(_binary_path ${AddTest_BINARY_PATH})
+    if("${TEST_NAME}" MATCHES "-omp")
+        set(_binary_path ${_binary_path}-omp)
+    endif()
+
+    set(TESTER_NAME "${TEST_NAME}-${AddTest_TESTER}")
+
     if(AddTest_TESTER STREQUAL "diff")
         foreach(FILE ${AddTest_DIFF_DATA})
             get_filename_component(FILE_EXPECTED ${FILE} NAME)
@@ -228,7 +409,7 @@ function(AddTest)
                 TESTER_COMMAND
                 "${SELECTED_DIFF_TOOL_PATH} \
                 ${TESTER_ARGS} ${AddTest_TESTER_ARGS} ${AddTest_SOURCE_PATH}/${FILE_EXPECTED} \
-                ${AddTest_BINARY_PATH}/${FILE}"
+                ${_binary_path}/${FILE}"
             )
             list(APPEND FILES_TO_DELETE "${FILE}")
 
@@ -267,7 +448,7 @@ Use six arguments version of AddTest with absolute and relative tolerances"
                     TESTER_COMMAND
                     "${SELECTED_DIFF_TOOL_PATH} \
                 ${AddTest_SOURCE_PATH}/${REFERENCE_VTK_FILE} \
-                ${AddTest_BINARY_PATH}/${VTK_FILE} \
+                ${_binary_path}/${VTK_FILE} \
                 -a ${NAME_A} -b ${NAME_B} \
                 ${TESTER_ARGS} ${AddTest_TESTER_ARGS}"
                 )
@@ -307,7 +488,7 @@ Use six arguments version of AddTest with absolute and relative tolerances"
                         TESTER_COMMAND
                         "${SELECTED_DIFF_TOOL_PATH} \
                     ${AddTest_SOURCE_PATH}/${REFERENCE_VTK_FILE} \
-                    ${AddTest_BINARY_PATH}/${VTK_FILE} \
+                    ${_binary_path}/${VTK_FILE} \
                     -a ${NAME_A} -b ${NAME_B} \
                     --abs ${ABS_TOL} --rel ${REL_TOL} \
                     ${TESTER_ARGS} ${AddTest_TESTER_ARGS}"
@@ -340,7 +521,7 @@ Use six arguments version of AddTest with absolute and relative tolerances"
                     TESTER_COMMAND
                     "${SELECTED_DIFF_TOOL_PATH} -m \
                 ${AddTest_SOURCE_PATH}/${REFERENCE_VTK_FILE} \
-                ${AddTest_BINARY_PATH}/${VTK_FILE} \
+                ${_binary_path}/${VTK_FILE} \
                 --abs ${ABS_TOLERANCE}"
                 )
             endforeach()
@@ -376,7 +557,7 @@ Use six arguments version of AddTest with absolute and relative tolerances"
                 --abs ${ABS_TOL} --rel ${REL_TOL} \
                 ${TESTER_ARGS} ${AddTest_TESTER_ARGS}\
                 ${AddTest_SOURCE_PATH}/${FILE_EXPECTED} \
-                ${AddTest_BINARY_PATH}/${GML_FILE}"
+                ${_binary_path}/${GML_FILE}"
             )
             list(APPEND FILES_TO_DELETE "${GML_FILE}")
         endforeach()
@@ -386,150 +567,13 @@ Use six arguments version of AddTest with absolute and relative tolerances"
         )
     endif()
 
-    # -----------
-    if(TARGET ${AddTest_EXECUTABLE})
-        set(AddTest_EXECUTABLE_PARSED $<TARGET_FILE:${AddTest_EXECUTABLE}>)
-    else()
-        set(AddTest_EXECUTABLE_PARSED ${AddTest_EXECUTABLE})
-    endif()
-
-    # Run the wrapper
-    if(DEFINED AddTest_WRAPPER)
-        set(AddTest_WRAPPER_STRING "-${AddTest_WRAPPER}")
-    endif()
-    set(TEST_NAME
-        "${AddTest_EXECUTABLE}-${AddTest_NAME}${AddTest_WRAPPER_STRING}"
-    )
-
-    # Process placeholders <PATH> <SOURCE_PATH> and <BUILD_PATH>
-    string(REPLACE "<PATH>" "${AddTest_PATH}" AddTest_WORKING_DIRECTORY
-                   "${AddTest_WORKING_DIRECTORY}"
-    )
-    string(REPLACE "<PATH>" "${AddTest_PATH}" AddTest_EXECUTABLE_ARGS
-                   "${AddTest_EXECUTABLE_ARGS}"
-    )
-
-    string(REPLACE "<SOURCE_PATH>" "${Data_SOURCE_DIR}/${AddTest_PATH}"
-                   AddTest_WORKING_DIRECTORY "${AddTest_WORKING_DIRECTORY}"
-    )
-    string(REPLACE "<SOURCE_PATH>" "${Data_SOURCE_DIR}/${AddTest_PATH}"
-                   AddTest_EXECUTABLE_ARGS "${AddTest_EXECUTABLE_ARGS}"
-    )
-    string(REPLACE "<BUILD_PATH>" "${Data_BINARY_DIR}/${AddTest_PATH}"
-                   AddTest_WORKING_DIRECTORY "${AddTest_WORKING_DIRECTORY}"
-    )
-    string(REPLACE "<BUILD_PATH>" "${Data_BINARY_DIR}/${AddTest_PATH}"
-                   AddTest_EXECUTABLE_ARGS "${AddTest_EXECUTABLE_ARGS}"
-    )
-
-    add_test(
-        NAME ${TEST_NAME}
-        COMMAND
-            ${CMAKE_COMMAND} -DEXECUTABLE=${AddTest_EXECUTABLE_PARSED}
-            "-DEXECUTABLE_ARGS=${AddTest_EXECUTABLE_ARGS}" # Quoted because
-                                                           # passed as list see
-                                                           # https://stackoverflow.com/a/33248574/80480
-            -DBINARY_PATH=${AddTest_BINARY_PATH}
-            -DWRAPPER_COMMAND=${WRAPPER_COMMAND}
-            "-DWRAPPER_ARGS=${AddTest_WRAPPER_ARGS}"
-            "-DFILES_TO_DELETE=${FILES_TO_DELETE}"
-            -DWORKING_DIRECTORY=${AddTest_WORKING_DIRECTORY}
-            -DLOG_FILE=${PROJECT_BINARY_DIR}/logs/${TEST_NAME}.txt -P
-            ${PROJECT_SOURCE_DIR}/scripts/cmake/test/AddTestWrapper.cmake
-    )
-    if(DEFINED AddTest_DEPENDS)
-        if(NOT (TEST ${AddTest_DEPENDS} OR TARGET ${AddTest_DEPENDS}))
-            message(
-                FATAL_ERROR
-                    "AddTest ${TEST_NAME}: dependency ${AddTest_DEPENDS} does not exist!"
-            )
-        endif()
-        set_tests_properties(${TEST_NAME} PROPERTIES DEPENDS ${AddTest_DEPENDS})
-    endif()
-    if(DEFINED MPI_PROCESSORS)
-        set_tests_properties(
-            ${TEST_NAME} PROPERTIES PROCESSORS ${MPI_PROCESSORS}
-        )
-    endif()
-
-    current_dir_as_list(ProcessLib labels)
-    if(${AddTest_RUNTIME} LESS_EQUAL ${ogs.ctest.large_runtime})
-        list(APPEND labels default)
-    else()
-        list(APPEND labels large)
-    endif()
-
-    if(AddTest_PYTHON_PACKAGES)
-        list(APPEND labels python_modules)
-        if(OGS_USE_PIP)
-            # Info has to be passed by global property because it is not
-            # possible to set cache variables from inside a function.
-            set_property(
-                GLOBAL APPEND PROPERTY AddTest_PYTHON_PACKAGES
-                                       ${AddTest_PYTHON_PACKAGES}
-            )
-        else()
-            message(
-                STATUS
-                    "Warning: Benchmark ${AddTest_NAME} requires these "
-                    "Python packages: ${AddTest_PYTHON_PACKAGES}!\n Make sure to "
-                    "have them installed in your current Python environment OR "
-                    "set OGS_USE_PIP=ON!"
-            )
-        endif()
-    endif()
-
-    set_tests_properties(
-        ${TEST_NAME}
-        PROPERTIES ${AddTest_PROPERTIES}
-                   COST
-                   ${AddTest_RUNTIME}
-                   DISABLED
-                   ${AddTest_DISABLED}
-                   LABELS
-                   "${labels}"
-    )
-    # Disabled for the moment, does not work with CI under load
-    # ~~~
-    # if(NOT OGS_COVERAGE)
-    #     set_tests_properties(${TEST_NAME} PROPERTIES TIMEOUT ${timeout})
-    # endif()
-    # ~~~
-
-    add_dependencies(ctest ${AddTest_EXECUTABLE})
-    add_dependencies(ctest-large ${AddTest_EXECUTABLE})
-
-    if(OGS_WRITE_BENCHMARK_COMMANDS)
-        string(
-            REPLACE
-                ";"
-                " "
-                _cmd
-                "${AddTest_WRAPPER} ${AddTest_WRAPPER_ARGS} ${AddTest_EXECUTABLE} ${AddTest_EXECUTABLE_ARGS}"
-        )
-        string(REPLACE "${Data_BINARY_DIR}/" "" _cmd "${_cmd}")
-        string(REPLACE "-o" "" _cmd "${_cmd}")
-        # string(STRIP "${_cmd}" _cmd)
-        set(_benchmark_run_commands
-            "${_benchmark_run_commands} ${AddTest_EXECUTABLE} ; ${_cmd} ; ${TEST_NAME}\n"
-            CACHE INTERNAL ""
-        )
-    endif()
-
-    if(NOT AddTest_TESTER OR OGS_COVERAGE)
-        return()
-    endif()
-
-    # Run the tester
-    set(TESTER_NAME "${TEST_NAME}-${AddTest_TESTER}")
     add_test(
         NAME ${TESTER_NAME}
         COMMAND
             ${CMAKE_COMMAND} -DSOURCE_PATH=${AddTest_SOURCE_PATH}
-            -DBINARY_PATH=${${AddTest_BINARY_PATH}}
             -DSELECTED_DIFF_TOOL_PATH=${SELECTED_DIFF_TOOL_PATH}
-            "-DTESTER_COMMAND=${TESTER_COMMAND}"
-            -DBINARY_PATH=${AddTest_BINARY_PATH} -DGLOB_MODE=${GLOB_MODE}
+            "-DTESTER_COMMAND=${TESTER_COMMAND}" -DBINARY_PATH=${_binary_path}
+            -DGLOB_MODE=${GLOB_MODE}
             -DLOG_FILE_BASE=${PROJECT_BINARY_DIR}/logs/${TESTER_NAME} -P
             ${PROJECT_SOURCE_DIR}/scripts/cmake/test/AddTestTester.cmake
             --debug-output
@@ -539,5 +583,4 @@ Use six arguments version of AddTest with absolute and relative tolerances"
         ${TESTER_NAME} PROPERTIES DEPENDS ${TEST_NAME} DISABLED
                                   ${AddTest_DISABLED} LABELS "tester;${labels}"
     )
-
-endfunction()
+endmacro()
diff --git a/scripts/cmake/test/NotebookTest.cmake b/scripts/cmake/test/NotebookTest.cmake
index e53cc792358d80a08120cb43861ac004ea9f78a6..f19a3e16361e745f875decea34539f47b108a4f4 100644
--- a/scripts/cmake/test/NotebookTest.cmake
+++ b/scripts/cmake/test/NotebookTest.cmake
@@ -86,8 +86,6 @@ function(NotebookTest)
          ${NotebookTest_PROPERTIES}
     )
 
-    message(STATUS "Labels of ${TEST_NAME}:\n      ${labels}")
-
     set_tests_properties(
         ${TEST_NAME}
         PROPERTIES ${_props}
diff --git a/scripts/cmake/test/OgsTest.cmake b/scripts/cmake/test/OgsTest.cmake
index b89ac1162edceab89234c6419be8aa01cc13319d..a5de9de7e0e0e80f2f3984c1f408bb2028068210 100644
--- a/scripts/cmake/test/OgsTest.cmake
+++ b/scripts/cmake/test/OgsTest.cmake
@@ -54,10 +54,6 @@ function(OgsTest)
     endif()
 
     set(OgsTest_SOURCE_DIR "${Data_SOURCE_DIR}/${OgsTest_DIR}")
-    set(OgsTest_BINARY_DIR "${Data_BINARY_DIR}/${OgsTest_DIR}")
-    file(MAKE_DIRECTORY ${OgsTest_BINARY_DIR})
-    file(TO_NATIVE_PATH "${OgsTest_BINARY_DIR}" OgsTest_BINARY_DIR_NATIVE)
-
     set(TEST_NAME "ogs-${OgsTest_DIR}/${OgsTest_NAME_WE}")
     # Add wrapper postfix (-mpi for mpirun).
     if(OgsTest_WRAPPER)
@@ -70,8 +66,34 @@ function(OgsTest)
     set(_exe_args -r ${OgsTest_SOURCE_DIR}
                   ${OgsTest_SOURCE_DIR}/${OgsTest_NAME}
     )
-    string(REPLACE "/" "_" TEST_NAME_UNDERSCORE ${TEST_NAME})
 
+    current_dir_as_list(ProcessLib labels)
+    if(${OgsTest_RUNTIME} LESS_EQUAL ${ogs.ctest.large_runtime})
+        list(APPEND labels default)
+    else()
+        list(APPEND labels large)
+    endif()
+
+    _ogs_add_test(${TEST_NAME})
+
+    # OpenMP tests for specific processes only. TODO (CL) Once all processes can
+    # be assembled OpenMP parallel, the condition should be removed.
+    if("${labels}" MATCHES "TH2M|ThermoRichards")
+        _ogs_add_test(${TEST_NAME}-omp)
+        _set_omp_test_properties()
+    endif()
+endfunction()
+
+# Add a ctest and sets properties
+macro(_ogs_add_test TEST_NAME)
+    if("${TEST_NAME}" MATCHES "-omp")
+        set(OgsTest_BINARY_DIR "${Data_BINARY_DIR}/${OgsTest_DIR}-omp")
+    else()
+        set(OgsTest_BINARY_DIR "${Data_BINARY_DIR}/${OgsTest_DIR}")
+    endif()
+    file(MAKE_DIRECTORY ${OgsTest_BINARY_DIR})
+    file(TO_NATIVE_PATH "${OgsTest_BINARY_DIR}" OgsTest_BINARY_DIR_NATIVE)
+    string(REPLACE "/" "_" TEST_NAME_UNDERSCORE ${TEST_NAME})
     add_test(
         NAME ${TEST_NAME}
         COMMAND
@@ -83,18 +105,6 @@ function(OgsTest)
             -P ${PROJECT_SOURCE_DIR}/scripts/cmake/test/OgsTestWrapper.cmake
     )
 
-    # For debugging: message("Adding test with NAME ${TEST_NAME}
-    # WORKING_DIRECTORY ${OgsTest_BINARY_DIR} COMMAND ${OgsTest_WRAPPER}
-    # $<TARGET_FILE:ogs> -r ${OgsTest_SOURCE_DIR}
-    # ${OgsTest_SOURCE_DIR}/${OgsTest_NAME})
-
-    current_dir_as_list(ProcessLib labels)
-    if(${OgsTest_RUNTIME} LESS_EQUAL ${ogs.ctest.large_runtime})
-        list(APPEND labels default)
-    else()
-        list(APPEND labels large)
-    endif()
-
     set_tests_properties(
         ${TEST_NAME}
         PROPERTIES ${OgsTest_PROPERTIES}
@@ -107,7 +117,6 @@ function(OgsTest)
                    LABELS
                    "${labels}"
     )
-    # Disabled for the moment, does not work with CI under load if(NOT
-    # OGS_COVERAGE) set_tests_properties(${TEST_NAME} PROPERTIES TIMEOUT
-    # ${timeout}) endif()
-endfunction()
+endmacro()
+
+# macro(_set_omp_test_properties) defined in AddTest.cmake