diff --git a/BaseLib/ThreadException.h b/BaseLib/ThreadException.h
index a259d9a051a5ab10adfad88cb5577ef8091ee388..0bc221f8ed333cf016a3850f3552d582c1494dd6 100644
--- a/BaseLib/ThreadException.h
+++ b/BaseLib/ThreadException.h
@@ -14,6 +14,11 @@
 
 /// Initial code from
 /// https://stackoverflow.com/questions/11828539/elegant-exception-handling-in-openmp
+///
+/// The implementation does not guarantee, that while the exception is being
+/// checked for nullptr in the bool-conversion operator, the capture is not
+/// changing the pointer. This might lead to an exception being lost if there
+/// are two exceptions thrown simultaneously.
 class ThreadException
 {
 public:
@@ -31,6 +36,8 @@ public:
         }
     }
 
+    explicit operator bool() const noexcept { return exception_ != nullptr; }
+
 private:
     std::exception_ptr exception_ = nullptr;
     std::mutex lock_;
diff --git a/ProcessLib/Assembly/MatrixOutput.cpp b/ProcessLib/Assembly/MatrixOutput.cpp
index 56ed505c69fba0a82792b42759721834c3a0f89c..5aabd69dc9fbcd58d0f91cf610be0812063b88e2 100644
--- a/ProcessLib/Assembly/MatrixOutput.cpp
+++ b/ProcessLib/Assembly/MatrixOutput.cpp
@@ -175,10 +175,9 @@ GlobalMatrixOutput::GlobalMatrixOutput()
 }
 
 void GlobalMatrixOutput::operator()(double const t, int const process_id,
-                                    GlobalMatrix const* M,
-                                    GlobalMatrix const* K,
-                                    GlobalVector const& b,
-                                    GlobalMatrix const* const Jac)
+                                    GlobalMatrix const& M,
+                                    GlobalMatrix const& K,
+                                    GlobalVector const& b)
 {
     if (!do_output_)
     {
@@ -188,22 +187,20 @@ void GlobalMatrixOutput::operator()(double const t, int const process_id,
 #ifndef USE_PETSC
     ++counter_;
 
-    if (M)
     {
         auto fh = openGlobalMatrixOutputFile(filenamePrefix_, counter_, t,
                                              process_id, "M", "mat");
 
         fh << "M ";
-        outputGlobalMatrix(*M, fh);
+        outputGlobalMatrix(M, fh);
     }
 
-    if (K)
     {
         auto fh = openGlobalMatrixOutputFile(filenamePrefix_, counter_, t,
                                              process_id, "K", "mat");
 
         fh << "K ";
-        outputGlobalMatrix(*K, fh);
+        outputGlobalMatrix(K, fh);
     }
 
     {
@@ -213,21 +210,47 @@ void GlobalMatrixOutput::operator()(double const t, int const process_id,
         fh << "b ";
         outputGlobalVector(b, fh);
     }
+#else
+    // do nothing, warning message already printed in the constructor
+    (void)t;
+    (void)process_id;
+    (void)M;
+    (void)K;
+    (void)b;
+#endif
+}
+
+void GlobalMatrixOutput::operator()(double const t, int const process_id,
+                                    GlobalVector const& b,
+                                    GlobalMatrix const& Jac)
+{
+    if (!do_output_)
+    {
+        return;
+    }
+
+#ifndef USE_PETSC
+    ++counter_;
+
+    {
+        auto fh = openGlobalMatrixOutputFile(filenamePrefix_, counter_, t,
+                                             process_id, "b", "vec");
+
+        fh << "b ";
+        outputGlobalVector(b, fh);
+    }
 
-    if (Jac)
     {
         auto fh = openGlobalMatrixOutputFile(filenamePrefix_, counter_, t,
                                              process_id, "Jac", "mat");
 
         fh << "Jac ";
-        outputGlobalMatrix(*Jac, fh);
+        outputGlobalMatrix(Jac, fh);
     }
 #else
     // do nothing, warning message already printed in the constructor
     (void)t;
     (void)process_id;
-    (void)M;
-    (void)K;
     (void)b;
     (void)Jac;
 #endif
diff --git a/ProcessLib/Assembly/MatrixOutput.h b/ProcessLib/Assembly/MatrixOutput.h
index 2b3d285f7b116a17bbd6afe91d0418669ff3a368..855dbf33475421de2c5b8011978693b4d4da80f0 100644
--- a/ProcessLib/Assembly/MatrixOutput.h
+++ b/ProcessLib/Assembly/MatrixOutput.h
@@ -24,9 +24,11 @@ struct GlobalMatrixOutput
 {
     GlobalMatrixOutput();
 
-    void operator()(double const t, int const process_id, GlobalMatrix const* M,
-                    GlobalMatrix const* K, GlobalVector const& b,
-                    GlobalMatrix const* const Jac = nullptr);
+    void operator()(double const t, int const process_id, GlobalMatrix const& M,
+                    GlobalMatrix const& K, GlobalVector const& b);
+
+    void operator()(double const t, int const process_id, GlobalVector const& b,
+                    GlobalMatrix const& Jac);
 
 private:
     std::string filenamePrefix_;
diff --git a/ProcessLib/Assembly/ParallelVectorMatrixAssembler.cpp b/ProcessLib/Assembly/ParallelVectorMatrixAssembler.cpp
index e799f5153ce608df89a744a6d3b2a99a4d252cd1..a78a8e7aa6cfa182b65ee9c7e7fea3c335047c65 100644
--- a/ProcessLib/Assembly/ParallelVectorMatrixAssembler.cpp
+++ b/ProcessLib/Assembly/ParallelVectorMatrixAssembler.cpp
@@ -12,6 +12,8 @@
 
 #include <cstdlib>
 #include <fstream>
+#include <range/v3/view/iota.hpp>
+#include <vector>
 
 #include "BaseLib/StringTools.h"
 #include "BaseLib/ThreadException.h"
@@ -19,6 +21,7 @@
 #include "ProcessLib/Assembly/MatrixAssemblyStats.h"
 #include "ProcessLib/Assembly/MatrixElementCache.h"
 #include "ProcessLib/Assembly/MatrixOutput.h"
+#include "ProcessLib/CoupledSolutionsForStaggeredScheme.h"
 
 namespace
 {
@@ -28,11 +31,11 @@ void assembleWithJacobianOneElement(
     const NumLib::LocalToGlobalIndexMap& dof_table, const double t,
     const double dt, const GlobalVector& x, const GlobalVector& x_prev,
     std::vector<double>& local_b_data, std::vector<double>& local_Jac_data,
-    std::vector<GlobalIndexType>& indices,
     ProcessLib::AbstractJacobianAssembler& jacobian_assembler,
     ProcessLib::Assembly::MultiMatrixElementCache& cache)
 {
-    indices = NumLib::getIndices(mesh_item_id, dof_table);
+    std::vector<GlobalIndexType> const& indices =
+        NumLib::getIndices(mesh_item_id, dof_table);
 
     local_b_data.clear();
     local_Jac_data.clear();
@@ -54,6 +57,118 @@ void assembleWithJacobianOneElement(
     cache.add(local_b_data, local_Jac_data, indices);
 }
 
+void assembleWithJacobianForStaggeredSchemeOneElement(
+    const std::size_t mesh_item_id,
+    ProcessLib::LocalAssemblerInterface& local_assembler,
+    std::vector<NumLib::LocalToGlobalIndexMap const*> const& dof_tables,
+    const double t, const double dt, std::vector<GlobalVector*> const& x,
+    std::vector<GlobalVector*> const& x_prev, int const process_id,
+    std::vector<double>& local_b_data, std::vector<double>& local_Jac_data,
+    ProcessLib::AbstractJacobianAssembler& jacobian_assembler,
+    ProcessLib::Assembly::MultiMatrixElementCache& cache)
+{
+    std::vector<std::vector<GlobalIndexType>> indices_of_processes;
+    indices_of_processes.reserve(dof_tables.size());
+    transform(cbegin(dof_tables), cend(dof_tables),
+              back_inserter(indices_of_processes),
+              [&](auto const* dof_table)
+              { return NumLib::getIndices(mesh_item_id, *dof_table); });
+
+    auto local_coupled_xs =
+        ProcessLib::getCoupledLocalSolutions(x, indices_of_processes);
+    auto const local_x = MathLib::toVector(local_coupled_xs);
+
+    auto local_coupled_x_prevs =
+        ProcessLib::getCoupledLocalSolutions(x_prev, indices_of_processes);
+    auto const local_x_prev = MathLib::toVector(local_coupled_x_prevs);
+
+    std::vector<GlobalIndexType> const& indices =
+        indices_of_processes[process_id];
+
+    local_b_data.clear();
+    local_Jac_data.clear();
+
+    jacobian_assembler.assembleWithJacobianForStaggeredScheme(
+        local_assembler, t, dt, local_x, local_x_prev, process_id, local_b_data,
+        local_Jac_data);
+
+    if (local_Jac_data.empty())
+    {
+        OGS_FATAL(
+            "No Jacobian has been assembled! This might be due to "
+            "programming errors in the local assembler of the "
+            "current process.");
+    }
+
+    cache.add(local_b_data, local_Jac_data, indices);
+}
+
+/// Returns a vector of active element ids with corresponding local assemblers.
+std::vector<
+    std::tuple<std::ptrdiff_t,
+               std::reference_wrapper<ProcessLib::LocalAssemblerInterface>>>
+collectActiveLocalAssemblers(
+    BaseLib::PolymorphicRandomAccessContainerView<
+        ProcessLib::LocalAssemblerInterface> const& local_assemblers,
+    std::vector<std::size_t> const& active_elements)
+{
+    auto create_ids_asm_pairs = [&](auto const& element_ids)
+    {
+        std::vector<std::tuple<
+            std::ptrdiff_t,
+            std::reference_wrapper<ProcessLib::LocalAssemblerInterface>>>
+            result;
+        result.reserve(static_cast<std::size_t>(element_ids.size()));
+        for (auto const id : element_ids)
+        {
+            result.push_back({id, local_assemblers[id]});
+        }
+        return result;
+    };
+
+    if (active_elements.empty())
+    {
+        return create_ids_asm_pairs(ranges::views::iota(
+            static_cast<std::size_t>(0), local_assemblers.size()));
+    }
+    return create_ids_asm_pairs(active_elements);
+}
+
+void runAssembly(
+    std::vector<std::tuple<
+        std::ptrdiff_t,
+        std::reference_wrapper<ProcessLib::LocalAssemblerInterface>>> const&
+        ids_local_assemblers,
+    ThreadException& exception,
+    auto local_matrix_output,
+    auto assemble)
+{
+    std::ptrdiff_t n_elements =
+        static_cast<std::ptrdiff_t>(ids_local_assemblers.size());
+
+#pragma omp for nowait
+    for (std::ptrdiff_t i = 0; i < n_elements; ++i)
+    {
+        if (exception)
+        {
+            continue;
+        }
+        auto [element_id, loc_asm] = ids_local_assemblers[i];
+
+        try
+        {
+            assemble(element_id, loc_asm);
+        }
+        catch (...)
+        {
+            exception.capture();
+            continue;
+        }
+
+        local_matrix_output(element_id);
+    }
+}
+
 int getNumberOfThreads()
 {
     char const* const num_threads_env = std::getenv("OGS_ASM_THREADS");
@@ -120,29 +235,12 @@ void ParallelVectorMatrixAssembler::assembleWithJacobian(
     GlobalVector& b, GlobalMatrix& Jac)
 {
     // checks //////////////////////////////////////////////////////////////////
-    if (process_id != 0)
-    {
-        OGS_FATAL("Process id is not 0 but {}", process_id);
-    }
-
-    if (dof_tables.size() != 1)
-    {
-        OGS_FATAL("More than 1 dof table");
-    }
-    auto const& dof_table = *(dof_tables.front());
-
-    if (xs.size() != 1)
+    if (dof_tables.size() != xs.size())
     {
-        OGS_FATAL("More than 1 solution vector");
+        OGS_FATAL("Different number of DOF tables and solution vectors.");
     }
-    auto const& x = *xs.front();
-
-    if (x_prevs.size() != 1)
-    {
-        OGS_FATAL("More than 1 x_prev vector");
-    }
-    auto const& x_prev = *x_prevs.front();
 
+    std::size_t const number_of_processes = xs.size();
     // algorithm ///////////////////////////////////////////////////////////////
 
     auto stats = CumulativeStats<MultiStats>::create();
@@ -150,7 +248,6 @@ void ParallelVectorMatrixAssembler::assembleWithJacobian(
     ConcurrentMatrixView Jac_view(Jac);
 
     ThreadException exception;
-    bool assembly_error = false;
 #pragma omp parallel num_threads(num_threads_)
     {
 #ifdef _OPENMP
@@ -164,7 +261,6 @@ void ParallelVectorMatrixAssembler::assembleWithJacobian(
         // reallocations.
         std::vector<double> local_b_data;
         std::vector<double> local_Jac_data;
-        std::vector<GlobalIndexType> indices;
 
         // copy to avoid concurrent access
         auto const jac_asm = jacobian_assembler_.copy();
@@ -173,83 +269,50 @@ void ParallelVectorMatrixAssembler::assembleWithJacobian(
         MultiMatrixElementCache cache{b_view, Jac_view,
                                       stats_this_thread->data};
 
-        // TODO corner case: what if all elements on a submesh are deactivated?
-        if (active_elements.empty())
+        auto local_matrix_output = [&](std::ptrdiff_t element_id)
         {
-            // due to MSVC++ error:
-            // error C3016: 'element_id': index variable in OpenMP 'for'
-            // statement must have signed integral type
-            std::ptrdiff_t const n_loc_asm =
-                static_cast<std::ptrdiff_t>(local_assemblers.size());
+            local_matrix_output_(t, process_id, element_id, local_b_data,
+                                 local_Jac_data);
+        };
 
-#pragma omp for nowait
-            for (std::ptrdiff_t element_id = 0; element_id < n_loc_asm;
-                 ++element_id)
-            {
-                if (assembly_error)
-                {
-                    continue;
-                }
-                auto& loc_asm = local_assemblers[element_id];
+        // TODO corner case: what if all elements on a submesh are deactivated?
 
-                try
+        // Monolithic scheme
+        if (number_of_processes == 1)
+        {
+            assert(process_id == 0);
+            auto const& dof_table = *dof_tables[0];
+            auto const& x = *xs[0];
+            auto const& x_prev = *x_prevs[0];
+
+            runAssembly(
+                collectActiveLocalAssemblers(local_assemblers, active_elements),
+                exception, local_matrix_output,
+                [&](auto element_id, auto& loc_asm)
                 {
                     assembleWithJacobianOneElement(
                         element_id, loc_asm, dof_table, t, dt, x, x_prev,
-                        local_b_data, local_Jac_data, indices, *jac_asm, cache);
-                }
-                catch (...)
-                {
-                    exception.capture();
-                    assembly_error = true;
-                    continue;
-                }
-
-                local_matrix_output_(t, process_id, element_id, local_b_data,
-                                     local_Jac_data);
-            }
+                        local_b_data, local_Jac_data, *jac_asm, cache);
+                });
         }
-        else
+        else  // Staggered scheme
         {
-            // due to MSVC++ error:
-            // error C3016: 'i': index variable in OpenMP 'for' statement must
-            // have signed integral type
-            std::ptrdiff_t const n_act_elem =
-                static_cast<std::ptrdiff_t>(active_elements.size());
-
-#pragma omp for nowait
-            for (std::ptrdiff_t i = 0; i < n_act_elem; ++i)
-            {
-                if (assembly_error)
-                {
-                    continue;
-                }
-
-                auto const element_id = active_elements[i];
-                auto& loc_asm = local_assemblers[element_id];
-
-                try
+            runAssembly(
+                collectActiveLocalAssemblers(local_assemblers, active_elements),
+                exception, local_matrix_output,
+                [&](auto element_id, auto& loc_asm)
                 {
-                    assembleWithJacobianOneElement(
-                        element_id, loc_asm, dof_table, t, dt, x, x_prev,
-                        local_b_data, local_Jac_data, indices, *jac_asm, cache);
-                }
-                catch (...)
-                {
-                    exception.capture();
-                    assembly_error = true;
-                    continue;
-                }
-
-                local_matrix_output_(t, process_id, element_id, local_b_data,
-                                     local_Jac_data);
-            }
+                    assembleWithJacobianForStaggeredSchemeOneElement(
+                        element_id, loc_asm, dof_tables, t, dt, xs, x_prevs,
+                        process_id, local_b_data, local_Jac_data, *jac_asm,
+                        cache);
+                });
         }
-    }  // OpenMP parallel section
+    }
 
     stats->print();
 
-    global_matrix_output_(t, process_id, nullptr, nullptr, b, &Jac);
+    global_matrix_output_(t, process_id, b, Jac);
     exception.rethrow();
 }
 }  // namespace ProcessLib::Assembly
diff --git a/ProcessLib/AssemblyMixin.cpp b/ProcessLib/AssemblyMixin.cpp
index 2b955a2364459b3e3e100eacb14fdb2a77d4232d..bca2c4a68599056432ed315d1cb17331d1c2f0e7 100644
--- a/ProcessLib/AssemblyMixin.cpp
+++ b/ProcessLib/AssemblyMixin.cpp
@@ -10,6 +10,10 @@
 
 #include "AssemblyMixin.h"
 
+#include <range/v3/view/enumerate.hpp>
+#include <range/v3/view/for_each.hpp>
+#include <range/v3/view/transform.hpp>
+#include <range/v3/view/zip.hpp>
 #include <unordered_set>
 
 #include "MeshLib/Utils/getOrCreateMeshProperty.h"
@@ -18,30 +22,79 @@
 
 namespace
 {
-std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>
-createResiduumVectors(
-    MeshLib::Mesh& mesh,
-    std::vector<std::string> const& residuum_names,
-    std::vector<std::reference_wrapper<ProcessLib::ProcessVariable>>
-        pvs)
-{
-    auto const num_residua = residuum_names.size();
 
-    std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>
-        residuum_vectors;
-    residuum_vectors.reserve(num_residua);
+void checkResiduumNamesVsProcessVariables(
+    std::vector<std::vector<std::string>> const& per_process_residuum_names,
+    std::vector<
+        std::vector<std::reference_wrapper<ProcessLib::ProcessVariable>>> const&
+        per_process_pvs)
+{
+    if (per_process_pvs.size() != per_process_residuum_names.size())
+    {
+        OGS_FATAL(
+            "The number of passed residuum names ({}) does not match the "
+            "number of processes ({}).",
+            per_process_residuum_names.size(), per_process_pvs.size());
+    }
 
-    for (std::size_t i = 0; i < num_residua; ++i)
+    auto check_sizes = [](std::size_t const process_id, auto const& rns_pvs)
     {
-        auto const& residuum_name = residuum_names[i];
-        auto const& pv = pvs[i].get();
+        auto const& [rns, pvs] = rns_pvs;
 
-        residuum_vectors.emplace_back(*MeshLib::getOrCreateMeshProperty<double>(
-            mesh, residuum_name, MeshLib::MeshItemType::Node,
-            pv.getNumberOfGlobalComponents()));
+        if (rns.size() != pvs.size())
+        {
+            OGS_FATAL(
+                "The number of passed residuum names ({}) does not match the "
+                "number of process variables ({}) for process {}.",
+                rns.size(), pvs.size(), process_id);
+        }
+    };
+    for (auto const& [process_id, rns_pvs] :
+         ranges::views::zip(per_process_residuum_names, per_process_pvs) |
+             ranges::views::enumerate)
+    {
+        check_sizes(process_id, rns_pvs);
     }
+}
 
-    return residuum_vectors;
+std::vector<
+    std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>>
+createResiduumVectors(
+    MeshLib::Mesh& mesh,
+    std::vector<std::vector<std::string>> const& per_process_residuum_names,
+    std::vector<
+        std::vector<std::reference_wrapper<ProcessLib::ProcessVariable>>> const&
+        per_process_pvs)
+{
+    checkResiduumNamesVsProcessVariables(per_process_residuum_names,
+                                         per_process_pvs);
+
+    auto create_mesh_property_for_residuum =
+        [&mesh](std::pair<std::string const&,
+                          ProcessLib::ProcessVariable const&> const&
+                    residuum_name_process_variable)
+        -> std::reference_wrapper<MeshLib::PropertyVector<double>>
+    {
+        auto const& [rn, pv] = residuum_name_process_variable;
+        return *MeshLib::getOrCreateMeshProperty<double>(
+            mesh, rn, MeshLib::MeshItemType::Node,
+            pv.getNumberOfGlobalComponents());
+    };
+
+    auto create_mesh_properties =
+        [&create_mesh_property_for_residuum](auto const& rns_pvs)
+    {
+        auto const& [rns, pvs] = rns_pvs;
+        return ranges::views::zip(rns, pvs) |
+               ranges::views::transform(create_mesh_property_for_residuum) |
+               ranges::to<std::vector<
+                   std::reference_wrapper<MeshLib::PropertyVector<double>>>>;
+    };
+
+    return ranges::views::zip(per_process_residuum_names, per_process_pvs) |
+           ranges::views::transform(create_mesh_properties) |
+           ranges::to<std::vector<std::vector<
+               std::reference_wrapper<MeshLib::PropertyVector<double>>>>>;
 }
 }  // namespace
 
@@ -49,7 +102,8 @@ namespace ProcessLib
 {
 SubmeshAssemblyData::SubmeshAssemblyData(
     MeshLib::Mesh const& mesh,
-    std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>&&
+    std::vector<
+        std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>>&&
         residuum_vectors)
     : bulk_element_ids{*MeshLib::bulkElementIDs(mesh)},
       bulk_node_ids{*MeshLib::bulkNodeIDs(mesh)},
@@ -58,24 +112,14 @@ SubmeshAssemblyData::SubmeshAssemblyData(
 }
 
 void AssemblyMixinBase::initializeAssemblyOnSubmeshes(
-    const int process_id,
     MeshLib::Mesh& bulk_mesh,
     std::vector<std::reference_wrapper<MeshLib::Mesh>> const& submeshes,
-    std::vector<std::string> const& residuum_names,
-    std::vector<std::reference_wrapper<ProcessVariable>> const& pvs)
+    std::vector<std::vector<std::string>> const& residuum_names,
+    std::vector<std::vector<std::reference_wrapper<ProcessVariable>>> const&
+        pvs)
 {
     DBUG("AssemblyMixinBase initializeSubmeshOutput().");
 
-    auto const num_residua = residuum_names.size();
-
-    if (pvs.size() != num_residua)
-    {
-        OGS_FATAL(
-            "The number of passed residuum names does not match the number "
-            "of process variables for process id {}: {} != {}",
-            process_id, num_residua, pvs.size());
-    }
-
     submesh_assembly_data_.reserve(submeshes.size());
     for (auto& mesh_ref : submeshes)
     {
@@ -184,16 +228,18 @@ void AssemblyMixinBase::copyResiduumVectorsToBulkMesh(
 }
 
 void AssemblyMixinBase::copyResiduumVectorsToSubmesh(
+    int const process_id,
     GlobalVector const& rhs,
     NumLib::LocalToGlobalIndexMap const& local_to_global_index_map,
     SubmeshAssemblyData const& sad)
 {
-    for (std::size_t variable_id = 0; variable_id < sad.residuum_vectors.size();
+    auto const& residuum_vectors = sad.residuum_vectors[process_id];
+    for (std::size_t variable_id = 0; variable_id < residuum_vectors.size();
          ++variable_id)
     {
         transformVariableFromGlobalVector(
             rhs, variable_id, local_to_global_index_map,
-            sad.residuum_vectors[variable_id].get(), sad.bulk_node_ids,
+            residuum_vectors[variable_id].get(), sad.bulk_node_ids,
             std::negate<double>());
     }
 }
diff --git a/ProcessLib/AssemblyMixin.h b/ProcessLib/AssemblyMixin.h
index 72e4f1a397f329d5d4cbd16b7fae9d6bbe09b91b..6b4bc09511fc86d3da2973553ad7f3104e82efe8 100644
--- a/ProcessLib/AssemblyMixin.h
+++ b/ProcessLib/AssemblyMixin.h
@@ -24,13 +24,15 @@ struct SubmeshAssemblyData
 {
     explicit SubmeshAssemblyData(
         MeshLib::Mesh const& mesh,
-        std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>&&
+        std::vector<std::vector<
+            std::reference_wrapper<MeshLib::PropertyVector<double>>>>&&
             residuum_vectors);
 
     MeshLib::PropertyVector<std::size_t> const& bulk_element_ids;
     MeshLib::PropertyVector<std::size_t> const& bulk_node_ids;
     std::vector<std::size_t> active_element_ids;
-    std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>
+    std::vector<
+        std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>>
         residuum_vectors;
 };
 
@@ -50,11 +52,11 @@ protected:
         : pvma_{jacobian_assembler} {};
 
     void initializeAssemblyOnSubmeshes(
-        const int process_id,
         MeshLib::Mesh& bulk_mesh,
         std::vector<std::reference_wrapper<MeshLib::Mesh>> const& submeshes,
-        std::vector<std::string> const& residuum_names,
-        std::vector<std::reference_wrapper<ProcessVariable>> const& pvs);
+        std::vector<std::vector<std::string>> const& residuum_names,
+        std::vector<std::vector<std::reference_wrapper<ProcessVariable>>> const&
+            pvs);
 
     void updateActiveElements(ProcessLib::Process const& process);
 
@@ -65,6 +67,7 @@ protected:
             residuum_vectors);
 
     static void copyResiduumVectorsToSubmesh(
+        int const process_id,
         GlobalVector const& rhs,
         NumLib::LocalToGlobalIndexMap const& local_to_global_index_map,
         SubmeshAssemblyData const& sad);
@@ -74,7 +77,8 @@ private:
 
 protected:
     std::vector<SubmeshAssemblyData> submesh_assembly_data_;
-    std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>
+    std::vector<
+        std::vector<std::reference_wrapper<MeshLib::PropertyVector<double>>>>
         residuum_vectors_bulk_;
 
     /// ID of the b vector on submeshes, cf. NumLib::VectorProvider.
@@ -122,13 +126,12 @@ public:
      * and staggered coupling.
      */
     void initializeAssemblyOnSubmeshes(
-        const int process_id,
         std::vector<std::reference_wrapper<MeshLib::Mesh>> const& submeshes,
-        std::vector<std::string> const& residuum_names)
+        std::vector<std::vector<std::string>> const& residuum_names)
     {
         AssemblyMixinBase::initializeAssemblyOnSubmeshes(
-            process_id, derived().getMesh(), submeshes, residuum_names,
-            derived().getProcessVariables(process_id));
+            derived().getMesh(), submeshes, residuum_names,
+            derived().getProcessVariables());
     }
 
     void updateActiveElements()
@@ -161,9 +164,8 @@ public:
         DBUG("AssemblyMixin assembleWithJacobian(t={}, dt={}, process_id={}).",
              t, dt, process_id);
 
-        // TODO why not getDOFTables(x.size()); ?
-        std::vector<NumLib::LocalToGlobalIndexMap const*> const dof_tables{
-            derived()._local_to_global_index_map.get()};
+        std::vector<NumLib::LocalToGlobalIndexMap const*> const dof_tables =
+            derived().getDOFTables(x.size());
 
         auto const& loc_asms = derived().local_assemblers_;
 
@@ -183,25 +185,20 @@ public:
                 MathLib::LinAlg::axpy(b, 1.0, b_submesh);
 
                 AssemblyMixinBase::copyResiduumVectorsToSubmesh(
-                    b_submesh, *(dof_tables.front()), sad);
+                    process_id, b_submesh, *(dof_tables[process_id]), sad);
             }
 
             NumLib::GlobalVectorProvider::provider.releaseVector(b_submesh);
         }
         else
         {
-            // convention: process variable 0 governs where assembly takes
-            // place (active element IDs)
-            ProcessLib::ProcessVariable const& pv =
-                derived().getProcessVariables(process_id)[0];
-
-            pvma_.assembleWithJacobian(loc_asms, pv.getActiveElementIDs(),
-                                       dof_tables, t, dt, x, x_prev, process_id,
-                                       b, Jac);
+            pvma_.assembleWithJacobian(
+                loc_asms, derived().getActiveElementIDs(), dof_tables, t, dt, x,
+                x_prev, process_id, b, Jac);
         }
 
         AssemblyMixinBase::copyResiduumVectorsToBulkMesh(
-            b, *(dof_tables.front()), residuum_vectors_bulk_);
+            b, *(dof_tables[process_id]), residuum_vectors_bulk_[process_id]);
     }
 
 private:
diff --git a/ProcessLib/CreateTimeLoop.cpp b/ProcessLib/CreateTimeLoop.cpp
index 0b22415787a7e8e2df50f860c3480879ea5d8b83..49b5bc64e85b28a65b6cbc82b5b5ee24add67179 100644
--- a/ProcessLib/CreateTimeLoop.cpp
+++ b/ProcessLib/CreateTimeLoop.cpp
@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <range/v3/algorithm/any_of.hpp>
+#include <range/v3/view/join.hpp>
 #include <string>
 
 #include "BaseLib/ConfigTree.h"
@@ -61,7 +62,7 @@ std::unique_ptr<TimeLoop> createTimeLoop(
             auto const& residuum_vector_names =
                 process->initializeAssemblyOnSubmeshes(smroc.meshes);
 
-            for (auto const& name : residuum_vector_names)
+            for (auto const& name : residuum_vector_names | ranges::views::join)
             {
                 smroc.output.doNotProjectFromBulkMeshToSubmeshes(
                     name, MeshLib::MeshItemType::Node);
diff --git a/ProcessLib/HydroMechanics/HydroMechanicsProcess.cpp b/ProcessLib/HydroMechanics/HydroMechanicsProcess.cpp
index d5a123a71abe403aaf7f3fe70f58f45e304454b4..eabd64447c1aabc68c4eee6c655bd53e10f86703 100644
--- a/ProcessLib/HydroMechanics/HydroMechanicsProcess.cpp
+++ b/ProcessLib/HydroMechanics/HydroMechanicsProcess.cpp
@@ -41,25 +41,21 @@ HydroMechanicsProcess<DisplacementDim>::HydroMechanicsProcess(
     : Process(std::move(name), mesh, std::move(jacobian_assembler), parameters,
               integration_order, std::move(process_variables),
               std::move(secondary_variables), use_monolithic_scheme),
-      _process_data(std::move(process_data))
+      AssemblyMixin<HydroMechanicsProcess<DisplacementDim>>{
+          *_jacobian_assembler},
+      process_data_(std::move(process_data))
 {
-    _nodal_forces = MeshLib::getOrCreateMeshProperty<double>(
-        mesh, "NodalForces", MeshLib::MeshItemType::Node, DisplacementDim);
-
-    _hydraulic_flow = MeshLib::getOrCreateMeshProperty<double>(
-        mesh, "MassFlowRate", MeshLib::MeshItemType::Node, 1);
-
     _integration_point_writer.emplace_back(
         std::make_unique<MeshLib::IntegrationPointWriter>(
             "sigma_ip",
             static_cast<int>(mesh.getDimension() == 2 ? 4 : 6) /*n components*/,
-            integration_order, _local_assemblers, &LocalAssemblerIF::getSigma));
+            integration_order, local_assemblers_, &LocalAssemblerIF::getSigma));
 
     _integration_point_writer.emplace_back(
         std::make_unique<MeshLib::IntegrationPointWriter>(
             "epsilon_ip",
             static_cast<int>(mesh.getDimension() == 2 ? 4 : 6) /*n components*/,
-            integration_order, _local_assemblers,
+            integration_order, local_assemblers_,
             &LocalAssemblerIF::getEpsilon));
 
     if (!_use_monolithic_scheme)
@@ -67,7 +63,7 @@ HydroMechanicsProcess<DisplacementDim>::HydroMechanicsProcess(
         _integration_point_writer.emplace_back(
             std::make_unique<MeshLib::IntegrationPointWriter>(
                 "strain_rate_variable_ip", 1, integration_order,
-                _local_assemblers, &LocalAssemblerIF::getStrainRateVariable));
+                local_assemblers_, &LocalAssemblerIF::getStrainRateVariable));
     }
 }
 
@@ -84,7 +80,7 @@ HydroMechanicsProcess<DisplacementDim>::getMatrixSpecifications(
 {
     // For the monolithic scheme or the M process (deformation) in the staggered
     // scheme.
-    if (process_id == _process_data.mechanics_related_process_id)
+    if (process_id == process_data_.mechanics_related_process_id)
     {
         auto const& l = *_local_to_global_index_map;
         return {l.dofSizeWithoutGhosts(), l.dofSizeWithoutGhosts(),
@@ -92,9 +88,9 @@ HydroMechanicsProcess<DisplacementDim>::getMatrixSpecifications(
     }
 
     // For staggered scheme and H process (pressure).
-    auto const& l = *_local_to_global_index_map_with_base_nodes;
+    auto const& l = *local_to_global_index_map_with_base_nodes_;
     return {l.dofSizeWithoutGhosts(), l.dofSizeWithoutGhosts(),
-            &l.getGhostIndices(), &_sparsity_pattern_with_linear_element};
+            &l.getGhostIndices(), &sparsity_pattern_with_linear_element_};
 }
 
 template <int DisplacementDim>
@@ -102,28 +98,28 @@ void HydroMechanicsProcess<DisplacementDim>::constructDofTable()
 {
     // Create single component dof in every of the mesh's nodes.
     _mesh_subset_all_nodes = std::make_unique<MeshLib::MeshSubset>(
-        _mesh, _mesh.getNodes(), _process_data.use_taylor_hood_elements);
+        _mesh, _mesh.getNodes(), process_data_.use_taylor_hood_elements);
 
     // Create single component dof in the mesh's base nodes.
-    _base_nodes = MeshLib::getBaseNodes(_mesh.getElements());
-    _mesh_subset_base_nodes = std::make_unique<MeshLib::MeshSubset>(
-        _mesh, _base_nodes, _process_data.use_taylor_hood_elements);
+    base_nodes_ = MeshLib::getBaseNodes(_mesh.getElements());
+    mesh_subset_base_nodes_ = std::make_unique<MeshLib::MeshSubset>(
+        _mesh, base_nodes_, process_data_.use_taylor_hood_elements);
 
     // TODO move the two data members somewhere else.
     // for extrapolation of secondary variables of stress or strain
     std::vector<MeshLib::MeshSubset> all_mesh_subsets_single_component{
         *_mesh_subset_all_nodes};
-    _local_to_global_index_map_single_component =
+    local_to_global_index_map_single_component_ =
         std::make_unique<NumLib::LocalToGlobalIndexMap>(
             std::move(all_mesh_subsets_single_component),
             // by location order is needed for output
             NumLib::ComponentOrder::BY_LOCATION);
 
-    if (_process_data.isMonolithicSchemeUsed())
+    if (process_data_.isMonolithicSchemeUsed())
     {
         // For pressure, which is the first
         std::vector<MeshLib::MeshSubset> all_mesh_subsets{
-            *_mesh_subset_base_nodes};
+            *mesh_subset_base_nodes_};
 
         // For displacement.
         const int monolithic_process_id = 0;
@@ -160,18 +156,18 @@ void HydroMechanicsProcess<DisplacementDim>::constructDofTable()
         // For pressure equation.
         // Collect the mesh subsets with base nodes in a vector.
         std::vector<MeshLib::MeshSubset> all_mesh_subsets_base_nodes{
-            *_mesh_subset_base_nodes};
-        _local_to_global_index_map_with_base_nodes =
+            *mesh_subset_base_nodes_};
+        local_to_global_index_map_with_base_nodes_ =
             std::make_unique<NumLib::LocalToGlobalIndexMap>(
                 std::move(all_mesh_subsets_base_nodes),
                 // by location order is needed for output
                 NumLib::ComponentOrder::BY_LOCATION);
 
-        _sparsity_pattern_with_linear_element = NumLib::computeSparsityPattern(
-            *_local_to_global_index_map_with_base_nodes, _mesh);
+        sparsity_pattern_with_linear_element_ = NumLib::computeSparsityPattern(
+            *local_to_global_index_map_with_base_nodes_, _mesh);
 
         assert(_local_to_global_index_map);
-        assert(_local_to_global_index_map_with_base_nodes);
+        assert(local_to_global_index_map_with_base_nodes_);
     }
 }
 
@@ -183,9 +179,9 @@ void HydroMechanicsProcess<DisplacementDim>::initializeConcreteProcess(
 {
     ProcessLib::createLocalAssemblersHM<DisplacementDim,
                                         HydroMechanicsLocalAssembler>(
-        mesh.getElements(), dof_table, _local_assemblers,
+        mesh.getElements(), dof_table, local_assemblers_,
         NumLib::IntegrationOrder{integration_order}, mesh.isAxiallySymmetric(),
-        _process_data);
+        process_data_);
 
     auto add_secondary_variable = [&](std::string const& name,
                                       int const num_components,
@@ -194,7 +190,7 @@ void HydroMechanicsProcess<DisplacementDim>::initializeConcreteProcess(
         _secondary_variables.addSecondaryVariable(
             name,
             makeExtrapolator(num_components, getExtrapolator(),
-                             _local_assemblers,
+                             local_assemblers_,
                              std::move(get_ip_values_function)));
     };
 
@@ -215,45 +211,45 @@ void HydroMechanicsProcess<DisplacementDim>::initializeConcreteProcess(
     // enable output of internal variables defined by material models
     //
     ProcessLib::Deformation::solidMaterialInternalToSecondaryVariables<
-        LocalAssemblerIF>(_process_data.solid_materials,
+        LocalAssemblerIF>(process_data_.solid_materials,
                           add_secondary_variable);
 
-    _process_data.pressure_interpolated =
+    process_data_.pressure_interpolated =
         MeshLib::getOrCreateMeshProperty<double>(
             const_cast<MeshLib::Mesh&>(mesh), "pressure_interpolated",
             MeshLib::MeshItemType::Node, 1);
 
-    _process_data.principal_stress_vector[0] =
+    process_data_.principal_stress_vector[0] =
         MeshLib::getOrCreateMeshProperty<double>(
             const_cast<MeshLib::Mesh&>(mesh), "principal_stress_vector_1",
             MeshLib::MeshItemType::Cell, 3);
 
-    _process_data.principal_stress_vector[1] =
+    process_data_.principal_stress_vector[1] =
         MeshLib::getOrCreateMeshProperty<double>(
             const_cast<MeshLib::Mesh&>(mesh), "principal_stress_vector_2",
             MeshLib::MeshItemType::Cell, 3);
 
-    _process_data.principal_stress_vector[2] =
+    process_data_.principal_stress_vector[2] =
         MeshLib::getOrCreateMeshProperty<double>(
             const_cast<MeshLib::Mesh&>(mesh), "principal_stress_vector_3",
             MeshLib::MeshItemType::Cell, 3);
 
-    _process_data.principal_stress_values =
+    process_data_.principal_stress_values =
         MeshLib::getOrCreateMeshProperty<double>(
             const_cast<MeshLib::Mesh&>(mesh), "principal_stress_values",
             MeshLib::MeshItemType::Cell, 3);
 
-    _process_data.permeability = MeshLib::getOrCreateMeshProperty<double>(
+    process_data_.permeability = MeshLib::getOrCreateMeshProperty<double>(
         const_cast<MeshLib::Mesh&>(mesh), "permeability",
         MeshLib::MeshItemType::Cell,
         MathLib::KelvinVector::kelvin_vector_dimensions(DisplacementDim));
 
     setIPDataInitialConditions(_integration_point_writer, mesh.getProperties(),
-                               _local_assemblers);
+                               local_assemblers_);
 
     // Initialize local assemblers after all variables have been set.
     GlobalExecutor::executeMemberOnDereferenced(&LocalAssemblerIF::initialize,
-                                                _local_assemblers,
+                                                local_assemblers_,
                                                 *_local_to_global_index_map);
 }
 
@@ -261,7 +257,7 @@ template <int DisplacementDim>
 void HydroMechanicsProcess<DisplacementDim>::initializeBoundaryConditions(
     std::map<int, std::shared_ptr<MaterialPropertyLib::Medium>> const& media)
 {
-    if (_process_data.isMonolithicSchemeUsed())
+    if (process_data_.isMonolithicSchemeUsed())
     {
         const int process_id_of_hydromechanics = 0;
         initializeProcessBoundaryConditionsAndSourceTerms(
@@ -273,7 +269,7 @@ void HydroMechanicsProcess<DisplacementDim>::initializeBoundaryConditions(
     // for the equations of pressure
     const int hydraulic_process_id = 0;
     initializeProcessBoundaryConditionsAndSourceTerms(
-        *_local_to_global_index_map_with_base_nodes, hydraulic_process_id,
+        *local_to_global_index_map_with_base_nodes_, hydraulic_process_id,
         media);
 
     // for the equations of deformation.
@@ -298,7 +294,7 @@ void HydroMechanicsProcess<DisplacementDim>::assembleConcreteProcess(
         _local_to_global_index_map.get()};
 
     GlobalExecutor::executeSelectedMemberDereferenced(
-        _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
+        _global_assembler, &VectorMatrixAssembler::assemble, local_assemblers_,
         getActiveElementIDs(), dof_table, t, dt, x, x_prev, process_id, &M, &K,
         &b);
 }
@@ -311,7 +307,7 @@ void HydroMechanicsProcess<DisplacementDim>::
         GlobalVector& b, GlobalMatrix& Jac)
 {
     // For the monolithic scheme
-    bool const use_monolithic_scheme = _process_data.isMonolithicSchemeUsed();
+    bool const use_monolithic_scheme = process_data_.isMonolithicSchemeUsed();
     if (use_monolithic_scheme)
     {
         DBUG(
@@ -321,7 +317,7 @@ void HydroMechanicsProcess<DisplacementDim>::
     else
     {
         // For the staggered scheme
-        if (process_id == _process_data.hydraulic_process_id)
+        if (process_id == process_data_.hydraulic_process_id)
         {
             DBUG(
                 "Assemble the Jacobian equations of liquid fluid process in "
@@ -335,35 +331,8 @@ void HydroMechanicsProcess<DisplacementDim>::
         }
     }
 
-    auto const dof_tables = getDOFTables(x.size());
-    GlobalExecutor::executeSelectedMemberDereferenced(
-        _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, getActiveElementIDs(), dof_tables, t, dt, x, x_prev,
-        process_id, &b, &Jac);
-
-    auto copyRhs = [&](int const variable_id, auto& output_vector)
-    {
-        if (use_monolithic_scheme)
-        {
-            transformVariableFromGlobalVector(b, variable_id, *dof_tables[0],
-                                              output_vector,
-                                              std::negate<double>());
-        }
-        else
-        {
-            transformVariableFromGlobalVector(b, 0, *dof_tables[process_id],
-                                              output_vector,
-                                              std::negate<double>());
-        }
-    };
-    if (process_id == _process_data.hydraulic_process_id)
-    {
-        copyRhs(0, *_hydraulic_flow);
-    }
-    if (process_id == _process_data.mechanics_related_process_id)
-    {
-        copyRhs(1, *_nodal_forces);
-    }
+    AssemblyMixin<HydroMechanicsProcess<DisplacementDim>>::assembleWithJacobian(
+        t, dt, x, x_prev, process_id, b, Jac);
 }
 
 template <int DisplacementDim>
@@ -376,19 +345,41 @@ void HydroMechanicsProcess<DisplacementDim>::preTimestepConcreteProcess(
     if (hasMechanicalProcess(process_id))
     {
         GlobalExecutor::executeSelectedMemberOnDereferenced(
-            &LocalAssemblerIF::preTimestep, _local_assemblers,
+            &LocalAssemblerIF::preTimestep, local_assemblers_,
             getActiveElementIDs(), *_local_to_global_index_map, *x[process_id],
             t, dt);
     }
 }
 
+template <int DisplacementDim>
+std::vector<std::vector<std::string>>
+HydroMechanicsProcess<DisplacementDim>::initializeAssemblyOnSubmeshes(
+    std::vector<std::reference_wrapper<MeshLib::Mesh>> const& meshes)
+{
+    INFO("HydroMechanicsProcess initializeSubmeshOutput().");
+    std::vector<std::vector<std::string>> per_process_residuum_names;
+    if (_process_variables.size() == 1)  // monolithic
+    {
+        per_process_residuum_names = {{"MassFlowRate", "NodalForces"}};
+    }
+    else  // staggered
+    {
+        per_process_residuum_names = {{"MassFlowRate"}, {"NodalForces"}};
+    }
+
+    AssemblyMixin<HydroMechanicsProcess<DisplacementDim>>::
+        initializeAssemblyOnSubmeshes(meshes, per_process_residuum_names);
+
+    return per_process_residuum_names;
+}
+
 template <int DisplacementDim>
 void HydroMechanicsProcess<DisplacementDim>::postTimestepConcreteProcess(
     std::vector<GlobalVector*> const& x,
     std::vector<GlobalVector*> const& x_prev, double const t, double const dt,
     const int process_id)
 {
-    if (process_id != _process_data.hydraulic_process_id)
+    if (process_id != process_data_.hydraulic_process_id)
     {
         return;
     }
@@ -396,7 +387,7 @@ void HydroMechanicsProcess<DisplacementDim>::postTimestepConcreteProcess(
     DBUG("PostTimestep HydroMechanicsProcess.");
 
     GlobalExecutor::executeSelectedMemberOnDereferenced(
-        &LocalAssemblerIF::postTimestep, _local_assemblers,
+        &LocalAssemblerIF::postTimestep, local_assemblers_,
         getActiveElementIDs(), getDOFTables(x.size()), x, x_prev, t, dt,
         process_id);
 }
@@ -411,7 +402,7 @@ void HydroMechanicsProcess<DisplacementDim>::postNonLinearSolverConcreteProcess(
 
     // Calculate strain, stress or other internal variables of mechanics.
     GlobalExecutor::executeSelectedMemberOnDereferenced(
-        &LocalAssemblerIF::postNonLinearSolver, _local_assemblers,
+        &LocalAssemblerIF::postNonLinearSolver, local_assemblers_,
         getActiveElementIDs(), getDOFTables(x.size()), x, x_prev, t, dt,
         process_id);
 }
@@ -423,7 +414,7 @@ void HydroMechanicsProcess<DisplacementDim>::
                                         int const process_id)
 {
     // So far, this function only sets the initial stress using the input data.
-    if (process_id != _process_data.mechanics_related_process_id)
+    if (process_id != process_data_.mechanics_related_process_id)
     {
         return;
     }
@@ -431,7 +422,7 @@ void HydroMechanicsProcess<DisplacementDim>::
     DBUG("Set initial conditions of HydroMechanicsProcess.");
 
     GlobalExecutor::executeSelectedMemberOnDereferenced(
-        &LocalAssemblerIF::setInitialConditions, _local_assemblers,
+        &LocalAssemblerIF::setInitialConditions, local_assemblers_,
         getActiveElementIDs(), getDOFTables(x.size()), x, t, process_id);
 }
 
@@ -440,7 +431,7 @@ void HydroMechanicsProcess<DisplacementDim>::computeSecondaryVariableConcrete(
     double const t, double const dt, std::vector<GlobalVector*> const& x,
     GlobalVector const& x_prev, const int process_id)
 {
-    if (process_id != _process_data.hydraulic_process_id)
+    if (process_id != process_data_.hydraulic_process_id)
     {
         return;
     }
@@ -448,7 +439,7 @@ void HydroMechanicsProcess<DisplacementDim>::computeSecondaryVariableConcrete(
     DBUG("Compute the secondary variables for HydroMechanicsProcess.");
 
     GlobalExecutor::executeSelectedMemberOnDereferenced(
-        &LocalAssemblerIF::computeSecondaryVariable, _local_assemblers,
+        &LocalAssemblerIF::computeSecondaryVariable, local_assemblers_,
         getActiveElementIDs(), getDOFTables(x.size()), t, dt, x, x_prev,
         process_id);
 }
@@ -458,7 +449,7 @@ std::tuple<NumLib::LocalToGlobalIndexMap*, bool>
 HydroMechanicsProcess<DisplacementDim>::getDOFTableForExtrapolatorData() const
 {
     const bool manage_storage = false;
-    return std::make_tuple(_local_to_global_index_map_single_component.get(),
+    return std::make_tuple(local_to_global_index_map_single_component_.get(),
                            manage_storage);
 }
 
@@ -472,7 +463,7 @@ HydroMechanicsProcess<DisplacementDim>::getDOFTable(const int process_id) const
     }
 
     // For the equation of pressure
-    return *_local_to_global_index_map_with_base_nodes;
+    return *local_to_global_index_map_with_base_nodes_;
 }
 
 template class HydroMechanicsProcess<2>;
diff --git a/ProcessLib/HydroMechanics/HydroMechanicsProcess.h b/ProcessLib/HydroMechanics/HydroMechanicsProcess.h
index 775fcd7f49ecdc3ffa59b3964e61e98feedada22..033e1f02db5454daedb8400f2d4ac98b98db7aaa 100644
--- a/ProcessLib/HydroMechanics/HydroMechanicsProcess.h
+++ b/ProcessLib/HydroMechanics/HydroMechanicsProcess.h
@@ -12,6 +12,7 @@
 
 #include "HydroMechanicsProcessData.h"
 #include "LocalAssemblerInterface.h"
+#include "ProcessLib/AssemblyMixin.h"
 #include "ProcessLib/Process.h"
 
 namespace ProcessLib
@@ -23,8 +24,12 @@ namespace HydroMechanics
 /// The mixture momentum balance and the mixture mass balance are solved under
 /// fully saturated conditions.
 template <int DisplacementDim>
-class HydroMechanicsProcess final : public Process
+class HydroMechanicsProcess final
+    : public Process,
+      private AssemblyMixin<HydroMechanicsProcess<DisplacementDim>>
 {
+    friend class AssemblyMixin<HydroMechanicsProcess<DisplacementDim>>;
+
 public:
     HydroMechanicsProcess(
         std::string name,
@@ -108,29 +113,33 @@ private:
     NumLib::LocalToGlobalIndexMap const& getDOFTable(
         const int process_id) const override;
 
+    std::vector<std::vector<std::string>> initializeAssemblyOnSubmeshes(
+        std::vector<std::reference_wrapper<MeshLib::Mesh>> const& meshes)
+        override;
+
     bool isMonolithicSchemeUsed() const override
     {
-        return _process_data.isMonolithicSchemeUsed();
+        return process_data_.isMonolithicSchemeUsed();
     }
 
 private:
-    std::vector<MeshLib::Node*> _base_nodes;
-    std::unique_ptr<MeshLib::MeshSubset const> _mesh_subset_base_nodes;
-    HydroMechanicsProcessData<DisplacementDim> _process_data;
+    std::vector<MeshLib::Node*> base_nodes_;
+    std::unique_ptr<MeshLib::MeshSubset const> mesh_subset_base_nodes_;
+    HydroMechanicsProcessData<DisplacementDim> process_data_;
 
-    std::vector<std::unique_ptr<LocalAssemblerIF>> _local_assemblers;
+    std::vector<std::unique_ptr<LocalAssemblerIF>> local_assemblers_;
 
     std::unique_ptr<NumLib::LocalToGlobalIndexMap>
-        _local_to_global_index_map_single_component;
+        local_to_global_index_map_single_component_;
 
     /// Local to global index mapping for base nodes, which is used for linear
     /// interpolation for pressure in the staggered scheme.
     std::unique_ptr<NumLib::LocalToGlobalIndexMap>
-        _local_to_global_index_map_with_base_nodes;
+        local_to_global_index_map_with_base_nodes_;
 
     /// Sparsity pattern for the flow equation, and it is initialized only if
     /// the staggered scheme is used.
-    GlobalSparsityPattern _sparsity_pattern_with_linear_element;
+    GlobalSparsityPattern sparsity_pattern_with_linear_element_;
 
     void computeSecondaryVariableConcrete(double const t, double const dt,
                                           std::vector<GlobalVector*> const& x,
@@ -147,11 +156,8 @@ private:
     /// process has process_id == 1 in the staggered scheme.
     bool hasMechanicalProcess(int const process_id) const
     {
-        return process_id == _process_data.mechanics_related_process_id;
+        return process_id == process_data_.mechanics_related_process_id;
     }
-
-    MeshLib::PropertyVector<double>* _nodal_forces = nullptr;
-    MeshLib::PropertyVector<double>* _hydraulic_flow = nullptr;
 };
 
 extern template class HydroMechanicsProcess<2>;
diff --git a/ProcessLib/LargeDeformation/LargeDeformationProcess.cpp b/ProcessLib/LargeDeformation/LargeDeformationProcess.cpp
index bf5a13101ff98b31c240f041c689c7a87ebb72a4..f6c587105182584fccb8025de0032e85082bd216 100644
--- a/ProcessLib/LargeDeformation/LargeDeformationProcess.cpp
+++ b/ProcessLib/LargeDeformation/LargeDeformationProcess.cpp
@@ -145,7 +145,7 @@ void LargeDeformationProcess<DisplacementDim>::assembleConcreteProcess(
         getActiveElementIDs(), dof_table, t, dt, x, x_prev, process_id, &M, &K,
         &b);
 
-    _global_output(t, process_id, &M, &K, b);
+    _global_output(t, process_id, M, K, b);
 }
 
 template <int DisplacementDim>
@@ -168,7 +168,7 @@ void LargeDeformationProcess<DisplacementDim>::
     transformVariableFromGlobalVector(b, 0, *_local_to_global_index_map,
                                       *_nodal_forces, std::negate<double>());
 
-    _global_output(t, process_id, nullptr, nullptr, b, &Jac);
+    _global_output(t, process_id, b, Jac);
 }
 
 template <int DisplacementDim>
diff --git a/ProcessLib/Process.h b/ProcessLib/Process.h
index b8c4fd9ed64b77eb76548a8296a3b2392bd1f436..954a71821f086a5e45d265f86eff1bf23926886d 100644
--- a/ProcessLib/Process.h
+++ b/ProcessLib/Process.h
@@ -151,6 +151,13 @@ public:
     }
 
     MeshLib::Mesh& getMesh() const { return _mesh; }
+
+    std::vector<std::vector<std::reference_wrapper<ProcessVariable>>> const&
+    getProcessVariables() const
+    {
+        return _process_variables;
+    }
+
     std::vector<std::reference_wrapper<ProcessVariable>> const&
     getProcessVariables(const int process_id) const
     {
diff --git a/ProcessLib/SmallDeformation/SmallDeformationProcess.cpp b/ProcessLib/SmallDeformation/SmallDeformationProcess.cpp
index b588aef7a42083ef6a8f6ad4dd4d371b011d12cf..8e9ea109e5130d0a14fcb488cd31a0c6c369c760 100644
--- a/ProcessLib/SmallDeformation/SmallDeformationProcess.cpp
+++ b/ProcessLib/SmallDeformation/SmallDeformationProcess.cpp
@@ -147,7 +147,7 @@ void SmallDeformationProcess<DisplacementDim>::assembleConcreteProcess(
         getActiveElementIDs(), dof_table, t, dt, x, x_prev, process_id, &M, &K,
         &b);
 
-    _global_output(t, process_id, &M, &K, b);
+    _global_output(t, process_id, M, K, b);
 }
 
 template <int DisplacementDim>
@@ -170,7 +170,7 @@ void SmallDeformationProcess<DisplacementDim>::
     transformVariableFromGlobalVector(b, 0, *_local_to_global_index_map,
                                       *_nodal_forces, std::negate<double>());
 
-    _global_output(t, process_id, nullptr, nullptr, b, &Jac);
+    _global_output(t, process_id, b, Jac);
 }
 
 template <int DisplacementDim>
diff --git a/ProcessLib/SubmeshAssemblySupport.h b/ProcessLib/SubmeshAssemblySupport.h
index 669b3029c3418b7a57173d0119094ca9e4c0b719..e188674ddf661dd87153663c52432ad6ca3201dc 100644
--- a/ProcessLib/SubmeshAssemblySupport.h
+++ b/ProcessLib/SubmeshAssemblySupport.h
@@ -32,8 +32,10 @@ public:
     /// \attention \c meshes must be a must be a non-overlapping cover of the
     /// entire simulation domain (bulk mesh)!
     ///
-    /// \return The names of the residuum vectors that will be assembled.
-    virtual std::vector<std::string> initializeAssemblyOnSubmeshes(
+    /// \return The names of the residuum vectors that will be assembled for
+    /// each process: outer vector of size 1 for monolithic schemes and greater
+    /// for staggered schemes.
+    virtual std::vector<std::vector<std::string>> initializeAssemblyOnSubmeshes(
         std::vector<std::reference_wrapper<MeshLib::Mesh>> const& meshes)
     {
         DBUG(
diff --git a/ProcessLib/TH2M/TH2MProcess.cpp b/ProcessLib/TH2M/TH2MProcess.cpp
index c7f66f32522cb681da24cf5ce46d044c069c9b1a..3f7e384cadeb18af92e19f754b698e8eeaf29ba9 100644
--- a/ProcessLib/TH2M/TH2MProcess.cpp
+++ b/ProcessLib/TH2M/TH2MProcess.cpp
@@ -316,17 +316,17 @@ void TH2MProcess<DisplacementDim>::computeSecondaryVariableConcrete(
 }
 
 template <int DisplacementDim>
-std::vector<std::string>
+std::vector<std::vector<std::string>>
 TH2MProcess<DisplacementDim>::initializeAssemblyOnSubmeshes(
     std::vector<std::reference_wrapper<MeshLib::Mesh>> const& meshes)
 {
     INFO("TH2M process initializeSubmeshOutput().");
-    const int process_id = 0;
-    std::vector<std::string> residuum_names{
-        "GasMassFlowRate", "LiquidMassFlowRate", "HeatFlowRate", "NodalForces"};
+    std::vector<std::vector<std::string>> residuum_names{
+        {"GasMassFlowRate", "LiquidMassFlowRate", "HeatFlowRate",
+         "NodalForces"}};
 
     AssemblyMixin<TH2MProcess<DisplacementDim>>::initializeAssemblyOnSubmeshes(
-        process_id, meshes, residuum_names);
+        meshes, residuum_names);
 
     return residuum_names;
 }
diff --git a/ProcessLib/TH2M/TH2MProcess.h b/ProcessLib/TH2M/TH2MProcess.h
index 6b8942efd4411dcd75074fe61452df056788ba4a..96ff5ddb1a2e36b10ff9bc015f13c40c60486682 100644
--- a/ProcessLib/TH2M/TH2MProcess.h
+++ b/ProcessLib/TH2M/TH2MProcess.h
@@ -105,7 +105,7 @@ private:
     NumLib::LocalToGlobalIndexMap const& getDOFTable(
         const int process_id) const override;
 
-    std::vector<std::string> initializeAssemblyOnSubmeshes(
+    std::vector<std::vector<std::string>> initializeAssemblyOnSubmeshes(
         std::vector<std::reference_wrapper<MeshLib::Mesh>> const& meshes)
         override;
 
diff --git a/ProcessLib/ThermalTwoPhaseFlowWithPP/ThermalTwoPhaseFlowWithPPProcess.cpp b/ProcessLib/ThermalTwoPhaseFlowWithPP/ThermalTwoPhaseFlowWithPPProcess.cpp
index 20165029fdda7c147ce11078f14ef6a307fdab55..a7df17686ffb75db872bbe6c89e809097ae2a5a1 100644
--- a/ProcessLib/ThermalTwoPhaseFlowWithPP/ThermalTwoPhaseFlowWithPPProcess.cpp
+++ b/ProcessLib/ThermalTwoPhaseFlowWithPP/ThermalTwoPhaseFlowWithPPProcess.cpp
@@ -96,7 +96,7 @@ void ThermalTwoPhaseFlowWithPPProcess::assembleConcreteProcess(
         getActiveElementIDs(), dof_table, t, dt, x, x_prev, process_id, &M, &K,
         &b);
 
-    _global_output(t, process_id, &M, &K, b);
+    _global_output(t, process_id, M, K, b);
 }
 
 void ThermalTwoPhaseFlowWithPPProcess::assembleWithJacobianConcreteProcess(
@@ -115,7 +115,7 @@ void ThermalTwoPhaseFlowWithPPProcess::assembleWithJacobianConcreteProcess(
         _local_assemblers, getActiveElementIDs(), dof_table, t, dt, x, x_prev,
         process_id, &b, &Jac);
 
-    _global_output(t, process_id, nullptr, nullptr, b, &Jac);
+    _global_output(t, process_id, b, Jac);
 }
 
 void ThermalTwoPhaseFlowWithPPProcess::preTimestepConcreteProcess(
diff --git a/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.cpp b/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.cpp
index 3813235dd1bfc89445f697f364d6ab01ef2ed13a..c763ac46da494aa8d16c2d3e152c0d49e21603a2 100644
--- a/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.cpp
+++ b/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.cpp
@@ -239,19 +239,18 @@ void ThermoRichardsMechanicsProcess<DisplacementDim, ConstitutiveTraits>::
 }
 
 template <int DisplacementDim, typename ConstitutiveTraits>
-std::vector<std::string>
+std::vector<std::vector<std::string>>
 ThermoRichardsMechanicsProcess<DisplacementDim, ConstitutiveTraits>::
     initializeAssemblyOnSubmeshes(
         std::vector<std::reference_wrapper<MeshLib::Mesh>> const& meshes)
 {
     INFO("TRM process initializeSubmeshOutput().");
-    const int process_id = 0;
-    std::vector<std::string> residuum_names{"HeatFlowRate", "MassFlowRate",
-                                            "NodalForces"};
+    std::vector<std::vector<std::string>> residuum_names{
+        {"HeatFlowRate", "MassFlowRate", "NodalForces"}};
 
     AssemblyMixin<
         ThermoRichardsMechanicsProcess<DisplacementDim, ConstitutiveTraits>>::
-        initializeAssemblyOnSubmeshes(process_id, meshes, residuum_names);
+        initializeAssemblyOnSubmeshes(meshes, residuum_names);
 
     return residuum_names;
 }
diff --git a/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.h b/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.h
index a13c2b0d33896f9cb9a390e6d55d541bc53e9925..6ae284ce890c20b636d34beadc8eb81c730c6307 100644
--- a/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.h
+++ b/ProcessLib/ThermoRichardsMechanics/ThermoRichardsMechanicsProcess.h
@@ -199,7 +199,7 @@ private:
     NumLib::LocalToGlobalIndexMap const& getDOFTable(
         const int process_id) const override;
 
-    std::vector<std::string> initializeAssemblyOnSubmeshes(
+    std::vector<std::vector<std::string>> initializeAssemblyOnSubmeshes(
         std::vector<std::reference_wrapper<MeshLib::Mesh>> const& meshes)
         override;
 
diff --git a/scripts/cmake/test/AddTest.cmake b/scripts/cmake/test/AddTest.cmake
index 9d35517e9e2df5e9713f257d2269ced6895ceb53..0ab5b41d5d85fd7c930a6f21325b7dfd0856f291 100644
--- a/scripts/cmake/test/AddTest.cmake
+++ b/scripts/cmake/test/AddTest.cmake
@@ -292,7 +292,7 @@ function(AddTest)
 
     # OpenMP tests for specific processes only. TODO (CL) Once all processes can
     # be assembled OpenMP parallel, the condition should be removed.
-    if("${labels}" MATCHES "TH2M|ThermoRichards")
+    if("${labels}" MATCHES "TH2M|ThermoRichardsMechanics|^HydroMechanics")
         _add_test(${TEST_NAME}-omp)
         _set_omp_test_properties()
     endif()
@@ -323,7 +323,7 @@ function(AddTest)
 
     # Run the tester
     _add_test_tester(${TEST_NAME})
-    if("${labels}" MATCHES "TH2M|ThermoRichards")
+    if("${labels}" MATCHES "TH2M|ThermoRichardsMechanics|^HydroMechanics")
         _add_test_tester(${TEST_NAME}-omp)
     endif()
 
diff --git a/scripts/cmake/test/OgsTest.cmake b/scripts/cmake/test/OgsTest.cmake
index 900bce228745c182758e997f049ba73bd179a3e4..8449f9d5b371dcb1fcca6711e9d1caaa6fb9fc5c 100644
--- a/scripts/cmake/test/OgsTest.cmake
+++ b/scripts/cmake/test/OgsTest.cmake
@@ -84,7 +84,7 @@ function(OgsTest)
 
     # OpenMP tests for specific processes only. TODO (CL) Once all processes can
     # be assembled OpenMP parallel, the condition should be removed.
-    if("${labels}" MATCHES "TH2M|ThermoRichards")
+    if("${labels}" MATCHES "TH2M|ThermoRichardsMechanics|^HydroMechanics")
         _ogs_add_test(${TEST_NAME}-omp)
         _set_omp_test_properties()
     endif()