diff --git a/Documentation/ProjectFile/prj/time_loop/global_process_coupling/convergence_criteria/i_convergence_criteria.md b/Documentation/ProjectFile/prj/time_loop/global_process_coupling/convergence_criteria/i_convergence_criteria.md
new file mode 100644
index 0000000000000000000000000000000000000000..e9c1a0d5361756589a4fd1e1c6af0f11c29342a3
--- /dev/null
+++ b/Documentation/ProjectFile/prj/time_loop/global_process_coupling/convergence_criteria/i_convergence_criteria.md
@@ -0,0 +1,2 @@
+Defines the convergence criteria of all coupled processes in the staggered
+ scheme.
\ No newline at end of file
diff --git a/Documentation/ProjectFile/prj/time_loop/global_process_coupling/convergence_criteria/t_convergence_criterion.md b/Documentation/ProjectFile/prj/time_loop/global_process_coupling/convergence_criteria/t_convergence_criterion.md
new file mode 100644
index 0000000000000000000000000000000000000000..dc4ea7fcf433db0046a84382ca56b8f729407e26
--- /dev/null
+++ b/Documentation/ProjectFile/prj/time_loop/global_process_coupling/convergence_criteria/t_convergence_criterion.md
@@ -0,0 +1 @@
+Defines the convergence criterion of each process in the staggered scheme.
\ No newline at end of file
diff --git a/Documentation/ProjectFile/prj/time_loop/global_process_coupling/i_global_process_coupling.md b/Documentation/ProjectFile/prj/time_loop/global_process_coupling/i_global_process_coupling.md
index 5263a21d1d8a82be5f52ed369b02b84ab11cfb19..5b4e2f190e8e92b61697d31ae647c0a846afb20c 100644
--- a/Documentation/ProjectFile/prj/time_loop/global_process_coupling/i_global_process_coupling.md
+++ b/Documentation/ProjectFile/prj/time_loop/global_process_coupling/i_global_process_coupling.md
@@ -1 +1 @@
-Global convergence criteria of the coupling iterations.
\ No newline at end of file
+Defines the global convergence controls for the staggered scheme.
\ No newline at end of file
diff --git a/Documentation/ProjectFile/prj/time_loop/global_process_coupling/t_convergence_criterion.md b/Documentation/ProjectFile/prj/time_loop/global_process_coupling/t_convergence_criterion.md
deleted file mode 100644
index cc1a434ed44fc6d0350448e5bc465705350a08f2..0000000000000000000000000000000000000000
--- a/Documentation/ProjectFile/prj/time_loop/global_process_coupling/t_convergence_criterion.md
+++ /dev/null
@@ -1,2 +0,0 @@
-Defines the convergence criteria of the global un-coupling loop of the staggered
- scheme.
diff --git a/Documentation/bibliography.bib b/Documentation/bibliography.bib
index 2e11dca3f5aeaa3130bc67ca2bc8f4536487381e..ac3fe78673864fea14cdd21f7456e881163a122d 100644
--- a/Documentation/bibliography.bib
+++ b/Documentation/bibliography.bib
@@ -130,3 +130,17 @@
   year={2015},
   publisher={Elsevier}
 }
+
+@article{kimTchJua2011,
+  title = "Stability and convergence of sequential methods for coupled flow and
+            geomechanics: Drained and undrained splits",
+  journal = "Computer Methods in Applied Mechanics and Engineering",
+  volume = "200",
+  number = "23-24",
+  pages = "2094--2116",
+  year = "2011",
+  note = "",
+  doi = "https://doi.org/10.1016/j.cma.2011.02.011",
+  url = "https://www.sciencedirect.com/science/article/pii/S0045782511000466",
+  author = "Kim, J. and Tchelepi, H.A. and Juanes, R"
+}
\ No newline at end of file
diff --git a/NumLib/DOF/MatrixProviderUser.h b/NumLib/DOF/MatrixProviderUser.h
index 4be6e8efe07202bc4e76a9c6fa7d9b119304d215..f12f7b8dadc92607ec16b3f9baf2bb4e2b815f86 100644
--- a/NumLib/DOF/MatrixProviderUser.h
+++ b/NumLib/DOF/MatrixProviderUser.h
@@ -19,7 +19,8 @@ namespace NumLib
 class MatrixSpecificationsProvider
 {
 public:
-    virtual MathLib::MatrixSpecifications getMatrixSpecifications() const = 0;
+    virtual MathLib::MatrixSpecifications getMatrixSpecifications(
+        const int process_id) const = 0;
 
     virtual ~MatrixSpecificationsProvider() = default;
 };
diff --git a/NumLib/ODESolver/TimeDiscretizedODESystem.cpp b/NumLib/ODESolver/TimeDiscretizedODESystem.cpp
index aa4b0818444dd47b2e7ad8039f1ff1a7a2c7276c..5c597a88e5f55e557feb2cf37291a4d7d642ae02 100644
--- a/NumLib/ODESolver/TimeDiscretizedODESystem.cpp
+++ b/NumLib/ODESolver/TimeDiscretizedODESystem.cpp
@@ -40,19 +40,20 @@ namespace NumLib
 {
 TimeDiscretizedODESystem<ODESystemTag::FirstOrderImplicitQuasilinear,
                          NonlinearSolverTag::Newton>::
-    TimeDiscretizedODESystem(ODE& ode, TimeDisc& time_discretization)
+    TimeDiscretizedODESystem(const int process_id,
+                             ODE& ode, TimeDisc& time_discretization)
     : _ode(ode),
       _time_disc(time_discretization),
       _mat_trans(createMatrixTranslator<ODETag>(time_discretization))
 {
     _Jac = &NumLib::GlobalMatrixProvider::provider.getMatrix(
-        _ode.getMatrixSpecifications(), _Jac_id);
+        _ode.getMatrixSpecifications(process_id), _Jac_id);
     _M = &NumLib::GlobalMatrixProvider::provider.getMatrix(
-        _ode.getMatrixSpecifications(), _M_id);
+        _ode.getMatrixSpecifications(process_id), _M_id);
     _K = &NumLib::GlobalMatrixProvider::provider.getMatrix(
-        _ode.getMatrixSpecifications(), _K_id);
+        _ode.getMatrixSpecifications(process_id), _K_id);
     _b = &NumLib::GlobalVectorProvider::provider.getVector(
-        _ode.getMatrixSpecifications(), _b_id);
+        _ode.getMatrixSpecifications(process_id), _b_id);
 }
 
 TimeDiscretizedODESystem<
@@ -152,17 +153,18 @@ void TimeDiscretizedODESystem<ODESystemTag::FirstOrderImplicitQuasilinear,
 
 TimeDiscretizedODESystem<ODESystemTag::FirstOrderImplicitQuasilinear,
                          NonlinearSolverTag::Picard>::
-    TimeDiscretizedODESystem(ODE& ode, TimeDisc& time_discretization)
+    TimeDiscretizedODESystem(const int process_id, ODE& ode,
+                             TimeDisc& time_discretization)
     : _ode(ode),
       _time_disc(time_discretization),
       _mat_trans(createMatrixTranslator<ODETag>(time_discretization))
 {
     _M = &NumLib::GlobalMatrixProvider::provider.getMatrix(
-        ode.getMatrixSpecifications(), _M_id);
+        ode.getMatrixSpecifications(process_id), _M_id);
     _K = &NumLib::GlobalMatrixProvider::provider.getMatrix(
-        ode.getMatrixSpecifications(), _K_id);
+        ode.getMatrixSpecifications(process_id), _K_id);
     _b = &NumLib::GlobalVectorProvider::provider.getVector(
-        ode.getMatrixSpecifications(), _b_id);
+        ode.getMatrixSpecifications(process_id), _b_id);
 }
 
 TimeDiscretizedODESystem<
diff --git a/NumLib/ODESolver/TimeDiscretizedODESystem.h b/NumLib/ODESolver/TimeDiscretizedODESystem.h
index a17cb034befe80cc09feef54ecd2c1493830f4e8..0533c66334712996eedd298ea379b02e0003a7a5 100644
--- a/NumLib/ODESolver/TimeDiscretizedODESystem.h
+++ b/NumLib/ODESolver/TimeDiscretizedODESystem.h
@@ -73,15 +73,16 @@ public:
 
     /*! Constructs a new instance.
      *
+     * \param process_id ID of the ODE to be solved.
      * \param ode the ODE to be wrapped.
      * \param time_discretization the time discretization to be used.
      */
-    explicit TimeDiscretizedODESystem(ODE& ode, TimeDisc& time_discretization);
+    explicit TimeDiscretizedODESystem(const int process_id, ODE& ode,
+                                      TimeDisc& time_discretization);
 
     ~TimeDiscretizedODESystem() override;
 
-    void assemble(const GlobalVector& x_new_timestep)
-                  override;
+    void assemble(const GlobalVector& x_new_timestep) override;
 
     void getResidual(GlobalVector const& x_new_timestep,
                      GlobalVector& res) const override;
@@ -114,9 +115,10 @@ public:
     }
 
     TimeDisc& getTimeDiscretization() override { return _time_disc; }
-    MathLib::MatrixSpecifications getMatrixSpecifications() const override
+    MathLib::MatrixSpecifications getMatrixSpecifications(
+        const int process_id) const override
     {
-        return _ode.getMatrixSpecifications();
+        return _ode.getMatrixSpecifications(process_id);
     }
 
 private:
@@ -169,12 +171,12 @@ public:
      * \param ode the ODE to be wrapped.
      * \param time_discretization the time discretization to be used.
      */
-    explicit TimeDiscretizedODESystem(ODE& ode, TimeDisc& time_discretization);
+    explicit TimeDiscretizedODESystem(const int process_id, ODE& ode,
+                                      TimeDisc& time_discretization);
 
     ~TimeDiscretizedODESystem() override;
 
-    void assemble(const GlobalVector& x_new_timestep)
-                  override;
+    void assemble(const GlobalVector& x_new_timestep) override;
 
     void getA(GlobalMatrix& A) const override
     {
@@ -212,9 +214,10 @@ public:
     }
 
     TimeDisc& getTimeDiscretization() override { return _time_disc; }
-    MathLib::MatrixSpecifications getMatrixSpecifications() const override
+    MathLib::MatrixSpecifications getMatrixSpecifications(
+        const int process_id) const override
     {
-        return _ode.getMatrixSpecifications();
+        return _ode.getMatrixSpecifications(process_id);
     }
 
 private:
diff --git a/ProcessLib/AbstractJacobianAssembler.h b/ProcessLib/AbstractJacobianAssembler.h
index db456d9fb84a13f2c3fbb6fa626536d120e658c0..11a91b18b7f34070f32f075a60c5e08a0eeeba3f 100644
--- a/ProcessLib/AbstractJacobianAssembler.h
+++ b/ProcessLib/AbstractJacobianAssembler.h
@@ -32,7 +32,7 @@ public:
 
     //! Assembles the Jacobian, the matrices \f$M\f$ and \f$K\f$, and the vector
     //! \f$b\f$ with coupling.
-    virtual void assembleWithJacobianAndCoupling(
+    virtual void assembleWithJacobianForStaggeredScheme(
         LocalAssemblerInterface& /*local_assembler*/, double const /*t*/,
         std::vector<double> const& /*local_xdot*/, const double /*dxdot_dx*/,
         const double /*dx_dx*/, std::vector<double>& /*local_M_data*/,
diff --git a/ProcessLib/AnalyticalJacobianAssembler.cpp b/ProcessLib/AnalyticalJacobianAssembler.cpp
index 837f867a657a11cfc3a52f60c87bab552b2e7e25..fd686e3d187412b7b3aee8611aa4e2eeba58d792 100644
--- a/ProcessLib/AnalyticalJacobianAssembler.cpp
+++ b/ProcessLib/AnalyticalJacobianAssembler.cpp
@@ -8,6 +8,7 @@
  */
 
 #include "AnalyticalJacobianAssembler.h"
+#include "CoupledSolutionsForStaggeredScheme.h"
 #include "LocalAssemblerInterface.h"
 
 namespace ProcessLib
@@ -17,11 +18,24 @@ void AnalyticalJacobianAssembler::assembleWithJacobian(
     std::vector<double> const& local_x, std::vector<double> const& local_xdot,
     const double dxdot_dx, const double dx_dx,
     std::vector<double>& local_M_data, std::vector<double>& local_K_data,
-    std::vector<double>& local_b_data,
-    std::vector<double>& local_Jac_data)
+    std::vector<double>& local_b_data, std::vector<double>& local_Jac_data)
 {
     local_assembler.assembleWithJacobian(t, local_x, local_xdot, dxdot_dx,
                                          dx_dx, local_M_data, local_K_data,
                                          local_b_data, local_Jac_data);
 }
+
+void AnalyticalJacobianAssembler::assembleWithJacobianForStaggeredScheme(
+    LocalAssemblerInterface& local_assembler, double const t,
+    std::vector<double> const& local_xdot, const double dxdot_dx,
+    const double dx_dx, std::vector<double>& local_M_data,
+    std::vector<double>& local_K_data, std::vector<double>& local_b_data,
+    std::vector<double>& local_Jac_data,
+    LocalCoupledSolutions const& local_coupled_solutions)
+{
+    local_assembler.assembleWithJacobianForStaggeredScheme(
+        t, local_xdot, dxdot_dx, dx_dx, local_M_data, local_K_data,
+        local_b_data, local_Jac_data, local_coupled_solutions);
+}
+
 }  // ProcessLib
diff --git a/ProcessLib/AnalyticalJacobianAssembler.h b/ProcessLib/AnalyticalJacobianAssembler.h
index 19da3228c0b387b12fb19e23d44140650d2ef432..005f3b0ae2287ca30e0f59cdabaf6d0423807755 100644
--- a/ProcessLib/AnalyticalJacobianAssembler.h
+++ b/ProcessLib/AnalyticalJacobianAssembler.h
@@ -18,6 +18,9 @@ class ConfigTree;
 
 namespace ProcessLib
 {
+
+struct LocalCoupledSolutions;
+
 //! Assembles the Jacobian matrix using a provided "analytical" method from the
 //! local assembler.
 class AnalyticalJacobianAssembler final : public AbstractJacobianAssembler
@@ -34,6 +37,14 @@ public:
         const double dx_dx, std::vector<double>& local_M_data,
         std::vector<double>& local_K_data, std::vector<double>& local_b_data,
         std::vector<double>& local_Jac_data) override;
+
+    void assembleWithJacobianForStaggeredScheme(
+        LocalAssemblerInterface& local_assembler,
+        double const t, std::vector<double> const& local_xdot,
+        const double dxdot_dx, const double dx_dx,
+        std::vector<double>& local_M_data, std::vector<double>& local_K_data,
+        std::vector<double>& local_b_data, std::vector<double>& local_Jac_data,
+        LocalCoupledSolutions const& local_coupled_solutions) override;
 };
 
 }  // ProcessLib
diff --git a/ProcessLib/ComponentTransport/ComponentTransportProcess.cpp b/ProcessLib/ComponentTransport/ComponentTransportProcess.cpp
index b342e461faa955bdddb00e22be6ebe5f974cc97a..0d8a1a4eb2d357d8a6ddb0c833f7fd825f9e284a 100644
--- a/ProcessLib/ComponentTransport/ComponentTransportProcess.cpp
+++ b/ProcessLib/ComponentTransport/ComponentTransportProcess.cpp
@@ -41,7 +41,8 @@ void ComponentTransportProcess::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    const int process_id = 0;
+    ProcessLib::ProcessVariable const& pv = getProcessVariables(process_id)[0];
     ProcessLib::createLocalAssemblers<LocalAssemblerData>(
         mesh.getDimension(), mesh.getElements(), dof_table,
         pv.getShapeFunctionOrder(), _local_assemblers,
@@ -62,13 +63,13 @@ void ComponentTransportProcess::assembleConcreteProcess(
     GlobalVector& b)
 {
     DBUG("Assemble ComponentTransportProcess.");
-    if (!_use_monolithic_scheme)
-        setCoupledSolutionsOfPreviousTimeStep();
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 void ComponentTransportProcess::assembleWithJacobianConcreteProcess(
@@ -78,13 +79,12 @@ void ComponentTransportProcess::assembleWithJacobianConcreteProcess(
 {
     DBUG("AssembleWithJacobian ComponentTransportProcess.");
 
-    if (!_use_monolithic_scheme)
-        setCoupledSolutionsOfPreviousTimeStep();
-
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
diff --git a/ProcessLib/CoupledSolutionsForStaggeredScheme.cpp b/ProcessLib/CoupledSolutionsForStaggeredScheme.cpp
index 1da175880d096f59540cb93f4a6a2ffd560eae98..9706bcfeaf594878154dccd259a0509e7b10836c 100644
--- a/ProcessLib/CoupledSolutionsForStaggeredScheme.cpp
+++ b/ProcessLib/CoupledSolutionsForStaggeredScheme.cpp
@@ -30,30 +30,37 @@ CoupledSolutionsForStaggeredScheme::CoupledSolutionsForStaggeredScheme(
 
 std::vector<std::vector<double>> getPreviousLocalSolutions(
     const CoupledSolutionsForStaggeredScheme& cpl_xs,
-    const std::vector<GlobalIndexType>& indices)
+    const std::vector<std::vector<GlobalIndexType>>& indices)
 {
+    if (cpl_xs.coupled_xs_t0.empty())
+        return {};
+
     const auto number_of_coupled_solutions = cpl_xs.coupled_xs.size();
     std::vector<std::vector<double>> local_xs_t0;
     local_xs_t0.reserve(number_of_coupled_solutions);
 
+    int coupling_id = 0;
     for (auto const& x_t0 : cpl_xs.coupled_xs_t0)
     {
-        local_xs_t0.emplace_back(x_t0->get(indices));
+        local_xs_t0.emplace_back(x_t0->get(indices[coupling_id]));
+        coupling_id++;
     }
     return local_xs_t0;
 }
 
 std::vector<std::vector<double>> getCurrentLocalSolutions(
     const CoupledSolutionsForStaggeredScheme& cpl_xs,
-    const std::vector<GlobalIndexType>& indices)
+    const std::vector<std::vector<GlobalIndexType>>& indices)
 {
     const auto number_of_coupled_solutions = cpl_xs.coupled_xs.size();
     std::vector<std::vector<double>> local_xs_t1;
     local_xs_t1.reserve(number_of_coupled_solutions);
 
+    int coupling_id = 0;
     for (auto const& x_t1 : cpl_xs.coupled_xs)
     {
-        local_xs_t1.emplace_back(x_t1.get().get(indices));
+        local_xs_t1.emplace_back(x_t1.get().get(indices[coupling_id]));
+        coupling_id++;
     }
     return local_xs_t1;
 }
diff --git a/ProcessLib/CoupledSolutionsForStaggeredScheme.h b/ProcessLib/CoupledSolutionsForStaggeredScheme.h
index 272ceb44efe62afc1feb7d86cbf58caaa8d47ceb..ee800490f717959e84898cb0e8dfc5ade65c26a4 100644
--- a/ProcessLib/CoupledSolutionsForStaggeredScheme.h
+++ b/ProcessLib/CoupledSolutionsForStaggeredScheme.h
@@ -12,7 +12,6 @@
 
 #pragma once
 
-#include <functional>
 #include <utility>
 #include <vector>
 
@@ -74,11 +73,25 @@ struct LocalCoupledSolutions
     std::vector<std::vector<double>> const local_coupled_xs;
 };
 
+/**
+ * Fetch the nodal solutions of all coupled processes of the previous time step
+ * of an element.
+ * @param cpl_xs  Solutions of all coupled equations.
+ * @param indices Nodal indices of an element.
+ * @return Nodal solutions of the previous time step of an element
+ */
 std::vector<std::vector<double>> getPreviousLocalSolutions(
     const CoupledSolutionsForStaggeredScheme& cpl_xs,
-    const std::vector<GlobalIndexType>& indices);
+    const std::vector<std::vector<GlobalIndexType>>& indices);
 
+/**
+ * Fetch the nodal solutions of all coupled processes of the current time step
+ * of an element.
+ * @param cpl_xs  Solutions of all coupled equations.
+ * @param indices Nodal indices of an element.
+ * @return Nodal solutions of the current time step of an element
+ */
 std::vector<std::vector<double>> getCurrentLocalSolutions(
     const CoupledSolutionsForStaggeredScheme& cpl_xs,
-    const std::vector<GlobalIndexType>& indices);
+    const std::vector<std::vector<GlobalIndexType>>& indices);
 }  // end of ProcessLib
diff --git a/ProcessLib/GroundwaterFlow/GroundwaterFlowProcess.cpp b/ProcessLib/GroundwaterFlow/GroundwaterFlowProcess.cpp
index 522861a1f969fa70d5a16374f6acc342760579fc..24c16c7fba114ff7935d9e830cd15a392d602ff1 100644
--- a/ProcessLib/GroundwaterFlow/GroundwaterFlowProcess.cpp
+++ b/ProcessLib/GroundwaterFlow/GroundwaterFlowProcess.cpp
@@ -45,7 +45,8 @@ void GroundwaterFlowProcess::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    const int process_id = 0;
+    ProcessLib::ProcessVariable const& pv = getProcessVariables(process_id)[0];
     ProcessLib::createLocalAssemblers<LocalAssemblerData>(
         mesh.getDimension(), mesh.getElements(), dof_table,
         pv.getShapeFunctionOrder(), _local_assemblers,
@@ -66,10 +67,12 @@ void GroundwaterFlowProcess::assembleConcreteProcess(const double t,
 {
     DBUG("Assemble GroundwaterFlowProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 void GroundwaterFlowProcess::assembleWithJacobianConcreteProcess(
@@ -79,10 +82,12 @@ void GroundwaterFlowProcess::assembleWithJacobianConcreteProcess(
 {
     DBUG("AssembleWithJacobian GroundwaterFlowProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
diff --git a/ProcessLib/GroundwaterFlow/GroundwaterFlowProcess.h b/ProcessLib/GroundwaterFlow/GroundwaterFlowProcess.h
index c3d7fcce2fa4d6982e09973954a319b3f0979fab..0072d92359a1f3791f63b6ba4d01bdc50280190d 100644
--- a/ProcessLib/GroundwaterFlow/GroundwaterFlowProcess.h
+++ b/ProcessLib/GroundwaterFlow/GroundwaterFlowProcess.h
@@ -60,8 +60,15 @@ public:
         return _local_assemblers[element_id]->getFlux(p, local_x);
     }
 
-    void postTimestepConcreteProcess(GlobalVector const& x) override
+    void postTimestepConcreteProcess(GlobalVector const& x,
+                                     int const process_id) override
     {
+        //For this single process, process_id is always zero.
+        if (process_id != 0)
+        {
+            OGS_FATAL("The condition of process_id = 0 must be satisfied for "
+                      "GroundwaterFlowProcess, which is a single process." );
+        }
         if (_balance_mesh) // computing the balance is optional
         {
             std::vector<double> init_values(
@@ -71,7 +78,9 @@ public:
                                        init_values);
             auto balance = ProcessLib::CalculateSurfaceFlux(
                 *_balance_mesh,
-                getProcessVariables()[0].get().getNumberOfComponents(),
+                getProcessVariables(process_id)[0]
+                    .get()
+                    .getNumberOfComponents(),
                 _integration_order);
 
             auto* const balance_pv =
diff --git a/ProcessLib/HT/HTProcess.cpp b/ProcessLib/HT/HTProcess.cpp
index 0849524005bbb8a9dedc55d5658fa507b0064d26..a4846b8052152f161fe876f4a00e9e113ac25e5f 100644
--- a/ProcessLib/HT/HTProcess.cpp
+++ b/ProcessLib/HT/HTProcess.cpp
@@ -11,6 +11,8 @@
 
 #include <cassert>
 
+#include "NumLib/DOF/LocalToGlobalIndexMap.h"
+
 #include "ProcessLib/Utils/CreateLocalAssemblers.h"
 
 #include "HTMaterialProperties.h"
@@ -46,7 +48,12 @@ void HTProcess::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    // For the staggered scheme, both processes are assumed to use the same
+    // element order. Therefore the order of shape function can be fetched from
+    // any set of the sets of process variables of the coupled processes. Here,
+    // we take the one from the first process by setting process_id = 0.
+    const int process_id = 0;
+    ProcessLib::ProcessVariable const& pv = getProcessVariables(process_id)[0];
 
     if (_use_monolithic_scheme)
     {
@@ -78,9 +85,12 @@ void HTProcess::assembleConcreteProcess(const double t,
                                         GlobalMatrix& K,
                                         GlobalVector& b)
 {
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_tables;
     if (_use_monolithic_scheme)
     {
         DBUG("Assemble HTProcess.");
+        dof_tables.emplace_back(*_local_to_global_index_map);
     }
     else
     {
@@ -97,12 +107,14 @@ void HTProcess::assembleConcreteProcess(const double t,
                 "fluid flow process within HTProcess.");
         }
         setCoupledSolutionsOfPreviousTimeStep();
+        dof_tables.emplace_back(*_local_to_global_index_map);
+        dof_tables.emplace_back(*_local_to_global_index_map);
     }
 
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_tables, t, x, M, K, b, _coupled_solutions);
 }
 
 void HTProcess::assembleWithJacobianConcreteProcess(
@@ -112,15 +124,23 @@ void HTProcess::assembleWithJacobianConcreteProcess(
 {
     DBUG("AssembleWithJacobian HTProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_tables;
     if (!_use_monolithic_scheme)
     {
         setCoupledSolutionsOfPreviousTimeStep();
+        dof_tables.emplace_back(std::ref(*_local_to_global_index_map));
+    }
+    else
+    {
+        dof_tables.emplace_back(std::ref(*_local_to_global_index_map));
+        dof_tables.emplace_back(std::ref(*_local_to_global_index_map));
     }
 
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_tables, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
@@ -160,5 +180,54 @@ void HTProcess::setCoupledTermForTheStaggeredSchemeToLocalAssemblers()
         _local_assemblers, _coupled_solutions);
 }
 
+std::tuple<NumLib::LocalToGlobalIndexMap*, bool>
+    HTProcess::getDOFTableForExtrapolatorData() const
+{
+    if (!_use_monolithic_scheme)
+    {
+        // For single-variable-single-component processes reuse the existing DOF
+        // table.
+        const bool manage_storage = false;
+        return std::make_tuple(_local_to_global_index_map.get(),
+                               manage_storage);
+    }
+
+    // Otherwise construct a new DOF table.
+    std::vector<MeshLib::MeshSubsets> all_mesh_subsets_single_component;
+    all_mesh_subsets_single_component.emplace_back(
+        _mesh_subset_all_nodes.get());
+
+    const bool manage_storage = true;
+    return std::make_tuple(new NumLib::LocalToGlobalIndexMap(
+        std::move(all_mesh_subsets_single_component),
+        // by location order is needed for output
+        NumLib::ComponentOrder::BY_LOCATION), manage_storage);
+}
+
+void HTProcess::setCoupledSolutionsOfPreviousTimeStep()
+{
+    const auto number_of_coupled_solutions =
+        _coupled_solutions->coupled_xs.size();
+    _coupled_solutions->coupled_xs_t0.clear();
+    _coupled_solutions->coupled_xs_t0.reserve(number_of_coupled_solutions);
+    const int process_id = _coupled_solutions->process_id;
+    for (std::size_t i = 0; i < number_of_coupled_solutions; i++)
+    {
+        const auto& x_t0 = _xs_previous_timestep[process_id];
+        if (x_t0 == nullptr)
+        {
+            OGS_FATAL(
+                "Memory is not allocated for the global vector "
+                "of the solution of the previous time step for the ."
+                "staggered scheme.\n It can be done by overriding "
+                "Process::preTimestepConcreteProcess"
+                "(ref. HTProcess::preTimestepConcreteProcess) ");
+        }
+
+        MathLib::LinAlg::setLocalAccessibleVector(*x_t0);
+        _coupled_solutions->coupled_xs_t0.emplace_back(x_t0.get());
+    }
+}
+
 }  // namespace HT
 }  // namespace ProcessLib
diff --git a/ProcessLib/HT/HTProcess.h b/ProcessLib/HT/HTProcess.h
index 6c836d78e9f521e4e6130c294f77a9de418f49ab..bdd86282f87a043fc7a79d357023f1e73a2ac209 100644
--- a/ProcessLib/HT/HTProcess.h
+++ b/ProcessLib/HT/HTProcess.h
@@ -14,6 +14,11 @@
 #include "NumLib/Extrapolation/LocalLinearLeastSquaresExtrapolator.h"
 #include "ProcessLib/Process.h"
 
+namespace NumLib
+{
+class LocalToGlobalIndexMap;
+}
+
 namespace ProcessLib
 {
 namespace HT
@@ -62,13 +67,6 @@ public:
     bool isLinear() const override { return false; }
     //! @}
 
-    // Get the solution of the previous time step.
-    GlobalVector* getPreviousTimeStepSolution(
-        const int process_id) const override
-    {
-        return _xs_previous_timestep[process_id].get();
-    }
-
     void setCoupledTermForTheStaggeredSchemeToLocalAssemblers() override;
 
 private:
@@ -90,6 +88,16 @@ private:
                                     double const dt,
                                     const int process_id) override;
 
+    /// Set the solutions of the previous time step to the coupled term.
+    /// It only performs for the staggered scheme.
+    void setCoupledSolutionsOfPreviousTimeStep();
+
+    /**
+     * @copydoc ProcessLib::Process::getDOFTableForExtrapolatorData()
+     */
+    std::tuple<NumLib::LocalToGlobalIndexMap*, bool>
+        getDOFTableForExtrapolatorData() const override;
+
     const std::unique_ptr<HTMaterialProperties> _material_properties;
 
     std::vector<std::unique_ptr<HTLocalAssemblerInterface>> _local_assemblers;
diff --git a/ProcessLib/HT/StaggeredHTFEM-impl.h b/ProcessLib/HT/StaggeredHTFEM-impl.h
index f13e73e70b834fb33683b49e335bbdbbf76d8701..7c12558ec2bf51da46ad34b52f92f7c62832457a 100644
--- a/ProcessLib/HT/StaggeredHTFEM-impl.h
+++ b/ProcessLib/HT/StaggeredHTFEM-impl.h
@@ -22,11 +22,11 @@ namespace HT
 template <typename ShapeFunction, typename IntegrationMethod,
           unsigned GlobalDim>
 void StaggeredHTFEM<ShapeFunction, IntegrationMethod, GlobalDim>::
-    assembleWithCoupledTerm(double const t,
-                            std::vector<double>& local_M_data,
-                            std::vector<double>& local_K_data,
-                            std::vector<double>& local_b_data,
-                            LocalCoupledSolutions const& coupled_xs)
+    assembleForStaggeredScheme(double const t,
+                               std::vector<double>& local_M_data,
+                               std::vector<double>& local_K_data,
+                               std::vector<double>& local_b_data,
+                               LocalCoupledSolutions const& coupled_xs)
 {
     if (coupled_xs.process_id == 0)
     {
@@ -277,8 +277,10 @@ StaggeredHTFEM<ShapeFunction, IntegrationMethod, GlobalDim>::
 {
     auto const indices = NumLib::getIndices(this->_element.getID(), dof_table);
     assert(!indices.empty());
-    auto const local_xs =
-        getCurrentLocalSolutions(*(this->_coupled_solutions), indices);
+    std::vector<std::vector<GlobalIndexType>> indices_of_all_coupled_processes =
+        {indices, indices};
+    auto const local_xs = getCurrentLocalSolutions(
+        *(this->_coupled_solutions), indices_of_all_coupled_processes);
 
     return this->getIntPtDarcyVelocityLocal(t, local_xs[0], local_xs[1], cache);
 }
diff --git a/ProcessLib/HT/StaggeredHTFEM.h b/ProcessLib/HT/StaggeredHTFEM.h
index 947452a97c447fe452483e80972d5d51033acdc4..d9296be17add4161be26819fc6ae357818240d33 100644
--- a/ProcessLib/HT/StaggeredHTFEM.h
+++ b/ProcessLib/HT/StaggeredHTFEM.h
@@ -62,7 +62,7 @@ public:
     {
     }
 
-    void assembleWithCoupledTerm(
+    void assembleForStaggeredScheme(
         double const t, std::vector<double>& local_M_data,
         std::vector<double>& local_K_data, std::vector<double>& local_b_data,
         LocalCoupledSolutions const& coupled_xs) override;
diff --git a/ProcessLib/HeatConduction/HeatConductionProcess.cpp b/ProcessLib/HeatConduction/HeatConductionProcess.cpp
index 758faddb405d9c36dbe1f27083156a239b95e82c..6213e31ffc242739fd3368446deb4ed243f947a6 100644
--- a/ProcessLib/HeatConduction/HeatConductionProcess.cpp
+++ b/ProcessLib/HeatConduction/HeatConductionProcess.cpp
@@ -39,7 +39,8 @@ void HeatConductionProcess::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    const int process_id = 0;
+    ProcessLib::ProcessVariable const& pv = getProcessVariables(process_id)[0];
     ProcessLib::createLocalAssemblers<LocalAssemblerData>(
         mesh.getDimension(), mesh.getElements(), dof_table,
         pv.getShapeFunctionOrder(), _local_assemblers,
@@ -77,10 +78,12 @@ void HeatConductionProcess::assembleConcreteProcess(const double t,
 {
     DBUG("Assemble HeatConductionProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 void HeatConductionProcess::assembleWithJacobianConcreteProcess(
@@ -90,10 +93,12 @@ void HeatConductionProcess::assembleWithJacobianConcreteProcess(
 {
     DBUG("AssembleWithJacobian HeatConductionProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
diff --git a/ProcessLib/HydroMechanics/CreateHydroMechanicsProcess.cpp b/ProcessLib/HydroMechanics/CreateHydroMechanicsProcess.cpp
index d615b6f887958c1dfd83d7e815ba390bc86b6c65..61782e91cc3a90717f19fa5da681e41dee02bc0a 100644
--- a/ProcessLib/HydroMechanics/CreateHydroMechanicsProcess.cpp
+++ b/ProcessLib/HydroMechanics/CreateHydroMechanicsProcess.cpp
@@ -187,11 +187,13 @@ std::unique_ptr<Process> createHydroMechanicsProcess(
             config.getConfigParameter<std::vector<double>>(
                 "specific_body_force");
         if (b.size() != DisplacementDim)
+        {
             OGS_FATAL(
                 "The size of the specific body force vector does not match the "
                 "displacement dimension. Vector size is %d, displacement "
                 "dimension is %d",
                 b.size(), DisplacementDim);
+        }
 
         std::copy_n(b.data(), b.size(), specific_body_force.data());
     }
diff --git a/ProcessLib/HydroMechanics/HydroMechanicsFEM-impl.h b/ProcessLib/HydroMechanics/HydroMechanicsFEM-impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..0fabb9d555b6a46554e0a04870b8b2954e0a5066
--- /dev/null
+++ b/ProcessLib/HydroMechanics/HydroMechanicsFEM-impl.h
@@ -0,0 +1,576 @@
+/**
+ * \copyright
+ * Copyright (c) 2012-2018, OpenGeoSys Community (http://www.opengeosys.org)
+ *            Distributed under a Modified BSD License.
+ *              See accompanying file LICENSE.txt or
+ *              http://www.opengeosys.org/project/license
+ *
+ *  \file   HydroMechanicsFEM-impl.h
+ *  Created on November 29, 2017, 2:03 PM
+ */
+
+#pragma once
+
+#include "HydroMechanicsFEM.h"
+
+#include "MaterialLib/SolidModels/KelvinVector.h"
+#include "NumLib/Function/Interpolation.h"
+
+namespace ProcessLib
+{
+namespace HydroMechanics
+{
+template <typename ShapeFunctionDisplacement, typename ShapeFunctionPressure,
+          typename IntegrationMethod, int DisplacementDim>
+HydroMechanicsLocalAssembler<ShapeFunctionDisplacement, ShapeFunctionPressure,
+                             IntegrationMethod, DisplacementDim>::
+    HydroMechanicsLocalAssembler(
+        MeshLib::Element const& e,
+        std::size_t const /*local_matrix_size*/,
+        bool const is_axially_symmetric,
+        unsigned const integration_order,
+        HydroMechanicsProcessData<DisplacementDim>& process_data)
+    : _process_data(process_data),
+      _integration_method(integration_order),
+      _element(e),
+      _is_axially_symmetric(is_axially_symmetric)
+{
+    unsigned const n_integration_points =
+        _integration_method.getNumberOfPoints();
+
+    _ip_data.reserve(n_integration_points);
+    _secondary_data.N_u.resize(n_integration_points);
+
+    auto const shape_matrices_u =
+        initShapeMatrices<ShapeFunctionDisplacement,
+                          ShapeMatricesTypeDisplacement, IntegrationMethod,
+                          DisplacementDim>(e, is_axially_symmetric,
+                                           _integration_method);
+
+    auto const shape_matrices_p =
+        initShapeMatrices<ShapeFunctionPressure, ShapeMatricesTypePressure,
+                          IntegrationMethod, DisplacementDim>(
+            e, is_axially_symmetric, _integration_method);
+
+    for (unsigned ip = 0; ip < n_integration_points; ip++)
+    {
+        // displacement (subscript u)
+        _ip_data.emplace_back(*_process_data.material);
+        auto& ip_data = _ip_data[ip];
+        auto const& sm_u = shape_matrices_u[ip];
+        _ip_data[ip].integration_weight =
+            _integration_method.getWeightedPoint(ip).getWeight() *
+            sm_u.integralMeasure * sm_u.detJ;
+
+        // Initialize current time step values
+        ip_data.sigma_eff.setZero(kelvin_vector_size);
+        ip_data.eps.setZero(kelvin_vector_size);
+
+        // Previous time step values are not initialized and are set later.
+        ip_data.eps_prev.resize(kelvin_vector_size);
+        ip_data.sigma_eff_prev.resize(kelvin_vector_size);
+
+        ip_data.N_u_op = ShapeMatricesTypeDisplacement::template MatrixType<
+            DisplacementDim, displacement_size>::Zero(DisplacementDim,
+                                                      displacement_size);
+        for (int i = 0; i < DisplacementDim; ++i)
+            ip_data.N_u_op
+                .template block<1, displacement_size / DisplacementDim>(
+                    i, i * displacement_size / DisplacementDim)
+                .noalias() = sm_u.N;
+
+        ip_data.N_u = sm_u.N;
+        ip_data.dNdx_u = sm_u.dNdx;
+
+        ip_data.N_p = shape_matrices_p[ip].N;
+        ip_data.dNdx_p = shape_matrices_p[ip].dNdx;
+
+        _secondary_data.N_u[ip] = shape_matrices_u[ip].N;
+    }
+}
+
+template <typename ShapeFunctionDisplacement, typename ShapeFunctionPressure,
+          typename IntegrationMethod, int DisplacementDim>
+void HydroMechanicsLocalAssembler<ShapeFunctionDisplacement,
+                                  ShapeFunctionPressure, IntegrationMethod,
+                                  DisplacementDim>::
+    assembleWithJacobian(double const t, std::vector<double> const& local_x,
+                         std::vector<double> const& local_xdot,
+                         const double /*dxdot_dx*/, const double /*dx_dx*/,
+                         std::vector<double>& /*local_M_data*/,
+                         std::vector<double>& /*local_K_data*/,
+                         std::vector<double>& local_rhs_data,
+                         std::vector<double>& local_Jac_data)
+{
+    assert(local_x.size() == pressure_size + displacement_size);
+
+    auto p = Eigen::Map<typename ShapeMatricesTypePressure::template VectorType<
+        pressure_size> const>(local_x.data() + pressure_index, pressure_size);
+
+    auto u =
+        Eigen::Map<typename ShapeMatricesTypeDisplacement::template VectorType<
+            displacement_size> const>(local_x.data() + displacement_index,
+                                      displacement_size);
+
+    auto p_dot =
+        Eigen::Map<typename ShapeMatricesTypePressure::template VectorType<
+            pressure_size> const>(local_xdot.data() + pressure_index,
+                                  pressure_size);
+    auto u_dot =
+        Eigen::Map<typename ShapeMatricesTypeDisplacement::template VectorType<
+            displacement_size> const>(local_xdot.data() + displacement_index,
+                                      displacement_size);
+
+    auto local_Jac = MathLib::createZeroedMatrix<
+        typename ShapeMatricesTypeDisplacement::template MatrixType<
+            displacement_size + pressure_size,
+            displacement_size + pressure_size>>(
+        local_Jac_data, displacement_size + pressure_size,
+        displacement_size + pressure_size);
+
+    auto local_rhs = MathLib::createZeroedVector<
+        typename ShapeMatricesTypeDisplacement::template VectorType<
+            displacement_size + pressure_size>>(
+        local_rhs_data, displacement_size + pressure_size);
+
+    typename ShapeMatricesTypePressure::NodalMatrixType laplace_p =
+        ShapeMatricesTypePressure::NodalMatrixType::Zero(pressure_size,
+                                                         pressure_size);
+
+    typename ShapeMatricesTypePressure::NodalMatrixType storage_p =
+        ShapeMatricesTypePressure::NodalMatrixType::Zero(pressure_size,
+                                                         pressure_size);
+
+    typename ShapeMatricesTypeDisplacement::template MatrixType<
+        displacement_size, pressure_size>
+        Kup = ShapeMatricesTypeDisplacement::template MatrixType<
+            displacement_size, pressure_size>::Zero(displacement_size,
+                                                    pressure_size);
+
+    double const& dt = _process_data.dt;
+
+    SpatialPosition x_position;
+    x_position.setElementID(_element.getID());
+
+    unsigned const n_integration_points =
+        _integration_method.getNumberOfPoints();
+    for (unsigned ip = 0; ip < n_integration_points; ip++)
+    {
+        x_position.setIntegrationPoint(ip);
+        auto const& w = _ip_data[ip].integration_weight;
+
+        auto const& N_u_op = _ip_data[ip].N_u_op;
+
+        auto const& N_u = _ip_data[ip].N_u;
+        auto const& dNdx_u = _ip_data[ip].dNdx_u;
+
+        auto const& N_p = _ip_data[ip].N_p;
+        auto const& dNdx_p = _ip_data[ip].dNdx_p;
+
+        auto const x_coord =
+            interpolateXCoordinate<ShapeFunctionDisplacement,
+                                   ShapeMatricesTypeDisplacement>(_element,
+                                                                  N_u);
+        auto const B =
+            LinearBMatrix::computeBMatrix<DisplacementDim,
+                                          ShapeFunctionDisplacement::NPOINTS,
+                                          typename BMatricesType::BMatrixType>(
+                dNdx_u, N_u, x_coord, _is_axially_symmetric);
+
+        auto& eps = _ip_data[ip].eps;
+        auto const& sigma_eff = _ip_data[ip].sigma_eff;
+
+        double const S = _process_data.specific_storage(t, x_position)[0];
+        double const K_over_mu =
+            _process_data.intrinsic_permeability(t, x_position)[0] /
+            _process_data.fluid_viscosity(t, x_position)[0];
+        auto const alpha = _process_data.biot_coefficient(t, x_position)[0];
+        auto const rho_sr = _process_data.solid_density(t, x_position)[0];
+        auto const rho_fr = _process_data.fluid_density(t, x_position)[0];
+        auto const porosity = _process_data.porosity(t, x_position)[0];
+        auto const& b = _process_data.specific_body_force;
+        auto const& identity2 = MaterialLib::SolidModels::Invariants<
+            KelvinVectorDimensions<DisplacementDim>::value>::identity2;
+
+        //
+        // displacement equation, displacement part
+        //
+        eps.noalias() = B * u;
+
+        auto C = _ip_data[ip].updateConstitutiveRelation(t, x_position, dt, u);
+
+        local_Jac
+            .template block<displacement_size, displacement_size>(
+                displacement_index, displacement_index)
+            .noalias() += B.transpose() * C * B * w;
+
+        double const rho = rho_sr * (1 - porosity) + porosity * rho_fr;
+        local_rhs.template segment<displacement_size>(displacement_index)
+            .noalias() -=
+            (B.transpose() * sigma_eff - N_u_op.transpose() * rho * b) * w;
+
+        //
+        // displacement equation, pressure part
+        //
+        Kup.noalias() += B.transpose() * alpha * identity2 * N_p * w;
+
+        //
+        // pressure equation, pressure part.
+        //
+        laplace_p.noalias() += dNdx_p.transpose() * K_over_mu * dNdx_p * w;
+
+        storage_p.noalias() += N_p.transpose() * S * N_p * w;
+
+        local_rhs.template segment<pressure_size>(pressure_index).noalias() +=
+            dNdx_p.transpose() * rho_fr * K_over_mu * b * w;
+
+        //
+        // pressure equation, displacement part.
+        //
+        // Reusing Kup.transpose().
+    }
+    // displacement equation, pressure part
+    local_Jac
+        .template block<displacement_size, pressure_size>(displacement_index,
+                                                          pressure_index)
+        .noalias() = -Kup;
+
+    // pressure equation, pressure part.
+    local_Jac
+        .template block<pressure_size, pressure_size>(pressure_index,
+                                                      pressure_index)
+        .noalias() = laplace_p + storage_p / dt;
+
+    // pressure equation, displacement part.
+    local_Jac
+        .template block<pressure_size, displacement_size>(pressure_index,
+                                                          displacement_index)
+        .noalias() = Kup.transpose() / dt;
+
+    // pressure equation
+    local_rhs.template segment<pressure_size>(pressure_index).noalias() -=
+        laplace_p * p + storage_p * p_dot + Kup.transpose() * u_dot;
+
+    // displacement equation
+    local_rhs.template segment<displacement_size>(displacement_index)
+        .noalias() += Kup * p;
+}
+
+template <typename ShapeFunctionDisplacement, typename ShapeFunctionPressure,
+          typename IntegrationMethod, int DisplacementDim>
+std::vector<double> const&
+HydroMechanicsLocalAssembler<ShapeFunctionDisplacement, ShapeFunctionPressure,
+                             IntegrationMethod, DisplacementDim>::
+    getIntPtDarcyVelocity(const double t,
+                          GlobalVector const& current_solution,
+                          NumLib::LocalToGlobalIndexMap const& dof_table,
+                          std::vector<double>& cache) const
+{
+    auto const num_intpts = _ip_data.size();
+
+    auto const indices = NumLib::getIndices(_element.getID(), dof_table);
+    assert(!indices.empty());
+    auto const local_x = current_solution.get(indices);
+
+    cache.clear();
+    auto cache_matrix = MathLib::createZeroedMatrix<Eigen::Matrix<
+        double, DisplacementDim, Eigen::Dynamic, Eigen::RowMajor>>(
+        cache, DisplacementDim, num_intpts);
+
+    SpatialPosition pos;
+    pos.setElementID(_element.getID());
+
+    auto p = Eigen::Map<typename ShapeMatricesTypePressure::template VectorType<
+        pressure_size> const>(local_x.data() + pressure_index, pressure_size);
+
+    unsigned const n_integration_points =
+        _integration_method.getNumberOfPoints();
+
+    SpatialPosition x_position;
+    x_position.setElementID(_element.getID());
+    for (unsigned ip = 0; ip < n_integration_points; ip++)
+    {
+        x_position.setIntegrationPoint(ip);
+        double const K_over_mu =
+            _process_data.intrinsic_permeability(t, x_position)[0] /
+            _process_data.fluid_viscosity(t, x_position)[0];
+
+        auto const rho_fr = _process_data.fluid_density(t, x_position)[0];
+        auto const& b = _process_data.specific_body_force;
+
+        // Compute the velocity
+        auto const& dNdx_p = _ip_data[ip].dNdx_p;
+        cache_matrix.col(ip).noalias() =
+            -K_over_mu * dNdx_p * p - K_over_mu * rho_fr * b;
+    }
+
+    return cache;
+}
+
+template <typename ShapeFunctionDisplacement, typename ShapeFunctionPressure,
+          typename IntegrationMethod, int DisplacementDim>
+void HydroMechanicsLocalAssembler<ShapeFunctionDisplacement,
+                                  ShapeFunctionPressure, IntegrationMethod,
+                                  DisplacementDim>::
+    assembleWithJacobianForPressureEquations(
+        const double t, const std::vector<double>& local_xdot,
+        const double /*dxdot_dx*/, const double /*dx_dx*/,
+        std::vector<double>& /*local_M_data*/,
+        std::vector<double>& /*local_K_data*/,
+        std::vector<double>& local_b_data, std::vector<double>& local_Jac_data,
+        const LocalCoupledSolutions& local_coupled_solutions)
+{
+    auto const& local_p = local_coupled_solutions.local_coupled_xs[0];
+    auto const& local_u = local_coupled_solutions.local_coupled_xs[1];
+    assert(local_p.size() == pressure_size);
+    assert(local_u.size() == displacement_size);
+
+    auto const local_matrix_size = local_p.size();
+    auto local_rhs =
+        MathLib::createZeroedVector<typename ShapeMatricesTypeDisplacement::
+                                        template VectorType<pressure_size>>(
+            local_b_data, local_matrix_size);
+
+    SpatialPosition pos;
+    pos.setElementID(this->_element.getID());
+
+    auto p = Eigen::Map<typename ShapeMatricesTypePressure::template VectorType<
+        pressure_size> const>(local_p.data(), pressure_size);
+    auto u =
+        Eigen::Map<typename ShapeMatricesTypeDisplacement::template VectorType<
+            displacement_size> const>(local_u.data(), displacement_size);
+
+    auto p_dot =
+        Eigen::Map<typename ShapeMatricesTypePressure::template VectorType<
+            pressure_size> const>(local_xdot.data(), pressure_size);
+
+    auto local_Jac = MathLib::createZeroedMatrix<
+        typename ShapeMatricesTypeDisplacement::template MatrixType<
+            pressure_size, pressure_size>>(local_Jac_data, pressure_size,
+                                           pressure_size);
+    typename ShapeMatricesTypePressure::NodalMatrixType mass =
+        ShapeMatricesTypePressure::NodalMatrixType::Zero(pressure_size,
+                                                         pressure_size);
+
+    typename ShapeMatricesTypePressure::NodalMatrixType laplace =
+        ShapeMatricesTypePressure::NodalMatrixType::Zero(pressure_size,
+                                                         pressure_size);
+
+    double const& dt = _process_data.dt;
+
+    SpatialPosition x_position;
+    x_position.setElementID(_element.getID());
+
+    int const n_integration_points = _integration_method.getNumberOfPoints();
+    for (int ip = 0; ip < n_integration_points; ip++)
+    {
+        x_position.setIntegrationPoint(ip);
+        auto const& w = _ip_data[ip].integration_weight;
+
+        auto const& N_u = _ip_data[ip].N_u;
+        auto const& dNdx_u = _ip_data[ip].dNdx_u;
+
+        auto const& N_p = _ip_data[ip].N_p;
+        auto const& dNdx_p = _ip_data[ip].dNdx_p;
+
+        double const S = _process_data.specific_storage(t, x_position)[0];
+        double const K_over_mu =
+            _process_data.intrinsic_permeability(t, x_position)[0] /
+            _process_data.fluid_viscosity(t, x_position)[0];
+        auto const alpha_b = _process_data.biot_coefficient(t, x_position)[0];
+        auto const rho_fr = _process_data.fluid_density(t, x_position)[0];
+
+        laplace.noalias() += dNdx_p.transpose() * K_over_mu * dNdx_p * w;
+
+        mass.noalias() += N_p.transpose() * S * N_p * w;
+
+        auto const& b = _process_data.specific_body_force;
+        local_rhs.noalias() += dNdx_p.transpose() * rho_fr * K_over_mu * b * w;
+
+        auto& eps = _ip_data[ip].eps;
+        auto const x_coord =
+            interpolateXCoordinate<ShapeFunctionDisplacement,
+                                   ShapeMatricesTypeDisplacement>(_element,
+                                                                  N_u);
+        auto const B =
+            LinearBMatrix::computeBMatrix<DisplacementDim,
+                                          ShapeFunctionDisplacement::NPOINTS,
+                                          typename BMatricesType::BMatrixType>(
+                dNdx_u, N_u, x_coord, _is_axially_symmetric);
+
+        eps.noalias() = B * u;
+        auto& eps_prev = _ip_data[ip].eps_prev;
+        const double dv_dt =
+            (Invariants::trace(eps) - Invariants::trace(eps_prev)) / dt;
+        local_rhs.noalias() -= alpha_b * dv_dt * N_p * w;
+    }
+    local_Jac.noalias() = laplace + mass / dt;
+
+    local_rhs.noalias() -= laplace * p + mass * p_dot;
+}
+
+template <typename ShapeFunctionDisplacement, typename ShapeFunctionPressure,
+          typename IntegrationMethod, int DisplacementDim>
+void HydroMechanicsLocalAssembler<ShapeFunctionDisplacement,
+                                  ShapeFunctionPressure, IntegrationMethod,
+                                  DisplacementDim>::
+    assembleWithJacobianForDeformationEquations(
+        const double t, const std::vector<double>& /*local_xdot*/,
+        const double /*dxdot_dx*/, const double /*dx_dx*/,
+        std::vector<double>& /*local_M_data*/,
+        std::vector<double>& /*local_K_data*/,
+        std::vector<double>& local_b_data, std::vector<double>& local_Jac_data,
+        const LocalCoupledSolutions& local_coupled_solutions)
+{
+    auto const& local_p = local_coupled_solutions.local_coupled_xs[0];
+    auto const& local_u = local_coupled_solutions.local_coupled_xs[1];
+    assert(local_p.size() == pressure_size);
+    assert(local_u.size() == displacement_size);
+
+    auto u =
+        Eigen::Map<typename ShapeMatricesTypeDisplacement::template VectorType<
+            displacement_size> const>(local_u.data(), displacement_size);
+
+    auto local_Jac = MathLib::createZeroedMatrix<
+        typename ShapeMatricesTypeDisplacement::template MatrixType<
+            displacement_size, displacement_size>>(
+        local_Jac_data, displacement_size, displacement_size);
+
+    auto local_rhs =
+        MathLib::createZeroedVector<typename ShapeMatricesTypeDisplacement::
+                                        template VectorType<displacement_size>>(
+            local_b_data, displacement_size);
+
+    double const& dt = _process_data.dt;
+
+    SpatialPosition x_position;
+    x_position.setElementID(_element.getID());
+
+    int const n_integration_points = _integration_method.getNumberOfPoints();
+    for (int ip = 0; ip < n_integration_points; ip++)
+    {
+        x_position.setIntegrationPoint(ip);
+        auto const& w = _ip_data[ip].integration_weight;
+
+        auto const& N_u_op = _ip_data[ip].N_u_op;
+
+        auto const& N_u = _ip_data[ip].N_u;
+        auto const& dNdx_u = _ip_data[ip].dNdx_u;
+
+        auto const& N_p = _ip_data[ip].N_p;
+
+        auto const x_coord =
+            interpolateXCoordinate<ShapeFunctionDisplacement,
+                                   ShapeMatricesTypeDisplacement>(_element,
+                                                                  N_u);
+        auto const B =
+            LinearBMatrix::computeBMatrix<DisplacementDim,
+                                          ShapeFunctionDisplacement::NPOINTS,
+                                          typename BMatricesType::BMatrixType>(
+                dNdx_u, N_u, x_coord, _is_axially_symmetric);
+
+        auto& eps = _ip_data[ip].eps;
+        auto const& sigma_eff = _ip_data[ip].sigma_eff;
+
+        auto const alpha = _process_data.biot_coefficient(t, x_position)[0];
+        auto const rho_sr = _process_data.solid_density(t, x_position)[0];
+        auto const rho_fr = _process_data.fluid_density(t, x_position)[0];
+        auto const porosity = _process_data.porosity(t, x_position)[0];
+        auto const& b = _process_data.specific_body_force;
+        auto const& identity2 = MaterialLib::SolidModels::Invariants<
+            KelvinVectorDimensions<DisplacementDim>::value>::identity2;
+
+        eps.noalias() = B * u;
+
+        auto C = _ip_data[ip].updateConstitutiveRelation(t, x_position, dt, u);
+
+        local_Jac.noalias() += B.transpose() * C * B * w;
+
+        double p_at_xi = 0.;
+        NumLib::shapeFunctionInterpolate(local_p, N_p, p_at_xi);
+
+        double const rho = rho_sr * (1 - porosity) + porosity * rho_fr;
+        local_rhs.noalias() -=
+            (B.transpose() * (sigma_eff - alpha * identity2 * p_at_xi) -
+             N_u_op.transpose() * rho * b) *
+            w;
+    }
+}
+
+template <typename ShapeFunctionDisplacement, typename ShapeFunctionPressure,
+          typename IntegrationMethod, int DisplacementDim>
+void HydroMechanicsLocalAssembler<ShapeFunctionDisplacement,
+                                  ShapeFunctionPressure, IntegrationMethod,
+                                  DisplacementDim>::
+    assembleWithJacobianForStaggeredScheme(
+        const double t,
+        const std::vector<double>& local_xdot,
+        const double dxdot_dx,
+        const double dx_dx,
+        std::vector<double>& local_M_data,
+        std::vector<double>& local_K_data,
+        std::vector<double>& local_b_data,
+        std::vector<double>& local_Jac_data,
+        const LocalCoupledSolutions& local_coupled_solutions)
+{
+    // For the equations with pressure
+    if (local_coupled_solutions.process_id == 0)
+    {
+        assembleWithJacobianForPressureEquations(
+            t, local_xdot, dxdot_dx, dx_dx, local_M_data, local_K_data,
+            local_b_data, local_Jac_data, local_coupled_solutions);
+        return;
+    }
+
+    // For the equations with deformation
+    assembleWithJacobianForDeformationEquations(
+        t, local_xdot, dxdot_dx, dx_dx, local_M_data, local_K_data,
+        local_b_data, local_Jac_data, local_coupled_solutions);
+}
+
+template <typename ShapeFunctionDisplacement, typename ShapeFunctionPressure,
+          typename IntegrationMethod, int DisplacementDim>
+void HydroMechanicsLocalAssembler<ShapeFunctionDisplacement,
+                                  ShapeFunctionPressure, IntegrationMethod,
+                                  DisplacementDim>::
+    postNonLinearSolverConcrete(std::vector<double> const& local_x,
+                                double const t,
+                                bool const use_monolithic_scheme)
+{
+    const int displacement_offset =
+        use_monolithic_scheme ? displacement_index : 0;
+
+    auto u =
+        Eigen::Map<typename ShapeMatricesTypeDisplacement::template VectorType<
+            displacement_size> const>(local_x.data() + displacement_offset,
+                                      displacement_size);
+    double const& dt = _process_data.dt;
+    SpatialPosition x_position;
+    x_position.setElementID(_element.getID());
+
+    int const n_integration_points = _integration_method.getNumberOfPoints();
+    for (int ip = 0; ip < n_integration_points; ip++)
+    {
+        x_position.setIntegrationPoint(ip);
+        auto const& N_u = _ip_data[ip].N_u;
+        auto const& dNdx_u = _ip_data[ip].dNdx_u;
+
+        auto const x_coord =
+            interpolateXCoordinate<ShapeFunctionDisplacement,
+                                   ShapeMatricesTypeDisplacement>(_element,
+                                                                  N_u);
+        auto const B =
+            LinearBMatrix::computeBMatrix<DisplacementDim,
+                                          ShapeFunctionDisplacement::NPOINTS,
+                                          typename BMatricesType::BMatrixType>(
+                dNdx_u, N_u, x_coord, _is_axially_symmetric);
+
+        auto& eps = _ip_data[ip].eps;
+        eps.noalias() = B * u;
+
+        _ip_data[ip].updateConstitutiveRelation(t, x_position, dt, u);
+    }
+}
+
+}  // namespace HydroMechanics
+}  // namespace ProcessLib
diff --git a/ProcessLib/HydroMechanics/HydroMechanicsFEM.h b/ProcessLib/HydroMechanics/HydroMechanicsFEM.h
index 2958cb4b2fd6ad6bd00ae9ccd5d07c8f0c9f25de..5ee1922dc08ef3ca2c5e3b46a61a6c7b55dd40bf 100644
--- a/ProcessLib/HydroMechanics/HydroMechanicsFEM.h
+++ b/ProcessLib/HydroMechanics/HydroMechanicsFEM.h
@@ -110,6 +110,10 @@ public:
     using ShapeMatricesTypePressure =
         ShapeMatrixPolicyType<ShapeFunctionPressure, DisplacementDim>;
 
+    static int const KelvinVectorSize =
+        ProcessLib::KelvinVectorDimensions<DisplacementDim>::value;
+    using Invariants = MaterialLib::SolidModels::Invariants<KelvinVectorSize>;
+
     HydroMechanicsLocalAssembler(HydroMechanicsLocalAssembler const&) = delete;
     HydroMechanicsLocalAssembler(HydroMechanicsLocalAssembler&&) = delete;
 
@@ -118,65 +122,7 @@ public:
         std::size_t const /*local_matrix_size*/,
         bool const is_axially_symmetric,
         unsigned const integration_order,
-        HydroMechanicsProcessData<DisplacementDim>& process_data)
-        : _process_data(process_data),
-          _integration_method(integration_order),
-          _element(e),
-          _is_axially_symmetric(is_axially_symmetric)
-    {
-        unsigned const n_integration_points =
-            _integration_method.getNumberOfPoints();
-
-        _ip_data.reserve(n_integration_points);
-        _secondary_data.N_u.resize(n_integration_points);
-
-        auto const shape_matrices_u =
-            initShapeMatrices<ShapeFunctionDisplacement,
-                              ShapeMatricesTypeDisplacement, IntegrationMethod,
-                              DisplacementDim>(e, is_axially_symmetric,
-                                               _integration_method);
-
-        auto const shape_matrices_p =
-            initShapeMatrices<ShapeFunctionPressure, ShapeMatricesTypePressure,
-                              IntegrationMethod, DisplacementDim>(
-                e, is_axially_symmetric, _integration_method);
-
-        for (unsigned ip = 0; ip < n_integration_points; ip++)
-        {
-            // displacement (subscript u)
-            _ip_data.emplace_back(*_process_data.material);
-            auto& ip_data = _ip_data[ip];
-            auto const& sm_u = shape_matrices_u[ip];
-            _ip_data[ip].integration_weight =
-                _integration_method.getWeightedPoint(ip).getWeight() *
-                sm_u.integralMeasure * sm_u.detJ;
-
-            // Initialize current time step values
-            ip_data.sigma_eff.setZero(kelvin_vector_size);
-            ip_data.eps.setZero(kelvin_vector_size);
-
-            // Previous time step values are not initialized and are set later.
-            ip_data.eps_prev.resize(kelvin_vector_size);
-            ip_data.sigma_eff_prev.resize(kelvin_vector_size);
-
-            ip_data.N_u_op = ShapeMatricesTypeDisplacement::template MatrixType<
-                DisplacementDim, displacement_size>::Zero(DisplacementDim,
-                                                          displacement_size);
-            for (int i = 0; i < DisplacementDim; ++i)
-                ip_data.N_u_op
-                    .template block<1, displacement_size / DisplacementDim>(
-                        i, i * displacement_size / DisplacementDim)
-                    .noalias() = sm_u.N;
-
-            ip_data.N_u = sm_u.N;
-            ip_data.dNdx_u = sm_u.dNdx;
-
-            ip_data.N_p = shape_matrices_p[ip].N;
-            ip_data.dNdx_p = shape_matrices_p[ip].dNdx;
-
-            _secondary_data.N_u[ip] = shape_matrices_u[ip].N;
-        }
-    }
+        HydroMechanicsProcessData<DisplacementDim>& process_data);
 
     void assemble(double const /*t*/, std::vector<double> const& /*local_x*/,
                   std::vector<double>& /*local_M_data*/,
@@ -195,162 +141,14 @@ public:
                               std::vector<double>& /*local_M_data*/,
                               std::vector<double>& /*local_K_data*/,
                               std::vector<double>& local_rhs_data,
-                              std::vector<double>& local_Jac_data) override
-    {
-        assert(local_x.size() == pressure_size + displacement_size);
-
-        auto p =
-            Eigen::Map<typename ShapeMatricesTypePressure::template VectorType<
-                pressure_size> const>(local_x.data() + pressure_index,
-                                      pressure_size);
-
-        auto u = Eigen::Map<typename ShapeMatricesTypeDisplacement::
-                                template VectorType<displacement_size> const>(
-            local_x.data() + displacement_index, displacement_size);
-
-        auto p_dot =
-            Eigen::Map<typename ShapeMatricesTypePressure::template VectorType<
-                pressure_size> const>(local_xdot.data() + pressure_index,
-                                      pressure_size);
-        auto u_dot =
-            Eigen::Map<typename ShapeMatricesTypeDisplacement::
-                           template VectorType<displacement_size> const>(
-                local_xdot.data() + displacement_index, displacement_size);
-
-        auto local_Jac = MathLib::createZeroedMatrix<
-            typename ShapeMatricesTypeDisplacement::template MatrixType<
-                displacement_size + pressure_size,
-                displacement_size + pressure_size>>(
-            local_Jac_data, displacement_size + pressure_size,
-            displacement_size + pressure_size);
-
-        auto local_rhs = MathLib::createZeroedVector<
-            typename ShapeMatricesTypeDisplacement::template VectorType<
-                displacement_size + pressure_size>>(
-            local_rhs_data, displacement_size + pressure_size);
-
-        typename ShapeMatricesTypePressure::NodalMatrixType laplace_p =
-            ShapeMatricesTypePressure::NodalMatrixType::Zero(pressure_size,
-                                                             pressure_size);
-
-        typename ShapeMatricesTypePressure::NodalMatrixType storage_p =
-            ShapeMatricesTypePressure::NodalMatrixType::Zero(pressure_size,
-                                                             pressure_size);
-
-        typename ShapeMatricesTypeDisplacement::template MatrixType<
-            displacement_size, pressure_size>
-            Kup = ShapeMatricesTypeDisplacement::template MatrixType<
-                displacement_size, pressure_size>::Zero(displacement_size,
-                                                        pressure_size);
-
-        double const& dt = _process_data.dt;
-
-        SpatialPosition x_position;
-        x_position.setElementID(_element.getID());
+                              std::vector<double>& local_Jac_data) override;
 
-        unsigned const n_integration_points =
-            _integration_method.getNumberOfPoints();
-        for (unsigned ip = 0; ip < n_integration_points; ip++)
-        {
-            x_position.setIntegrationPoint(ip);
-            auto const& w = _ip_data[ip].integration_weight;
-
-            auto const& N_u_op = _ip_data[ip].N_u_op;
-
-            auto const& N_u = _ip_data[ip].N_u;
-            auto const& dNdx_u = _ip_data[ip].dNdx_u;
-
-            auto const& N_p = _ip_data[ip].N_p;
-            auto const& dNdx_p = _ip_data[ip].dNdx_p;
-
-            auto const x_coord =
-                interpolateXCoordinate<ShapeFunctionDisplacement,
-                                       ShapeMatricesTypeDisplacement>(_element,
-                                                                      N_u);
-            auto const B = LinearBMatrix::computeBMatrix<
-                DisplacementDim, ShapeFunctionDisplacement::NPOINTS,
-                typename BMatricesType::BMatrixType>(dNdx_u, N_u, x_coord,
-                                                     _is_axially_symmetric);
-
-            auto& eps = _ip_data[ip].eps;
-            auto const& sigma_eff = _ip_data[ip].sigma_eff;
-
-            double const S = _process_data.specific_storage(t, x_position)[0];
-            double const K_over_mu =
-                _process_data.intrinsic_permeability(t, x_position)[0] /
-                _process_data.fluid_viscosity(t, x_position)[0];
-            auto const alpha = _process_data.biot_coefficient(t, x_position)[0];
-            auto const rho_sr = _process_data.solid_density(t, x_position)[0];
-            auto const rho_fr = _process_data.fluid_density(t, x_position)[0];
-            auto const porosity = _process_data.porosity(t, x_position)[0];
-            auto const& b = _process_data.specific_body_force;
-            auto const& identity2 = MaterialLib::SolidModels::Invariants<
-                KelvinVectorDimensions<DisplacementDim>::value>::identity2;
-
-            //
-            // displacement equation, displacement part
-            //
-            eps.noalias() = B * u;
-
-            auto C =
-                _ip_data[ip].updateConstitutiveRelation(t, x_position, dt, u);
-
-            local_Jac
-                .template block<displacement_size, displacement_size>(
-                    displacement_index, displacement_index)
-                .noalias() += B.transpose() * C * B * w;
-
-            double const rho = rho_sr * (1 - porosity) + porosity * rho_fr;
-            local_rhs.template segment<displacement_size>(displacement_index)
-                .noalias() -=
-                (B.transpose() * sigma_eff - N_u_op.transpose() * rho * b) * w;
-
-            //
-            // displacement equation, pressure part
-            //
-            Kup.noalias() += B.transpose() * alpha * identity2 * N_p * w;
-
-            //
-            // pressure equation, pressure part.
-            //
-            laplace_p.noalias() += dNdx_p.transpose() * K_over_mu * dNdx_p * w;
-
-            storage_p.noalias() += N_p.transpose() * S * N_p * w;
-
-            local_rhs.template segment<pressure_size>(pressure_index)
-                .noalias() += dNdx_p.transpose() * rho_fr * K_over_mu * b * w;
-
-            //
-            // pressure equation, displacement part.
-            //
-            // Reusing Kup.transpose().
-        }
-        // displacement equation, pressure part
-        local_Jac
-            .template block<displacement_size, pressure_size>(
-                displacement_index, pressure_index)
-            .noalias() = -Kup;
-
-        // pressure equation, pressure part.
-        local_Jac
-            .template block<pressure_size, pressure_size>(pressure_index,
-                                                          pressure_index)
-            .noalias() = laplace_p + storage_p / dt;
-
-        // pressure equation, displacement part.
-        local_Jac
-            .template block<pressure_size, displacement_size>(
-                pressure_index, displacement_index)
-            .noalias() = Kup.transpose() / dt;
-
-        // pressure equation
-        local_rhs.template segment<pressure_size>(pressure_index).noalias() -=
-            laplace_p * p + storage_p * p_dot + Kup.transpose() * u_dot;
-
-        // displacement equation
-        local_rhs.template segment<displacement_size>(displacement_index)
-            .noalias() += Kup * p;
-    }
+    void assembleWithJacobianForStaggeredScheme(
+        double const t, std::vector<double> const& local_xdot,
+        const double dxdot_dx, const double dx_dx,
+        std::vector<double>& local_M_data, std::vector<double>& local_K_data,
+        std::vector<double>& local_b_data, std::vector<double>& local_Jac_data,
+        LocalCoupledSolutions const& local_coupled_solutions) override;
 
     void preTimestepConcrete(std::vector<double> const& /*local_x*/,
                              double const /*t*/,
@@ -365,6 +163,10 @@ public:
         }
     }
 
+    void postNonLinearSolverConcrete(std::vector<double> const& local_x,
+                                     double const t,
+                                     bool const use_monolithic_scheme) override;
+
     Eigen::Map<const Eigen::RowVectorXd> getShapeMatrix(
         const unsigned integration_point) const override
     {
@@ -490,49 +292,7 @@ public:
         const double t,
         GlobalVector const& current_solution,
         NumLib::LocalToGlobalIndexMap const& dof_table,
-        std::vector<double>& cache) const override
-    {
-        auto const num_intpts = _ip_data.size();
-
-        auto const indices = NumLib::getIndices(_element.getID(), dof_table);
-        assert(!indices.empty());
-        auto const local_x = current_solution.get(indices);
-
-        cache.clear();
-        auto cache_matrix = MathLib::createZeroedMatrix<Eigen::Matrix<
-            double, DisplacementDim, Eigen::Dynamic, Eigen::RowMajor>>(
-            cache, DisplacementDim, num_intpts);
-
-        SpatialPosition pos;
-        pos.setElementID(_element.getID());
-
-        auto p =
-            Eigen::Map<typename ShapeMatricesTypePressure::template VectorType<
-                pressure_size> const>(local_x.data() + pressure_index,
-                                      pressure_size);
-
-        unsigned const n_integration_points =
-            _integration_method.getNumberOfPoints();
-
-        SpatialPosition x_position;
-        x_position.setElementID(_element.getID());
-        for (unsigned ip = 0; ip < n_integration_points; ip++) {
-            x_position.setIntegrationPoint(ip);
-            double const K_over_mu =
-                _process_data.intrinsic_permeability(t, x_position)[0] /
-                _process_data.fluid_viscosity(t, x_position)[0];
-
-            auto const rho_fr = _process_data.fluid_density(t, x_position)[0];
-            auto const& b = _process_data.specific_body_force;
-
-            // Compute the velocity
-            auto const& dNdx_p = _ip_data[ip].dNdx_p;
-            cache_matrix.col(ip).noalias() =
-                -K_over_mu * dNdx_p * p - K_over_mu * rho_fr * b;
-        }
-
-        return cache;
-    }
+        std::vector<double>& cache) const override;
 
 private:
     std::vector<double> const& getIntPtSigma(std::vector<double>& cache,
@@ -566,6 +326,73 @@ private:
         return cache;
     }
 
+    /**
+     * Assemble local matrices and vectors arise from the linearized discretized
+     * weak form of the residual of the momentum balance equation,
+     *      \f[
+     *            \nabla (\sigma - \alpha_b p \mathrm{I}) = f
+     *      \f]
+     * where \f$ \sigma\f$ is the effective stress tensor, \f$p\f$ is the pore
+     * pressure, \f$\alpha_b\f$ is the Biot constant, \f$\mathrm{I}\f$ is the
+     * identity tensor, and \f$f\f$ is the body force.
+     *
+     * @param t               Time
+     * @param local_xdot      Nodal values of \f$\dot{x}\f$ of an element.
+     * @param dxdot_dx        Value of \f$\dot{x} \cdot dx\f$.
+     * @param dx_dx           Value of \f$ x \cdot dx\f$.
+     * @param local_M_data    Mass matrix of an element, which takes the form of
+     *                        \f$ \inta N^T N\mathrm{d}\Omega\f$. Not used.
+     * @param local_K_data    Laplacian matrix of an element, which takes the
+     *         form of \f$ \int (\nabla N)^T K \nabla N\mathrm{d}\Omega\f$.
+     *                        Not used.
+     * @param local_b_data    Right hand side vector of an element.
+     * @param local_Jac_data  Element Jacobian matrix for the Newton-Raphson
+     *                        method.
+     * @param local_coupled_solutions Nodal values of solutions of the coupled
+     *                                processes of an element.
+     */
+    void assembleWithJacobianForDeformationEquations(
+        double const t, std::vector<double> const& local_xdot,
+        const double dxdot_dx, const double dx_dx,
+        std::vector<double>& local_M_data, std::vector<double>& local_K_data,
+        std::vector<double>& local_b_data, std::vector<double>& local_Jac_data,
+        LocalCoupledSolutions const& local_coupled_solutions);
+
+    /**
+     * Assemble local matrices and vectors arise from the linearized discretized
+     * weak form of the residual of the mass balance equation of single phase
+     * flow,
+     *      \f[
+     *          \alpha \cdot{p} - \nabla (K (\nabla p + \rho g \nabla z) +
+     *          \alpha_b \nabla \cdot \dot{u}  = Q
+     *      \f]
+     * where \f$ alpha\f$ is a coefficient may depend on storage or the fluid
+     * density change, \f$ \rho\f$ is the fluid density, \f$\g\f$ is the
+     * gravitational acceleration, \f$z\f$ is the vertical coordinate, \f$u\f$
+     * is the displacement, and \f$Q\f$ is the source/sink term.
+     *
+     * @param t               Time
+     * @param local_xdot      Nodal values of \f$\dot{x}\f$ of an element.
+     * @param dxdot_dx        Value of \f$\dot{x} \cdot dx\f$.
+     * @param dx_dx           Value of \f$ x \cdot dx\f$.
+     * @param local_M_data    Mass matrix of an element, which takes the form of
+     *                        \f$ \inta N^T N\mathrm{d}\Omega\f$. Not used.
+     * @param local_K_data    Laplacian matrix of an element, which takes the
+     *         form of \f$ \int (\nabla N)^T K \nabla N\mathrm{d}\Omega\f$.
+     *                        Not used.
+     * @param local_b_data    Right hand side vector of an element.
+     * @param local_Jac_data  Element Jacobian matrix for the Newton-Raphson
+     *                        method.
+     * @param local_coupled_solutions Nodal values of solutions of the coupled
+     *                                processes of an element.
+     */
+    void assembleWithJacobianForPressureEquations(
+        double const t, std::vector<double> const& local_xdot,
+        const double dxdot_dx, const double dx_dx,
+        std::vector<double>& local_M_data, std::vector<double>& local_K_data,
+        std::vector<double>& local_b_data, std::vector<double>& local_Jac_data,
+        LocalCoupledSolutions const& local_coupled_solutions);
+
 private:
     HydroMechanicsProcessData<DisplacementDim>& _process_data;
 
@@ -595,3 +422,5 @@ private:
 
 }  // namespace HydroMechanics
 }  // namespace ProcessLib
+
+#include "HydroMechanicsFEM-impl.h"
diff --git a/ProcessLib/HydroMechanics/HydroMechanicsProcess-impl.h b/ProcessLib/HydroMechanics/HydroMechanicsProcess-impl.h
index 963cb487485d0f2dae03e589730b5a4f18f0db05..14aec3eaf0605531b4420ecd885366084a7d051b 100644
--- a/ProcessLib/HydroMechanics/HydroMechanicsProcess-impl.h
+++ b/ProcessLib/HydroMechanics/HydroMechanicsProcess-impl.h
@@ -12,11 +12,13 @@
 #include <cassert>
 
 #include "MeshLib/Elements/Utils.h"
+#include "NumLib/DOF/ComputeSparsityPattern.h"
+#include "ProcessLib/HydroMechanics/CreateLocalAssemblers.h"
 #include "ProcessLib/Process.h"
 
-#include "CreateLocalAssemblers.h"
 #include "HydroMechanicsFEM.h"
 #include "HydroMechanicsProcessData.h"
+#include "HydroMechanicsProcess.h"
 
 namespace ProcessLib
 {
@@ -48,6 +50,26 @@ bool HydroMechanicsProcess<DisplacementDim>::isLinear() const
     return false;
 }
 
+template <int DisplacementDim>
+MathLib::MatrixSpecifications
+HydroMechanicsProcess<DisplacementDim>::getMatrixSpecifications(
+    const int process_id) const
+{
+    // For the monolithic scheme or the M process (deformation) in the staggered
+    // scheme.
+    if (_use_monolithic_scheme || process_id == 1)
+    {
+        auto const& l = *_local_to_global_index_map;
+        return {l.dofSizeWithoutGhosts(), l.dofSizeWithoutGhosts(),
+                &l.getGhostIndices(), &this->_sparsity_pattern};
+    }
+
+    // For staggered scheme and H process (pressure).
+    auto const& l = *_local_to_global_index_map_with_base_nodes;
+    return {l.dofSizeWithoutGhosts(), l.dofSizeWithoutGhosts(),
+            &l.getGhostIndices(), &_sparsity_pattern_with_linear_element};
+}
+
 template <int DisplacementDim>
 void HydroMechanicsProcess<DisplacementDim>::constructDofTable()
 {
@@ -59,23 +81,75 @@ void HydroMechanicsProcess<DisplacementDim>::constructDofTable()
     _mesh_subset_base_nodes =
         std::make_unique<MeshLib::MeshSubset>(_mesh, &_base_nodes);
 
-    // Collect the mesh subsets in a vector.
-
-    // For pressure, which is the first
-    std::vector<MeshLib::MeshSubsets> all_mesh_subsets;
-    all_mesh_subsets.emplace_back(_mesh_subset_base_nodes.get());
-
-    // For displacement.
-    std::generate_n(
-        std::back_inserter(all_mesh_subsets),
-        getProcessVariables()[1].get().getNumberOfComponents(),
-        [&]() { return MeshLib::MeshSubsets{_mesh_subset_all_nodes.get()}; });
-
-    std::vector<int> const vec_n_components{1, DisplacementDim};
-    _local_to_global_index_map =
+    // TODO move the two data members somewhere else.
+    // for extrapolation of secondary variables of stress or strain
+    std::vector<MeshLib::MeshSubsets> all_mesh_subsets_single_component;
+    all_mesh_subsets_single_component.emplace_back(
+        _mesh_subset_all_nodes.get());
+    _local_to_global_index_map_single_component =
         std::make_unique<NumLib::LocalToGlobalIndexMap>(
-            std::move(all_mesh_subsets), vec_n_components,
+            std::move(all_mesh_subsets_single_component),
+            // by location order is needed for output
             NumLib::ComponentOrder::BY_LOCATION);
+
+    if (_use_monolithic_scheme)
+    {
+        // For pressure, which is the first
+        std::vector<MeshLib::MeshSubsets> all_mesh_subsets;
+        all_mesh_subsets.emplace_back(_mesh_subset_base_nodes.get());
+
+        // For displacement.
+        const int monolithic_process_id = 0;
+        std::generate_n(
+            std::back_inserter(all_mesh_subsets),
+            getProcessVariables(monolithic_process_id)[1]
+                .get()
+                .getNumberOfComponents(),
+            [&]() {
+                return MeshLib::MeshSubsets{_mesh_subset_all_nodes.get()};
+            });
+
+        std::vector<int> const vec_n_components{1, DisplacementDim};
+        _local_to_global_index_map =
+            std::make_unique<NumLib::LocalToGlobalIndexMap>(
+                std::move(all_mesh_subsets), vec_n_components,
+                NumLib::ComponentOrder::BY_LOCATION);
+        assert(_local_to_global_index_map);
+    }
+    else
+    {
+        // For displacement equation.
+        const int process_id = 1;
+        std::vector<MeshLib::MeshSubsets> all_mesh_subsets;
+        std::generate_n(
+            std::back_inserter(all_mesh_subsets),
+            getProcessVariables(process_id)[0].get().getNumberOfComponents(),
+            [&]() {
+                return MeshLib::MeshSubsets{_mesh_subset_all_nodes.get()};
+            });
+
+        std::vector<int> const vec_n_components{DisplacementDim};
+        _local_to_global_index_map =
+            std::make_unique<NumLib::LocalToGlobalIndexMap>(
+                std::move(all_mesh_subsets), vec_n_components,
+                NumLib::ComponentOrder::BY_LOCATION);
+
+        // For pressure equation.
+        // Collect the mesh subsets with base nodes in a vector.
+        std::vector<MeshLib::MeshSubsets> all_mesh_subsets_base_nodes;
+        all_mesh_subsets_base_nodes.emplace_back(_mesh_subset_base_nodes.get());
+        _local_to_global_index_map_with_base_nodes =
+            std::make_unique<NumLib::LocalToGlobalIndexMap>(
+                std::move(all_mesh_subsets_base_nodes),
+                // by location order is needed for output
+                NumLib::ComponentOrder::BY_LOCATION);
+
+        _sparsity_pattern_with_linear_element = NumLib::computeSparsityPattern(
+            *_local_to_global_index_map_with_base_nodes, _mesh);
+
+        assert(_local_to_global_index_map);
+        assert(_local_to_global_index_map_with_base_nodes);
+    }
 }
 
 template <int DisplacementDim>
@@ -84,92 +158,99 @@ void HydroMechanicsProcess<DisplacementDim>::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    createLocalAssemblers<DisplacementDim, HydroMechanicsLocalAssembler>(
+    const int mechanical_process_id = _use_monolithic_scheme ? 0 : 1;
+    const int deformation_variable_id = _use_monolithic_scheme ? 1 : 0;
+    ProcessLib::HydroMechanics::createLocalAssemblers<
+        DisplacementDim, HydroMechanicsLocalAssembler>(
         mesh.getDimension(), mesh.getElements(), dof_table,
-        // use displacment process variable for shapefunction order
-        getProcessVariables()[1].get().getShapeFunctionOrder(),
+        // use displacement process variable to set shape function order
+        getProcessVariables(mechanical_process_id)[deformation_variable_id]
+            .get()
+            .getShapeFunctionOrder(),
         _local_assemblers, mesh.isAxiallySymmetric(), integration_order,
         _process_data);
 
-    // TODO move the two data members somewhere else.
-    // for extrapolation of secondary variables
-    std::vector<MeshLib::MeshSubsets> all_mesh_subsets_single_component;
-    all_mesh_subsets_single_component.emplace_back(
-        _mesh_subset_all_nodes.get());
-    _local_to_global_index_map_single_component =
-        std::make_unique<NumLib::LocalToGlobalIndexMap>(
-            std::move(all_mesh_subsets_single_component),
-            // by location order is needed for output
-            NumLib::ComponentOrder::BY_LOCATION);
-
     Base::_secondary_variables.addSecondaryVariable(
         "sigma_xx",
-        makeExtrapolator(
-            1, getExtrapolator(), _local_assemblers,
-            &LocalAssemblerInterface::getIntPtSigmaXX));
+        makeExtrapolator(1, getExtrapolator(), _local_assemblers,
+                         &LocalAssemblerInterface::getIntPtSigmaXX));
 
     Base::_secondary_variables.addSecondaryVariable(
         "sigma_yy",
-        makeExtrapolator(
-            1, getExtrapolator(), _local_assemblers,
-            &LocalAssemblerInterface::getIntPtSigmaYY));
+        makeExtrapolator(1, getExtrapolator(), _local_assemblers,
+                         &LocalAssemblerInterface::getIntPtSigmaYY));
 
     Base::_secondary_variables.addSecondaryVariable(
         "sigma_zz",
-        makeExtrapolator(
-            1, getExtrapolator(), _local_assemblers,
-            &LocalAssemblerInterface::getIntPtSigmaZZ));
+        makeExtrapolator(1, getExtrapolator(), _local_assemblers,
+                         &LocalAssemblerInterface::getIntPtSigmaZZ));
 
     Base::_secondary_variables.addSecondaryVariable(
         "sigma_xy",
-        makeExtrapolator(
-            1, getExtrapolator(), _local_assemblers,
-            &LocalAssemblerInterface::getIntPtSigmaXY));
+        makeExtrapolator(1, getExtrapolator(), _local_assemblers,
+                         &LocalAssemblerInterface::getIntPtSigmaXY));
 
     if (DisplacementDim == 3)
     {
         Base::_secondary_variables.addSecondaryVariable(
             "sigma_xz",
-            makeExtrapolator(
-                1, getExtrapolator(), _local_assemblers,
-                &LocalAssemblerInterface::getIntPtSigmaXZ));
+            makeExtrapolator(1, getExtrapolator(), _local_assemblers,
+                             &LocalAssemblerInterface::getIntPtSigmaXZ));
 
         Base::_secondary_variables.addSecondaryVariable(
             "sigma_yz",
-            makeExtrapolator(
-                1, getExtrapolator(), _local_assemblers,
-                &LocalAssemblerInterface::getIntPtSigmaYZ));
+            makeExtrapolator(1, getExtrapolator(), _local_assemblers,
+                             &LocalAssemblerInterface::getIntPtSigmaYZ));
     }
 
     Base::_secondary_variables.addSecondaryVariable(
         "epsilon_xx",
-        makeExtrapolator(
-            1, getExtrapolator(), _local_assemblers,
-            &LocalAssemblerInterface::getIntPtEpsilonXX));
+        makeExtrapolator(1, getExtrapolator(), _local_assemblers,
+                         &LocalAssemblerInterface::getIntPtEpsilonXX));
 
     Base::_secondary_variables.addSecondaryVariable(
         "epsilon_yy",
-        makeExtrapolator(
-            1, getExtrapolator(), _local_assemblers,
-            &LocalAssemblerInterface::getIntPtEpsilonYY));
+        makeExtrapolator(1, getExtrapolator(), _local_assemblers,
+                         &LocalAssemblerInterface::getIntPtEpsilonYY));
 
     Base::_secondary_variables.addSecondaryVariable(
         "epsilon_zz",
-        makeExtrapolator(
-            1, getExtrapolator(), _local_assemblers,
-            &LocalAssemblerInterface::getIntPtEpsilonZZ));
+        makeExtrapolator(1, getExtrapolator(), _local_assemblers,
+                         &LocalAssemblerInterface::getIntPtEpsilonZZ));
 
     Base::_secondary_variables.addSecondaryVariable(
         "epsilon_xy",
-        makeExtrapolator(
-            1, getExtrapolator(), _local_assemblers,
-            &LocalAssemblerInterface::getIntPtEpsilonXY));
+        makeExtrapolator(1, getExtrapolator(), _local_assemblers,
+                         &LocalAssemblerInterface::getIntPtEpsilonXY));
 
     Base::_secondary_variables.addSecondaryVariable(
         "velocity",
-        makeExtrapolator(
-            mesh.getDimension(), getExtrapolator(), _local_assemblers,
-            &LocalAssemblerInterface::getIntPtDarcyVelocity));
+        makeExtrapolator(mesh.getDimension(), getExtrapolator(),
+                         _local_assemblers,
+                         &LocalAssemblerInterface::getIntPtDarcyVelocity));
+}
+
+template <int DisplacementDim>
+void HydroMechanicsProcess<DisplacementDim>::initializeBoundaryConditions()
+{
+    if (_use_monolithic_scheme)
+    {
+        const int process_id_of_hydromechancs = 0;
+        initializeProcessBoundaryConditionsAndSourceTerms(
+            *_local_to_global_index_map, process_id_of_hydromechancs);
+        return;
+    }
+
+    // Staggered scheme:
+    // for the equations of pressure
+    const int hydraulic_process_id = 0;
+    initializeProcessBoundaryConditionsAndSourceTerms(
+        *_local_to_global_index_map_with_base_nodes, hydraulic_process_id);
+
+    // for the equations of deformation.
+    const int mechanical_process_id = 1;
+    initializeProcessBoundaryConditionsAndSourceTerms(
+        *_local_to_global_index_map, mechanical_process_id);
 }
 
 template <int DisplacementDim>
@@ -177,53 +258,126 @@ void HydroMechanicsProcess<DisplacementDim>::assembleConcreteProcess(
     const double t, GlobalVector const& x, GlobalMatrix& M, GlobalMatrix& K,
     GlobalVector& b)
 {
-    DBUG("Assemble HydroMechanicsProcess.");
+    DBUG("Assemble the equations for HydroMechanics");
+
+    // Note: This assembly function is for the Picard nonlinear solver. Since
+    // only the Newton-Raphson method is employed to simulate coupled HM
+    // processes in this class, this function is actually not used so far.
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 template <int DisplacementDim>
 void HydroMechanicsProcess<DisplacementDim>::
-    assembleWithJacobianConcreteProcess(
-        const double t, GlobalVector const& x, GlobalVector const& xdot,
-        const double dxdot_dx, const double dx_dx, GlobalMatrix& M,
-        GlobalMatrix& K, GlobalVector& b, GlobalMatrix& Jac)
+    assembleWithJacobianConcreteProcess(const double t, GlobalVector const& x,
+                                        GlobalVector const& xdot,
+                                        const double dxdot_dx,
+                                        const double dx_dx, GlobalMatrix& M,
+                                        GlobalMatrix& K, GlobalVector& b,
+                                        GlobalMatrix& Jac)
 {
-    DBUG("AssembleJacobian HydroMechanicsProcess.");
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_tables;
+    // For the monolithic scheme
+    if (_use_monolithic_scheme)
+    {
+        DBUG(
+            "Assemble the Jacobian of HydroMechanics for the monolithic"
+            " scheme.");
+        dof_tables.emplace_back(*_local_to_global_index_map);
+    }
+    else
+    {
+        // For the staggered scheme
+        if (_coupled_solutions->process_id == 0)
+        {
+            DBUG(
+                "Assemble the Jacobian equations of liquid fluid process in "
+                "HydroMechanics for the staggered scheme.");
+        }
+        else
+        {
+            DBUG(
+                "Assemble the Jacobian equations of mechanical process in "
+                "HydroMechanics for the staggered scheme.");
+        }
+        dof_tables.emplace_back(*_local_to_global_index_map_with_base_nodes);
+        dof_tables.emplace_back(*_local_to_global_index_map);
+    }
 
-    // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
-        dx_dx, M, K, b, Jac, _coupled_solutions);
+        _local_assemblers, dof_tables, t, x, xdot, dxdot_dx, dx_dx, M, K, b,
+        Jac, _coupled_solutions);
 }
+
 template <int DisplacementDim>
 void HydroMechanicsProcess<DisplacementDim>::preTimestepConcreteProcess(
     GlobalVector const& x, double const t, double const dt,
-    const int /*process_id*/)
+    const int process_id)
 {
     DBUG("PreTimestep HydroMechanicsProcess.");
 
     _process_data.dt = dt;
     _process_data.t = t;
 
-    GlobalExecutor::executeMemberOnDereferenced(
-        &LocalAssemblerInterface::preTimestep, _local_assemblers,
-        *_local_to_global_index_map, x, t, dt);
+    if (hasMechanicalProcess(process_id))
+        GlobalExecutor::executeMemberOnDereferenced(
+            &LocalAssemblerInterface::preTimestep, _local_assemblers,
+            *_local_to_global_index_map, x, t, dt);
 }
 
 template <int DisplacementDim>
 void HydroMechanicsProcess<DisplacementDim>::postTimestepConcreteProcess(
-    GlobalVector const& x)
+    GlobalVector const& x, const int process_id)
 {
     DBUG("PostTimestep HydroMechanicsProcess.");
-
     GlobalExecutor::executeMemberOnDereferenced(
         &LocalAssemblerInterface::postTimestep, _local_assemblers,
-        *_local_to_global_index_map, x);
+        getDOFTable(process_id), x);
+}
+
+template <int DisplacementDim>
+void HydroMechanicsProcess<DisplacementDim>::postNonLinearSolverConcreteProcess(
+    GlobalVector const& x, const double t, const int process_id)
+{
+    if (!hasMechanicalProcess(process_id))
+    {
+        return;
+    }
+
+    DBUG("PostNonLinearSolver HydroMechanicsProcess.");
+    // Calculate strain, stress or other internal variables of mechanics.
+    GlobalExecutor::executeMemberOnDereferenced(
+        &LocalAssemblerInterface::postNonLinearSolver, _local_assemblers,
+        getDOFTable(process_id), x, t, _use_monolithic_scheme);
+}
+
+template <int DisplacementDim>
+std::tuple<NumLib::LocalToGlobalIndexMap*, bool>
+HydroMechanicsProcess<DisplacementDim>::getDOFTableForExtrapolatorData() const
+{
+    const bool manage_storage = false;
+    return std::make_tuple(_local_to_global_index_map_single_component.get(),
+                           manage_storage);
+}
+
+template <int DisplacementDim>
+NumLib::LocalToGlobalIndexMap const&
+HydroMechanicsProcess<DisplacementDim>::getDOFTable(const int process_id) const
+{
+    if (hasMechanicalProcess(process_id))
+    {
+        return *_local_to_global_index_map;
+    }
+
+    // For the equation of pressure
+    return *_local_to_global_index_map_with_base_nodes;
 }
 
 }  // namespace HydroMechanics
diff --git a/ProcessLib/HydroMechanics/HydroMechanicsProcess.h b/ProcessLib/HydroMechanics/HydroMechanicsProcess.h
index ddb7bd27a6b53b78cd753a8283b6af8544ffd7a9..f6ce3e4ced8a2e4e53622f5ee84995c5219b16c6 100644
--- a/ProcessLib/HydroMechanics/HydroMechanicsProcess.h
+++ b/ProcessLib/HydroMechanics/HydroMechanicsProcess.h
@@ -48,6 +48,21 @@ public:
     bool isLinear() const override;
     //! @}
 
+    /**
+     * Get the size and the sparse pattern of the global matrix in order to
+     * create the global matrices and vectors for the system equations of this
+     * process.
+     *
+     * @param process_id Process ID. If the monolithic scheme is applied,
+     *                               process_id = 0. For the staggered scheme,
+     *                               process_id = 0 represents the
+     *                               hydraulic (H) process, while process_id = 1
+     *                               represents the mechanical (M) process.                         
+     * @return Matrix specifications including size and sparse pattern.
+     */
+    MathLib::MatrixSpecifications getMatrixSpecifications(
+        const int process_id) const override;
+
 private:
     void constructDofTable() override;
 
@@ -56,20 +71,29 @@ private:
         MeshLib::Mesh const& mesh,
         unsigned const integration_order) override;
 
-    void assembleConcreteProcess(
-        const double t, GlobalVector const& x, GlobalMatrix& M, GlobalMatrix& K,
-        GlobalVector& b) override;
+    void initializeBoundaryConditions() override;
+
+    void assembleConcreteProcess(const double t, GlobalVector const& x,
+                                 GlobalMatrix& M, GlobalMatrix& K,
+                                 GlobalVector& b) override;
 
     void assembleWithJacobianConcreteProcess(
         const double t, GlobalVector const& x, GlobalVector const& xdot,
         const double dxdot_dx, const double dx_dx, GlobalMatrix& M,
         GlobalMatrix& K, GlobalVector& b, GlobalMatrix& Jac) override;
 
-    void preTimestepConcreteProcess(
-        GlobalVector const& x, double const t, double const dt,
-        const int process_id) override;
+    void preTimestepConcreteProcess(GlobalVector const& x, double const t,
+                                    double const dt,
+                                    const int process_id) override;
+
+    void postTimestepConcreteProcess(GlobalVector const& x,
+                                     int const process_id) override;
 
-    void postTimestepConcreteProcess(GlobalVector const& x) override;
+    void postNonLinearSolverConcreteProcess(GlobalVector const& x, const double t,
+                                     int const process_id) override;
+
+    NumLib::LocalToGlobalIndexMap const& getDOFTable(
+        const int process_id) const override;
 
 private:
     std::vector<MeshLib::Node*> _base_nodes;
@@ -80,6 +104,32 @@ private:
 
     std::unique_ptr<NumLib::LocalToGlobalIndexMap>
         _local_to_global_index_map_single_component;
+
+    /// Local to global index mapping for base nodes, which is used for linear
+    /// interpolation for pressure in the staggered scheme.
+    std::unique_ptr<NumLib::LocalToGlobalIndexMap>
+        _local_to_global_index_map_with_base_nodes;
+
+    /// Sparsity pattern for the flow equation, and it is initialized only if
+    /// the staggered scheme is used.
+    GlobalSparsityPattern _sparsity_pattern_with_linear_element;
+
+    /// Solutions of the previous time step
+    std::array<std::unique_ptr<GlobalVector>, 2> _xs_previous_timestep;
+
+    /**
+     * @copydoc ProcessLib::Process::getDOFTableForExtrapolatorData()
+     */
+    std::tuple<NumLib::LocalToGlobalIndexMap*, bool>
+        getDOFTableForExtrapolatorData() const override;
+
+    /// Check whether the process represented by \c process_id is/has
+    /// mechanical process. In the present implementation, the mechanical
+    /// process has process_id == 1 in the staggered scheme.
+    bool hasMechanicalProcess(int const process_id) const
+    {
+        return _use_monolithic_scheme || process_id == 1;
+    }
 };
 
 extern template class HydroMechanicsProcess<2>;
diff --git a/ProcessLib/HydroMechanics/Tests.cmake b/ProcessLib/HydroMechanics/Tests.cmake
index 51fd51b4344cb30c93f7d95d5f712570e2456050..8c1a462b1bb8997c8e4979d4864af2f8ae942b17 100644
--- a/ProcessLib/HydroMechanics/Tests.cmake
+++ b/ProcessLib/HydroMechanics/Tests.cmake
@@ -1,4 +1,6 @@
 # HydroMechanics; Small deformations, linear poroelastic (HML)
+
+### With monolithic scheme
 AddTest(
     NAME HydroMechanics_HML_square_1e2_quad8_confined_compression
     PATH HydroMechanics/Linear/Confined_Compression
@@ -84,3 +86,37 @@ AddTest(
     expected_square_1e2_UC_late_pcs_0_ts_10_t_1000.000000.vtu square_1e2_UC_late_pcs_0_ts_10_t_1000.000000.vtu displacement displacement 1e-13 1e-16
     expected_square_1e2_UC_late_pcs_0_ts_10_t_1000.000000.vtu square_1e2_UC_late_pcs_0_ts_10_t_1000.000000.vtu pressure pressure 1e-13 1e-16
 )
+
+## Test as the reference of InjectionProduction1D
+AddTest(
+    NAME MonolithicInjectionProduction1D
+    PATH HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme
+    EXECUTABLE ogs
+    EXECUTABLE_ARGS InjectionProduction1DMono.prj
+    WRAPPER time
+    TESTER vtkdiff
+    REQUIREMENTS NOT OGS_USE_MPI
+    DIFF_DATA
+    InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu displacement displacement 1e-11 1e-11
+    InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu pressure pressure 1e-11 1e-11
+    InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu velocity velocity 1e-11 1e-11
+    InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu epsilon_yy epsilon_yy 1e-11 1e-11
+    InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu sigma_yy sigma_yy 1e-11 1e-11
+)
+
+### With staggered scheme
+AddTest(
+    NAME StaggeredInjectionProduction1D
+    PATH HydroMechanics/StaggeredScheme/InjectionProduction1D
+    EXECUTABLE ogs
+    EXECUTABLE_ARGS InjectionProduction1D.prj
+    WRAPPER time
+    TESTER vtkdiff
+    REQUIREMENTS NOT OGS_USE_MPI
+    DIFF_DATA
+    InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu displacement displacement 1e-11 1e-11
+    InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu pressure pressure 1e-11 1e-11
+    InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu velocity velocity 1e-11 1e-11
+    InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu epsilon_yy epsilon_yy 1e-11 1e-11
+    InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu sigma_yy sigma_yy 1e-11 1e-11
+)
diff --git a/ProcessLib/LIE/HydroMechanics/HydroMechanicsProcess.cpp b/ProcessLib/LIE/HydroMechanics/HydroMechanicsProcess.cpp
index 9e79e533a5de37aa84fc0b7b1c08f8402f35ccfe..46c456ca7556fe577ddf8089968e2a8b3147b6d7 100644
--- a/ProcessLib/LIE/HydroMechanics/HydroMechanicsProcess.cpp
+++ b/ProcessLib/LIE/HydroMechanics/HydroMechanicsProcess.cpp
@@ -81,7 +81,8 @@ HydroMechanicsProcess<GlobalDim>::HydroMechanicsProcess(
     }
 
     // need to use a custom Neumann BC assembler for displacement jumps
-    for (ProcessVariable& pv : getProcessVariables())
+    const int monolithic_process_id = 0;
+    for (ProcessVariable& pv : getProcessVariables(monolithic_process_id))
     {
         if (pv.getName().find("displacement_jump") == std::string::npos)
             continue;
@@ -111,7 +112,9 @@ HydroMechanicsProcess<GlobalDim>::HydroMechanicsProcess(
             std::make_unique<MeshLib::ElementStatus>(&mesh,
                                                      vec_p_inactive_matIDs);
 
-        ProcessVariable const& pv_p = getProcessVariables()[0];
+        const int monolithic_process_id = 0;
+        ProcessVariable const& pv_p =
+            getProcessVariables(monolithic_process_id)[0];
         _process_data.p0 = &pv_p.getInitialCondition();
     }
 }
@@ -192,15 +195,17 @@ void HydroMechanicsProcess<GlobalDim>::initializeConcreteProcess(
 {
     assert(mesh.getDimension() == GlobalDim);
     INFO("[LIE/HM] creating local assemblers");
+    const int monolithic_process_id = 0;
     ProcessLib::LIE::HydroMechanics::createLocalAssemblers<
         GlobalDim, HydroMechanicsLocalAssemblerMatrix,
         HydroMechanicsLocalAssemblerMatrixNearFracture,
         HydroMechanicsLocalAssemblerFracture>(
         mesh.getElements(), dof_table,
         // use displacment process variable for shapefunction order
-        getProcessVariables()[1].get().getShapeFunctionOrder(),
-        _local_assemblers, mesh.isAxiallySymmetric(), integration_order,
-        _process_data);
+        getProcessVariables(
+            monolithic_process_id)[1].get().getShapeFunctionOrder(),
+            _local_assemblers, mesh.isAxiallySymmetric(), integration_order,
+            _process_data);
 
     auto mesh_prop_sigma_xx = MeshLib::getOrCreateMeshProperty<double>(
         const_cast<MeshLib::Mesh&>(mesh), "stress_xx",
@@ -420,11 +425,16 @@ void HydroMechanicsProcess<GlobalDim>::computeSecondaryVariableConcrete(
     {
         int global_component_offset_next = 0;
         int global_component_offset = 0;
+
+        const int monolithic_process_id = 0;
         for (int variable_id = 0;
-             variable_id < static_cast<int>(this->getProcessVariables().size());
+             variable_id <
+             static_cast<int>(this->getProcessVariables(
+                              monolithic_process_id).size());
              ++variable_id)
         {
-            ProcessVariable& pv = this->getProcessVariables()[variable_id];
+            ProcessVariable& pv =
+                this->getProcessVariables(monolithic_process_id)[variable_id];
             int const n_components = pv.getNumberOfComponents();
             global_component_offset = global_component_offset_next;
             global_component_offset_next += n_components;
@@ -439,7 +449,9 @@ void HydroMechanicsProcess<GlobalDim>::computeSecondaryVariableConcrete(
 
     MathLib::LinAlg::setLocalAccessibleVector(x);
 
-    ProcessVariable& pv_g = this->getProcessVariables()[g_variable_id];
+    const int monolithic_process_id = 0;
+    ProcessVariable& pv_g =
+        this->getProcessVariables(monolithic_process_id)[g_variable_id];
     auto& mesh_prop_g = pv_g.getOrCreateMeshProperty();
     auto const num_comp = pv_g.getNumberOfComponents();
     for (int component_id = 0; component_id < num_comp; ++component_id)
@@ -499,10 +511,12 @@ void HydroMechanicsProcess<GlobalDim>::assembleConcreteProcess(
 {
     DBUG("Assemble HydroMechanicsProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 template <int GlobalDim>
@@ -514,9 +528,11 @@ void HydroMechanicsProcess<GlobalDim>::assembleWithJacobianConcreteProcess(
     DBUG("AssembleWithJacobian HydroMechanicsProcess.");
 
     // Call global assembler for each local assembly item.
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
diff --git a/ProcessLib/LIE/SmallDeformation/SmallDeformationProcess.cpp b/ProcessLib/LIE/SmallDeformation/SmallDeformationProcess.cpp
index 53ad1fe0846f9b261c80fff4e21d8d8f61e7ac37..0a570a64bab61c19c22e9c99a1bd17e5e1b48230 100644
--- a/ProcessLib/LIE/SmallDeformation/SmallDeformationProcess.cpp
+++ b/ProcessLib/LIE/SmallDeformation/SmallDeformationProcess.cpp
@@ -99,7 +99,8 @@ SmallDeformationProcess<DisplacementDim>::SmallDeformationProcess(
 
     // need to use a custom Neumann BC assembler for displacement jumps
     int pv_disp_jump_id = 0;
-    for (ProcessVariable& pv : getProcessVariables())
+    const int process_id = 0;
+    for (ProcessVariable& pv : getProcessVariables(process_id))
     {
         if (pv.getName().find("displacement_jump") == std::string::npos)
         {
@@ -394,7 +395,7 @@ void SmallDeformationProcess<DisplacementDim>::initializeConcreteProcess(
 
 template <int DisplacementDim>
 void SmallDeformationProcess<DisplacementDim>::postTimestepConcreteProcess(
-    GlobalVector const& x)
+    GlobalVector const& x, int const /*process_id*/)
 {
     DBUG("PostTimestep SmallDeformationProcess.");
 
@@ -416,10 +417,12 @@ void SmallDeformationProcess<DisplacementDim>::assembleConcreteProcess(
 {
     DBUG("Assemble SmallDeformationProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+        dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 template <int DisplacementDim>
 void SmallDeformationProcess<DisplacementDim>::
@@ -433,9 +436,11 @@ void SmallDeformationProcess<DisplacementDim>::
     DBUG("AssembleWithJacobian SmallDeformationProcess.");
 
     // Call global assembler for each local assembly item.
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 template <int DisplacementDim>
diff --git a/ProcessLib/LIE/SmallDeformation/SmallDeformationProcess.h b/ProcessLib/LIE/SmallDeformation/SmallDeformationProcess.h
index 3245f76ec3853ccfc56f30c1b434f2a03c31ad1e..d7325fc331d7d2326c66cb33d4401a9f7600fc9d 100644
--- a/ProcessLib/LIE/SmallDeformation/SmallDeformationProcess.h
+++ b/ProcessLib/LIE/SmallDeformation/SmallDeformationProcess.h
@@ -70,7 +70,8 @@ private:
                                     double const dt,
                                     const int /*process_id*/) override;
 
-    void postTimestepConcreteProcess(GlobalVector const& x) override;
+    void postTimestepConcreteProcess(GlobalVector const& x,
+                                     int const process_id) override;
 
 private:
     SmallDeformationProcessData<DisplacementDim> _process_data;
diff --git a/ProcessLib/LiquidFlow/LiquidFlowProcess.cpp b/ProcessLib/LiquidFlow/LiquidFlowProcess.cpp
index 1052c50eac0443c042eb0c39225579ac3c2990ff..8c04ad6d3c694ddf6aed3ec7fffc903b057f01c4 100644
--- a/ProcessLib/LiquidFlow/LiquidFlowProcess.cpp
+++ b/ProcessLib/LiquidFlow/LiquidFlowProcess.cpp
@@ -62,7 +62,8 @@ void LiquidFlowProcess::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    const int process_id = 0;
+    ProcessLib::ProcessVariable const& pv = getProcessVariables(process_id)[0];
     ProcessLib::createLocalAssemblers<LiquidFlowLocalAssembler>(
         mesh.getDimension(), mesh.getElements(), dof_table,
         pv.getShapeFunctionOrder(), _local_assemblers,
@@ -84,10 +85,13 @@ void LiquidFlowProcess::assembleConcreteProcess(const double t,
                                                 GlobalVector& b)
 {
     DBUG("Assemble LiquidFlowProcess.");
+
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 void LiquidFlowProcess::assembleWithJacobianConcreteProcess(
@@ -97,10 +101,12 @@ void LiquidFlowProcess::assembleWithJacobianConcreteProcess(
 {
     DBUG("AssembleWithJacobian LiquidFlowProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
diff --git a/ProcessLib/LocalAssemblerInterface.cpp b/ProcessLib/LocalAssemblerInterface.cpp
index b039bae02c7e53c588e1f875a0bc7a4e18a027a3..b23bf60c6b31b48f4b2c2280e68fa215f9829ce0 100644
--- a/ProcessLib/LocalAssemblerInterface.cpp
+++ b/ProcessLib/LocalAssemblerInterface.cpp
@@ -25,7 +25,7 @@ void LocalAssemblerInterface::assemble(double const /*t*/,
         "The assemble() function is not implemented in the local assembler.");
 }
 
-void LocalAssemblerInterface::assembleWithCoupledTerm(
+void LocalAssemblerInterface::assembleForStaggeredScheme(
     double const /*t*/,
     std::vector<double>& /*local_M_data*/,
     std::vector<double>& /*local_K_data*/,
@@ -33,7 +33,7 @@ void LocalAssemblerInterface::assembleWithCoupledTerm(
     LocalCoupledSolutions const& /*coupled_solutions*/)
 {
     OGS_FATAL(
-        "The assembleWithCoupledTerm() function is not implemented in the "
+        "The assembleForStaggeredScheme() function is not implemented in the "
         "local assembler.");
 }
 
@@ -50,7 +50,7 @@ void LocalAssemblerInterface::assembleWithJacobian(
         "assembler.");
 }
 
-void LocalAssemblerInterface::assembleWithJacobianAndCoupling(
+void LocalAssemblerInterface::assembleWithJacobianForStaggeredScheme(
     double const /*t*/, std::vector<double> const& /*local_xdot*/,
     const double /*dxdot_dx*/, const double /*dx_dx*/,
     std::vector<double>& /*local_M_data*/,
@@ -60,7 +60,7 @@ void LocalAssemblerInterface::assembleWithJacobianAndCoupling(
     LocalCoupledSolutions const& /*local_coupled_solutions*/)
 {
     OGS_FATAL(
-        "The assembleWithJacobianAndCoupling() function is not implemented in"
+        "The assembleWithJacobianForStaggeredScheme() function is not implemented in"
         " the local assembler.");
 }
 
@@ -71,17 +71,11 @@ void LocalAssemblerInterface::computeSecondaryVariable(
 {
     auto const indices = NumLib::getIndices(mesh_item_id, dof_table);
 
-    if (coupled_xs == nullptr)
-    {
-        auto const local_x = x.get(indices);
-        computeSecondaryVariableConcrete(t, local_x);
-    }
-    else
-    {
-        auto const local_coupled_xs =
-            getCurrentLocalSolutions(*coupled_xs, indices);
-        computeSecondaryVariableWithCoupledProcessConcrete(t, local_coupled_xs);
-    }
+    if (coupled_xs != nullptr)
+        return;
+
+    auto const local_x = x.get(indices);
+    computeSecondaryVariableConcrete(t, local_x);
 }
 
 void LocalAssemblerInterface::preTimestep(
@@ -106,4 +100,15 @@ void LocalAssemblerInterface::postTimestep(
     postTimestepConcrete(local_x);
 }
 
+void LocalAssemblerInterface::postNonLinearSolver(
+    std::size_t const mesh_item_id,
+    NumLib::LocalToGlobalIndexMap const& dof_table,
+    GlobalVector const& x, double const t, bool const use_monolithic_scheme)
+{
+    auto const indices = NumLib::getIndices(mesh_item_id, dof_table);
+    auto const local_x = x.get(indices);
+
+    postNonLinearSolverConcrete(local_x, t, use_monolithic_scheme);
+}
+
 }  // namespace ProcessLib
diff --git a/ProcessLib/LocalAssemblerInterface.h b/ProcessLib/LocalAssemblerInterface.h
index a99e41e26d1f2a74bdc3e92712de7b2bc7ce2c60..f750b6aa2a71b281b63bc99dabfb8f222f8a6ab8 100644
--- a/ProcessLib/LocalAssemblerInterface.h
+++ b/ProcessLib/LocalAssemblerInterface.h
@@ -40,7 +40,7 @@ public:
                           std::vector<double>& local_K_data,
                           std::vector<double>& local_b_data);
 
-    virtual void assembleWithCoupledTerm(
+    virtual void assembleForStaggeredScheme(
         double const t,
         std::vector<double>& local_M_data,
         std::vector<double>& local_K_data,
@@ -56,7 +56,7 @@ public:
                                       std::vector<double>& local_b_data,
                                       std::vector<double>& local_Jac_data);
 
-    virtual void assembleWithJacobianAndCoupling(
+    virtual void assembleWithJacobianForStaggeredScheme(
         double const t, std::vector<double> const& local_xdot,
         const double dxdot_dx, const double dx_dx,
         std::vector<double>& local_M_data, std::vector<double>& local_K_data,
@@ -78,6 +78,11 @@ public:
                               NumLib::LocalToGlobalIndexMap const& dof_table,
                               GlobalVector const& x);
 
+    void postNonLinearSolver(std::size_t const mesh_item_id,
+                             NumLib::LocalToGlobalIndexMap const& dof_table,
+                             GlobalVector const& x, double const t,
+                             bool const use_monolithic_scheme);
+
     /// Computes the flux in the point \c p_local_coords that is given in local
     /// coordinates using the values from \c local_x.
     virtual std::vector<double> getFlux(
@@ -94,6 +99,13 @@ private:
     }
 
     virtual void postTimestepConcrete(std::vector<double> const& /*local_x*/) {}
+
+    virtual void postNonLinearSolverConcrete(
+        std::vector<double> const& /*local_x*/, double const /*t*/,
+        bool const /*use_monolithic_scheme*/)
+    {
+    }
+
     virtual void computeSecondaryVariableConcrete(
         double const /*t*/, std::vector<double> const& /*local_x*/)
     {
diff --git a/ProcessLib/Output/Output.cpp b/ProcessLib/Output/Output.cpp
index f572d0d635e66235ff7acc5b1fbb75912932fb6c..0de4da7dfdd4c0700b66b1f52888bbce298c2747 100644
--- a/ProcessLib/Output/Output.cpp
+++ b/ProcessLib/Output/Output.cpp
@@ -130,8 +130,8 @@ void Output::doOutputAlways(Process const& process,
     time_output.start();
 
     // Need to add variables of process to vtu even no output takes place.
-    processOutputData(t, x, process.getMesh(), process.getDOFTable(),
-                      process.getProcessVariables(),
+    processOutputData(t, x, process.getMesh(), process.getDOFTable(process_id),
+                      process.getProcessVariables(process_id),
                       process.getSecondaryVariables(), process_output);
 
     // For the staggered scheme for the coupling, only the last process, which
@@ -207,8 +207,9 @@ void Output::doOutputNonlinearIteration(Process const& process,
     BaseLib::RunTime time_output;
     time_output.start();
 
-    processOutputData(t, x, process.getMesh(), process.getDOFTable(),
-                      process.getProcessVariables(),
+    processOutputData(t, x, process.getMesh(),
+                      process.getDOFTable(process_id),
+                      process.getProcessVariables(process_id),
                       process.getSecondaryVariables(), process_output);
 
     // For the staggered scheme for the coupling, only the last process, which
diff --git a/ProcessLib/PhaseField/PhaseFieldProcess-impl.h b/ProcessLib/PhaseField/PhaseFieldProcess-impl.h
index 207cb3a307ab3457ea5e13167f709c9f11f77259..eedd3abc08af808b1f93005b477cc099cb09354a 100644
--- a/ProcessLib/PhaseField/PhaseFieldProcess-impl.h
+++ b/ProcessLib/PhaseField/PhaseFieldProcess-impl.h
@@ -142,10 +142,12 @@ void PhaseFieldProcess<DisplacementDim>::assembleConcreteProcess(
 {
     DBUG("Assemble PhaseFieldProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_tables = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_tables, t, x, M, K, b, _coupled_solutions);
 }
 
 template <int DisplacementDim>
@@ -156,10 +158,12 @@ void PhaseFieldProcess<DisplacementDim>::assembleWithJacobianConcreteProcess(
 {
     // DBUG("AssembleJacobian PhaseFieldProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_tables = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_tables, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
@@ -180,7 +184,7 @@ void PhaseFieldProcess<DisplacementDim>::preTimestepConcreteProcess(
 
 template <int DisplacementDim>
 void PhaseFieldProcess<DisplacementDim>::postTimestepConcreteProcess(
-    GlobalVector const& x)
+    GlobalVector const& x, int const /*process_id*/)
 {
     DBUG("PostTimestep PhaseFieldProcess.");
 
diff --git a/ProcessLib/PhaseField/PhaseFieldProcess.h b/ProcessLib/PhaseField/PhaseFieldProcess.h
index 9e87e5e1e7da795e515642a6cded3d6a5734ca9d..efdc9d2b5954ae0b7a5b8be02edc726979a1c798 100644
--- a/ProcessLib/PhaseField/PhaseFieldProcess.h
+++ b/ProcessLib/PhaseField/PhaseFieldProcess.h
@@ -90,7 +90,8 @@ private:
         GlobalVector const& x, double const t, double const dt,
         const int process_id) override;
 
-    void postTimestepConcreteProcess(GlobalVector const& x) override;
+    void postTimestepConcreteProcess(GlobalVector const& x,
+                                     int const process_id) override;
 
 private:
     PhaseFieldProcessData<DisplacementDim> _process_data;
diff --git a/ProcessLib/Process.cpp b/ProcessLib/Process.cpp
index e062d076324488c65134dfa12505e6cb5c8aa2c4..8e2e0875f2812320d55c33d8a83ae8130be8435c 100644
--- a/ProcessLib/Process.cpp
+++ b/ProcessLib/Process.cpp
@@ -51,6 +51,39 @@ Process::Process(
 {
 }
 
+void Process::initializeProcessBoundaryConditionsAndSourceTerms(
+    const NumLib::LocalToGlobalIndexMap& dof_table, const int process_id)
+{
+    auto const& per_process_variables = _process_variables[process_id];
+    auto& per_process_BCs = _boundary_conditions[process_id];
+
+    per_process_BCs.addBCsForProcessVariables(per_process_variables, dof_table,
+                                              _integration_order);
+
+    std::vector<std::unique_ptr<NodalSourceTerm>> per_process_source_terms;
+    for (auto& pv : per_process_variables)
+    {
+        auto sts = pv.get().createSourceTerms(dof_table, 0, _integration_order);
+
+        std::move(sts.begin(), sts.end(),
+                  std::back_inserter(per_process_source_terms));
+    }
+    _source_terms.push_back(std::move(per_process_source_terms));
+}
+
+void Process::initializeBoundaryConditions()
+{
+    // The number of processes is identical to the size of _process_variables,
+    // the vector contains variables for different processes. See the
+    // documentation of _process_variables.
+    const std::size_t number_of_processes = _process_variables.size();
+    for (std::size_t pcs_id = 0; pcs_id < number_of_processes; pcs_id++)
+    {
+        initializeProcessBoundaryConditionsAndSourceTerms(
+            *_local_to_global_index_map, pcs_id);
+    }
+}
+
 void Process::initialize()
 {
     DBUG("Initialize process.");
@@ -70,32 +103,16 @@ void Process::initialize()
     finishNamedFunctionsInitialization();
 
     DBUG("Initialize boundary conditions.");
-    for (std::size_t pcs_id = 0; pcs_id < _process_variables.size(); pcs_id++)
-    {
-        auto const& per_process_variables = _process_variables[pcs_id];
-        auto& per_process_BCs = _boundary_conditions[pcs_id];
-
-        per_process_BCs.addBCsForProcessVariables(per_process_variables,
-                                                  *_local_to_global_index_map,
-                                                  _integration_order);
-
-        std::vector<std::unique_ptr<NodalSourceTerm>> per_process_source_terms;
-        for (auto& pv : per_process_variables)
-        {
-            auto sts = pv.get().createSourceTerms(*_local_to_global_index_map,
-                                                  0, _integration_order);
-
-            std::move(sts.begin(), sts.end(),
-                      std::back_inserter(per_process_source_terms));
-        }
-        _source_terms.push_back(std::move(per_process_source_terms));
-    }
+    initializeBoundaryConditions();
 }
 
-void Process::setInitialConditions(const unsigned pcs_id, double const t,
+void Process::setInitialConditions(const int process_id, double const t,
                                    GlobalVector& x)
 {
-    auto const& per_process_variables = _process_variables[pcs_id];
+    // getDOFTableOfProcess can be overloaded by the specific process.
+    auto const& dof_table_of_process = getDOFTable(process_id);
+
+    auto const& per_process_variables = _process_variables[process_id];
     for (std::size_t variable_id = 0;
          variable_id < per_process_variables.size();
          variable_id++)
@@ -104,19 +121,16 @@ void Process::setInitialConditions(const unsigned pcs_id, double const t,
 
         auto const& pv = per_process_variables[variable_id];
         DBUG("Set the initial condition of variable %s of process %d.",
-             pv.get().getName().data(), pcs_id);
+             pv.get().getName().data(), process_id);
 
         auto const& ic = pv.get().getInitialCondition();
 
         auto const num_comp = pv.get().getNumberOfComponents();
 
-        const int mesh_subset_id = _use_monolithic_scheme ? variable_id : 0;
-
         for (int component_id = 0; component_id < num_comp; ++component_id)
         {
             auto const& mesh_subsets =
-                _local_to_global_index_map->getMeshSubsets(mesh_subset_id,
-                                                           component_id);
+                dof_table_of_process.getMeshSubsets(variable_id, component_id);
             for (auto const& mesh_subset : mesh_subsets)
             {
                 auto const mesh_id = mesh_subset->getMeshID();
@@ -129,8 +143,8 @@ void Process::setInitialConditions(const unsigned pcs_id, double const t,
                     auto const& ic_value = ic(t, pos);
 
                     auto global_index =
-                        std::abs(_local_to_global_index_map->getGlobalIndex(
-                            l, mesh_subset_id, component_id));
+                        std::abs(dof_table_of_process.getGlobalIndex(
+                            l, variable_id, component_id));
 #ifdef USE_PETSC
                     // The global indices of the ghost entries of the global
                     // matrix or the global vectors need to be set as negative
@@ -150,7 +164,8 @@ void Process::setInitialConditions(const unsigned pcs_id, double const t,
     }
 }
 
-MathLib::MatrixSpecifications Process::getMatrixSpecifications() const
+MathLib::MatrixSpecifications Process::getMatrixSpecifications(
+    const int /*process_id*/) const
 {
     auto const& l = *_local_to_global_index_map;
     return {l.dofSizeWithoutGhosts(), l.dofSizeWithoutGhosts(),
@@ -252,33 +267,40 @@ void Process::constructDofTable()
             NumLib::ComponentOrder::BY_LOCATION);
 }
 
-void Process::initializeExtrapolator()
+std::tuple<NumLib::LocalToGlobalIndexMap*, bool>
+Process::getDOFTableForExtrapolatorData() const
 {
-    NumLib::LocalToGlobalIndexMap const* dof_table_single_component;
-    bool manage_storage;
-
-    if (_local_to_global_index_map->getNumberOfComponents() == 1 ||
-        !_use_monolithic_scheme)
+    if (_local_to_global_index_map->getNumberOfComponents() == 1)
     {
         // For single-variable-single-component processes reuse the existing DOF
         // table.
-        dof_table_single_component = _local_to_global_index_map.get();
-        manage_storage = false;
-    }
-    else
-    {
-        // Otherwise construct a new DOF table.
-        std::vector<MeshLib::MeshSubsets> all_mesh_subsets_single_component;
-        all_mesh_subsets_single_component.emplace_back(
-            _mesh_subset_all_nodes.get());
-
-        dof_table_single_component = new NumLib::LocalToGlobalIndexMap(
-            std::move(all_mesh_subsets_single_component),
-            // by location order is needed for output
-            NumLib::ComponentOrder::BY_LOCATION);
-        manage_storage = true;
+        const bool manage_storage = false;
+        return std::make_tuple(_local_to_global_index_map.get(),
+                               manage_storage);
     }
 
+    // Otherwise construct a new DOF table.
+    std::vector<MeshLib::MeshSubsets> all_mesh_subsets_single_component;
+    all_mesh_subsets_single_component.emplace_back(
+        _mesh_subset_all_nodes.get());
+
+    const bool manage_storage = true;
+
+    return std::make_tuple(new NumLib::LocalToGlobalIndexMap(
+                               std::move(all_mesh_subsets_single_component),
+                               // by location order is needed for output
+                               NumLib::ComponentOrder::BY_LOCATION),
+                           manage_storage);
+}
+
+void Process::initializeExtrapolator()
+{
+    NumLib::LocalToGlobalIndexMap* dof_table_single_component;
+    bool manage_storage;
+
+    std::tie(dof_table_single_component, manage_storage) =
+        getDOFTableForExtrapolatorData();
+
     std::unique_ptr<NumLib::Extrapolator> extrapolator(
         new NumLib::LocalLinearLeastSquaresExtrapolator(
             *dof_table_single_component));
@@ -326,10 +348,17 @@ void Process::preTimestep(GlobalVector const& x, const double t,
     preTimestepConcreteProcess(x, t, delta_t, process_id);
 }
 
-void Process::postTimestep(GlobalVector const& x)
+void Process::postTimestep(GlobalVector const& x, int const process_id)
 {
     MathLib::LinAlg::setLocalAccessibleVector(x);
-    postTimestepConcreteProcess(x);
+    postTimestepConcreteProcess(x, process_id);
+}
+
+void Process::postNonLinearSolver(GlobalVector const& x, const double t,
+                                  int const process_id)
+{
+    MathLib::LinAlg::setLocalAccessibleVector(x);
+    postNonLinearSolverConcreteProcess(x, t, process_id);
 }
 
 void Process::computeSecondaryVariable(const double t, GlobalVector const& x)
@@ -357,27 +386,4 @@ NumLib::IterationResult Process::postIteration(const GlobalVector& x)
     return postIterationConcreteProcess(x);
 }
 
-void Process::setCoupledSolutionsOfPreviousTimeStep()
-{
-    const auto number_of_coupled_solutions =
-        _coupled_solutions->coupled_xs.size();
-    _coupled_solutions->coupled_xs_t0.reserve(number_of_coupled_solutions);
-    for (std::size_t i = 0; i < number_of_coupled_solutions; i++)
-    {
-        const auto x_t0 = getPreviousTimeStepSolution(i);
-        if (x_t0 == nullptr)
-        {
-            OGS_FATAL(
-                "Memory is not allocated for the global vector "
-                "of the solution of the previous time step for the ."
-                "staggered scheme.\n It can be done by overloading "
-                "Process::preTimestepConcreteProcess"
-                "(ref. HTProcess::preTimestepConcreteProcess) ");
-        }
-
-        MathLib::LinAlg::setLocalAccessibleVector(*x_t0);
-        _coupled_solutions->coupled_xs_t0.emplace_back(x_t0);
-    }
-}
-
 }  // namespace ProcessLib
diff --git a/ProcessLib/Process.h b/ProcessLib/Process.h
index 44bbe15c6f2a63fec3edb12903630c36d9ea8db4..1ffb253dc710e58595214f79f77b4c1e9ca04904 100644
--- a/ProcessLib/Process.h
+++ b/ProcessLib/Process.h
@@ -9,6 +9,8 @@
 
 #pragma once
 
+#include <tuple>
+
 #include "NumLib/NamedFunctionCaller.h"
 #include "NumLib/ODESolver/NonlinearSolver.h"
 #include "NumLib/ODESolver/ODESystem.h"
@@ -57,7 +59,12 @@ public:
                      const double delta_t, const int process_id);
 
     /// Postprocessing after a complete timestep.
-    void postTimestep(GlobalVector const& x);
+    void postTimestep(GlobalVector const& x, int const process_id);
+
+    /// Calculates secondary variables, e.g. stress and strain for deformation
+    /// analysis, only after nonlinear solver being successfully conducted.
+    void postNonLinearSolver(GlobalVector const& x, const double t,
+                             int const process_id);
 
     void preIteration(const unsigned iter, GlobalVector const& x) final;
 
@@ -68,16 +75,16 @@ public:
 
     void initialize();
 
-    void setInitialConditions(const unsigned pcs_id, const double t,
+    void setInitialConditions(const int process_id, const double t,
                               GlobalVector& x);
 
-    MathLib::MatrixSpecifications getMatrixSpecifications() const final;
+    virtual MathLib::MatrixSpecifications getMatrixSpecifications(
+        const int process_id) const override;
 
     void setCoupledSolutionsForStaggeredScheme(
         CoupledSolutionsForStaggeredScheme* const coupled_solutions)
     {
         _coupled_solutions = coupled_solutions;
-
     }
 
     bool isMonolithicSchemeUsed() const { return _use_monolithic_scheme; }
@@ -100,18 +107,17 @@ public:
         return _boundary_conditions[pcs_id].getKnownSolutions(t);
     }
 
-    NumLib::LocalToGlobalIndexMap const& getDOFTable() const
+    virtual NumLib::LocalToGlobalIndexMap const& getDOFTable(
+        const int /*process_id*/) const
     {
         return *_local_to_global_index_map;
     }
 
     MeshLib::Mesh& getMesh() const { return _mesh; }
     std::vector<std::reference_wrapper<ProcessVariable>> const&
-    getProcessVariables() const
+    getProcessVariables(const int process_id) const
     {
-        const auto pcs_id =
-            (_coupled_solutions) ? _coupled_solutions->process_id : 0;
-        return _process_variables[pcs_id];
+        return _process_variables[process_id];
     }
 
     SecondaryVariableCollection const& getSecondaryVariables() const
@@ -119,14 +125,6 @@ public:
         return _secondary_variables;
     }
 
-    // Get the solution of the previous time step.
-
-    virtual GlobalVector* getPreviousTimeStepSolution(
-        const int /*process_id*/) const
-    {
-        return nullptr;
-    }
-
     // Used as a call back for CalculateSurfaceFlux process.
 
     virtual std::vector<double> getFlux(std::size_t /*element_id*/,
@@ -147,6 +145,14 @@ protected:
         return _extrapolator_data.getDOFTable();
     }
 
+    /**
+     * Initialize the boundary conditions for a single process or coupled
+     * processes modelled by the monolithic scheme. It is called by
+     * initializeBoundaryConditions().
+     */
+    void initializeProcessBoundaryConditionsAndSourceTerms(
+        const NumLib::LocalToGlobalIndexMap& dof_table, const int process_id);
+
 private:
     /// Process specific initialization called by initialize().
     virtual void initializeConcreteProcess(
@@ -154,6 +160,10 @@ private:
         MeshLib::Mesh const& mesh,
         unsigned const integration_order) = 0;
 
+    /// Member function to initialize the boundary conditions for all coupled
+    /// processes. It is called by initialize().
+    virtual void initializeBoundaryConditions();
+
     virtual void preAssembleConcreteProcess(const double /*t*/,
                                             GlobalVector const& /*x*/)
     {
@@ -175,7 +185,17 @@ private:
     {
     }
 
-    virtual void postTimestepConcreteProcess(GlobalVector const& /*x*/) {}
+    virtual void postTimestepConcreteProcess(GlobalVector const& /*x*/,
+                                             int const /*process_id*/)
+    {
+    }
+
+    virtual void postNonLinearSolverConcreteProcess(GlobalVector const& /*x*/,
+                                                    const double /*t*/,
+                                                    int const /*process_id*/)
+    {
+    }
+
     virtual void preIterationConcreteProcess(const unsigned /*iter*/,
                                              GlobalVector const& /*x*/)
     {
@@ -195,6 +215,17 @@ private:
 protected:
     virtual void constructDofTable();
 
+    /**
+     * Get the address of a LocalToGlobalIndexMap, and the status of its memory.
+     * If the LocalToGlobalIndexMap is created as new in this function, the
+     * function also returns a true boolean value to let Extrapolator manage
+     * the memory by the address returned by this function.
+     *
+     * @return Address of a LocalToGlobalIndexMap and its memory status.
+     */
+    virtual std::tuple<NumLib::LocalToGlobalIndexMap*, bool>
+    getDOFTableForExtrapolatorData() const;
+
 private:
     void initializeExtrapolator();
 
@@ -227,18 +258,14 @@ protected:
     /// references to the solutions of the coupled processes.
     CoupledSolutionsForStaggeredScheme* _coupled_solutions;
 
-    /// Set the solutions of the previous time step to the coupled term.
-    /// It only performs for the staggered scheme.
-    void setCoupledSolutionsOfPreviousTimeStep();
-
     /// Order of the integration method for element-wise integration.
     /// The Gauss-Legendre integration method and available orders is
     /// implemented in MathLib::GaussLegendre.
     unsigned const _integration_order;
 
-private:
     GlobalSparsityPattern _sparsity_pattern;
 
+private:
     /// Variables used by this process.  For the monolithic scheme or a
     /// single process, the size of the outer vector is one. For the
     /// staggered scheme, the size of the outer vector is the number of the
diff --git a/ProcessLib/RichardsComponentTransport/RichardsComponentTransportProcess.cpp b/ProcessLib/RichardsComponentTransport/RichardsComponentTransportProcess.cpp
index 37f7004dce72e81483758b5c7609d279c3c33ec5..9eaaa8a0d3ff17ad1e35072eb990ef31aa8fbcf3 100644
--- a/ProcessLib/RichardsComponentTransport/RichardsComponentTransportProcess.cpp
+++ b/ProcessLib/RichardsComponentTransport/RichardsComponentTransportProcess.cpp
@@ -41,7 +41,9 @@ void RichardsComponentTransportProcess::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    const int monolithic_process_id = 0;
+    ProcessLib::ProcessVariable const& pv =
+        getProcessVariables(monolithic_process_id)[0];
     ProcessLib::createLocalAssemblers<LocalAssemblerData>(
         mesh.getDimension(), mesh.getElements(), dof_table,
         pv.getShapeFunctionOrder(), _local_assemblers,
@@ -70,10 +72,12 @@ void RichardsComponentTransportProcess::assembleConcreteProcess(
 {
     DBUG("Assemble RichardsComponentTransportProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 void RichardsComponentTransportProcess::assembleWithJacobianConcreteProcess(
@@ -83,10 +87,12 @@ void RichardsComponentTransportProcess::assembleWithJacobianConcreteProcess(
 {
     DBUG("AssembleWithJacobian RichardsComponentTransportProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
diff --git a/ProcessLib/RichardsFlow/RichardsFlowProcess.cpp b/ProcessLib/RichardsFlow/RichardsFlowProcess.cpp
index 77c65a109e5858d75859fdc946e4a9ca6915c3f4..a3a49cf9df33715ddcaa7564d6783f536b917a2b 100644
--- a/ProcessLib/RichardsFlow/RichardsFlowProcess.cpp
+++ b/ProcessLib/RichardsFlow/RichardsFlowProcess.cpp
@@ -43,7 +43,8 @@ void RichardsFlowProcess::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    const int process_id = 0;
+    ProcessLib::ProcessVariable const& pv = getProcessVariables(process_id)[0];
     ProcessLib::createLocalAssemblers<LocalAssemblerData>(
         mesh.getDimension(), mesh.getElements(), dof_table,
         pv.getShapeFunctionOrder(), _local_assemblers,
@@ -71,10 +72,12 @@ void RichardsFlowProcess::assembleConcreteProcess(
 {
     DBUG("Assemble RichardsFlowProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 void RichardsFlowProcess::assembleWithJacobianConcreteProcess(
@@ -84,10 +87,12 @@ void RichardsFlowProcess::assembleWithJacobianConcreteProcess(
 {
     DBUG("AssembleWithJacobian RichardsFlowProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
diff --git a/ProcessLib/SmallDeformation/SmallDeformationProcess-impl.h b/ProcessLib/SmallDeformation/SmallDeformationProcess-impl.h
index a2188a9b6b46f24e55b29b339857fe9fff51920c..cb19a67400a17ce7743f7219621fd436e83e6cdb 100644
--- a/ProcessLib/SmallDeformation/SmallDeformationProcess-impl.h
+++ b/ProcessLib/SmallDeformation/SmallDeformationProcess-impl.h
@@ -148,10 +148,12 @@ void SmallDeformationProcess<DisplacementDim>::assembleConcreteProcess(
 {
     DBUG("Assemble SmallDeformationProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 template <int DisplacementDim>
@@ -163,10 +165,12 @@ void SmallDeformationProcess<DisplacementDim>::
 {
     DBUG("AssembleWithJacobian SmallDeformationProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 
     b.copyValues(*_nodal_forces);
@@ -191,7 +195,7 @@ void SmallDeformationProcess<DisplacementDim>::preTimestepConcreteProcess(
 
 template <int DisplacementDim>
 void SmallDeformationProcess<DisplacementDim>::postTimestepConcreteProcess(
-    GlobalVector const& x)
+    GlobalVector const& x, int const /*process_id*/)
 {
     DBUG("PostTimestep SmallDeformationProcess.");
 
diff --git a/ProcessLib/SmallDeformation/SmallDeformationProcess.h b/ProcessLib/SmallDeformation/SmallDeformationProcess.h
index 85247bf0c12a2d8c97102563c1eb4b84c079f5bb..4375858b7e754d40301f7879477a8c8f01eab920 100644
--- a/ProcessLib/SmallDeformation/SmallDeformationProcess.h
+++ b/ProcessLib/SmallDeformation/SmallDeformationProcess.h
@@ -62,7 +62,8 @@ private:
         GlobalVector const& x, double const t, double const dt,
         const int process_id) override;
 
-    void postTimestepConcreteProcess(GlobalVector const& x) override;
+    void postTimestepConcreteProcess(GlobalVector const& x,
+                                     int const process_id) override;
 
 private:
     SmallDeformationProcessData<DisplacementDim> _process_data;
diff --git a/ProcessLib/TES/TESProcess.cpp b/ProcessLib/TES/TESProcess.cpp
index 659e6e576180e44cecfb289a180d4cb490635512..a162c814b96e1e3b9efc6dd88f54fe185fc60ecc 100644
--- a/ProcessLib/TES/TESProcess.cpp
+++ b/ProcessLib/TES/TESProcess.cpp
@@ -141,7 +141,9 @@ void TESProcess::initializeConcreteProcess(
     NumLib::LocalToGlobalIndexMap const& dof_table,
     MeshLib::Mesh const& mesh, unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    const int monolithic_process_id = 0;
+    ProcessLib::ProcessVariable const& pv =
+        getProcessVariables(monolithic_process_id)[0];
     ProcessLib::createLocalAssemblers<TESLocalAssembler>(
         mesh.getDimension(), mesh.getElements(), dof_table,
         pv.getShapeFunctionOrder(), _local_assemblers,
@@ -230,10 +232,12 @@ void TESProcess::assembleConcreteProcess(const double t,
 {
     DBUG("Assemble TESProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 void TESProcess::assembleWithJacobianConcreteProcess(
@@ -241,9 +245,12 @@ void TESProcess::assembleWithJacobianConcreteProcess(
     const double dxdot_dx, const double dx_dx, GlobalMatrix& M, GlobalMatrix& K,
     GlobalVector& b, GlobalMatrix& Jac)
 {
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
+    // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
diff --git a/ProcessLib/ThermalTwoPhaseFlowWithPP/ThermalTwoPhaseFlowWithPPProcess.cpp b/ProcessLib/ThermalTwoPhaseFlowWithPP/ThermalTwoPhaseFlowWithPPProcess.cpp
index 1ad3d8fc78a3fb8603fba17b712b50bafb73fbca..e3e43f9faa052bcddf2c790bc6ff30200455670d 100644
--- a/ProcessLib/ThermalTwoPhaseFlowWithPP/ThermalTwoPhaseFlowWithPPProcess.cpp
+++ b/ProcessLib/ThermalTwoPhaseFlowWithPP/ThermalTwoPhaseFlowWithPPProcess.cpp
@@ -51,7 +51,9 @@ void ThermalTwoPhaseFlowWithPPProcess::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    const int monolithic_process_id = 0;
+    ProcessLib::ProcessVariable const& pv =
+        getProcessVariables(monolithic_process_id)[0];
     ProcessLib::createLocalAssemblers<ThermalTwoPhaseFlowWithPPLocalAssembler>(
         mesh.getDimension(), mesh.getElements(), dof_table,
         pv.getShapeFunctionOrder(), _local_assemblers,
@@ -78,10 +80,13 @@ void ThermalTwoPhaseFlowWithPPProcess::assembleConcreteProcess(
     GlobalVector& b)
 {
     DBUG("Assemble ThermalTwoPhaseFlowWithPPProcess.");
+
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 void ThermalTwoPhaseFlowWithPPProcess::assembleWithJacobianConcreteProcess(
@@ -91,10 +96,12 @@ void ThermalTwoPhaseFlowWithPPProcess::assembleWithJacobianConcreteProcess(
 {
     DBUG("AssembleWithJacobian ThermalTwoPhaseFlowWithPPProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 void ThermalTwoPhaseFlowWithPPProcess::preTimestepConcreteProcess(
diff --git a/ProcessLib/ThermoMechanics/ThermoMechanicsProcess-impl.h b/ProcessLib/ThermoMechanics/ThermoMechanicsProcess-impl.h
index 478507bbcbb03ffc2df978c05ebac39c84f1772c..120310b627c0c6f3ec897ddf128cf88753690118 100644
--- a/ProcessLib/ThermoMechanics/ThermoMechanicsProcess-impl.h
+++ b/ProcessLib/ThermoMechanics/ThermoMechanicsProcess-impl.h
@@ -154,10 +154,12 @@ void ThermoMechanicsProcess<DisplacementDim>::assembleConcreteProcess(
 {
     DBUG("Assemble ThermoMechanicsProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 template <int DisplacementDim>
@@ -171,10 +173,12 @@ void ThermoMechanicsProcess<DisplacementDim>::
 {
     DBUG("AssembleJacobian ThermoMechanicsProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
@@ -195,7 +199,7 @@ void ThermoMechanicsProcess<DisplacementDim>::preTimestepConcreteProcess(
 
 template <int DisplacementDim>
 void ThermoMechanicsProcess<DisplacementDim>::postTimestepConcreteProcess(
-    GlobalVector const& x)
+    GlobalVector const& x, int const /*process_id*/)
 {
     DBUG("PostTimestep ThermoMechanicsProcess.");
 
diff --git a/ProcessLib/ThermoMechanics/ThermoMechanicsProcess.h b/ProcessLib/ThermoMechanics/ThermoMechanicsProcess.h
index cac7104ef5b439f753522bbb5eb607991c872068..2fb243e92f729133bc3de684b657031c638cf9fd 100644
--- a/ProcessLib/ThermoMechanics/ThermoMechanicsProcess.h
+++ b/ProcessLib/ThermoMechanics/ThermoMechanicsProcess.h
@@ -63,7 +63,8 @@ private:
         GlobalVector const& x, double const t, double const dt,
         const int process_id) override;
 
-    void postTimestepConcreteProcess(GlobalVector const& x) override;
+    void postTimestepConcreteProcess(GlobalVector const& x,
+                                     int const process_id) override;
 
 private:
     std::vector<MeshLib::Node*> _base_nodes;
diff --git a/ProcessLib/TwoPhaseFlowWithPP/TwoPhaseFlowWithPPProcess.cpp b/ProcessLib/TwoPhaseFlowWithPP/TwoPhaseFlowWithPPProcess.cpp
index 656bd6dfad26a1c60a9a08426c06629b1fbf264e..4a6b65f77e02ced0322d2e6161447d0602c2b720 100644
--- a/ProcessLib/TwoPhaseFlowWithPP/TwoPhaseFlowWithPPProcess.cpp
+++ b/ProcessLib/TwoPhaseFlowWithPP/TwoPhaseFlowWithPPProcess.cpp
@@ -50,7 +50,8 @@ void TwoPhaseFlowWithPPProcess::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    const int process_id = 0;
+    ProcessLib::ProcessVariable const& pv = getProcessVariables(process_id)[0];
     ProcessLib::createLocalAssemblers<TwoPhaseFlowWithPPLocalAssembler>(
         mesh.getDimension(), mesh.getElements(), dof_table,
         pv.getShapeFunctionOrder(), _local_assemblers,
@@ -76,10 +77,13 @@ void TwoPhaseFlowWithPPProcess::assembleConcreteProcess(const double t,
                                                         GlobalVector& b)
 {
     DBUG("Assemble TwoPhaseFlowWithPPProcess.");
+
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 void TwoPhaseFlowWithPPProcess::assembleWithJacobianConcreteProcess(
@@ -89,10 +93,12 @@ void TwoPhaseFlowWithPPProcess::assembleWithJacobianConcreteProcess(
 {
     DBUG("AssembleWithJacobian TwoPhaseFlowWithPPProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 
diff --git a/ProcessLib/TwoPhaseFlowWithPrho/TwoPhaseFlowWithPrhoProcess.cpp b/ProcessLib/TwoPhaseFlowWithPrho/TwoPhaseFlowWithPrhoProcess.cpp
index ca4ab84b72f3ed669e83f90517e120c284e49ee7..1b878fe50e3ad763b2f745feacf24f423300f57d 100644
--- a/ProcessLib/TwoPhaseFlowWithPrho/TwoPhaseFlowWithPrhoProcess.cpp
+++ b/ProcessLib/TwoPhaseFlowWithPrho/TwoPhaseFlowWithPrhoProcess.cpp
@@ -50,7 +50,8 @@ void TwoPhaseFlowWithPrhoProcess::initializeConcreteProcess(
     MeshLib::Mesh const& mesh,
     unsigned const integration_order)
 {
-    ProcessLib::ProcessVariable const& pv = getProcessVariables()[0];
+    const int process_id = 0;
+    ProcessLib::ProcessVariable const& pv = getProcessVariables(process_id)[0];
     ProcessLib::createLocalAssemblers<TwoPhaseFlowWithPrhoLocalAssembler>(
         mesh.getDimension(), mesh.getElements(), dof_table,
         pv.getShapeFunctionOrder(), _local_assemblers,
@@ -76,10 +77,13 @@ void TwoPhaseFlowWithPrhoProcess::assembleConcreteProcess(const double t,
                                                           GlobalVector& b)
 {
     DBUG("Assemble TwoPhaseFlowWithPrhoProcess.");
+
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assemble, _local_assemblers,
-        *_local_to_global_index_map, t, x, M, K, b, _coupled_solutions);
+        dof_table, t, x, M, K, b, _coupled_solutions);
 }
 
 void TwoPhaseFlowWithPrhoProcess::assembleWithJacobianConcreteProcess(
@@ -89,10 +93,12 @@ void TwoPhaseFlowWithPrhoProcess::assembleWithJacobianConcreteProcess(
 {
     DBUG("AssembleWithJacobian TwoPhaseFlowWithPrhoProcess.");
 
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>>
+       dof_table = {std::ref(*_local_to_global_index_map)};
     // Call global assembler for each local assembly item.
     GlobalExecutor::executeMemberDereferenced(
         _global_assembler, &VectorMatrixAssembler::assembleWithJacobian,
-        _local_assemblers, *_local_to_global_index_map, t, x, xdot, dxdot_dx,
+        _local_assemblers, dof_table, t, x, xdot, dxdot_dx,
         dx_dx, M, K, b, Jac, _coupled_solutions);
 }
 void TwoPhaseFlowWithPrhoProcess::preTimestepConcreteProcess(
diff --git a/ProcessLib/UncoupledProcessesTimeLoop.cpp b/ProcessLib/UncoupledProcessesTimeLoop.cpp
index 6532c3d748082ebff941dae336faceee7b111e30..334ef56f3e6a17393671efbb6f1b1b09630803d6 100644
--- a/ProcessLib/UncoupledProcessesTimeLoop.cpp
+++ b/ProcessLib/UncoupledProcessesTimeLoop.cpp
@@ -126,6 +126,10 @@ struct SingleProcessData
     //! cast of \c tdisc_ode_sys to NumLib::InternalMatrixStorage
     NumLib::InternalMatrixStorage* mat_strg = nullptr;
 
+    /// Process ID. It is alway 0 when the monolithic scheme is used or
+    /// a single process is modelled.
+    int process_id = 0;
+
     Process& process;
     ProcessOutput process_output;
 };
@@ -183,7 +187,7 @@ void setTimeDiscretizedODESystem(
 
         spd.tdisc_ode_sys = std::make_unique<
             NumLib::TimeDiscretizedODESystem<ODETag, Tag::Picard>>(
-            ode_sys, *spd.time_disc);
+            spd.process_id, ode_sys, *spd.time_disc);
     }
     else if (dynamic_cast<NonlinearSolverNewton*>(&spd.nonlinear_solver))
     {
@@ -194,7 +198,7 @@ void setTimeDiscretizedODESystem(
         {
             spd.tdisc_ode_sys = std::make_unique<
                 NumLib::TimeDiscretizedODESystem<ODETag, Tag::Newton>>(
-                *ode_newton, *spd.time_disc);
+                spd.process_id, *ode_newton, *spd.time_disc);
         }
         else
         {
@@ -323,16 +327,29 @@ std::unique_ptr<UncoupledProcessesTimeLoop> createUncoupledProcessesTimeLoop(
         //! \ogs_file_param{prj__time_loop__global_process_coupling}
         = config.getConfigSubtreeOptional("global_process_coupling");
 
-    std::unique_ptr<NumLib::ConvergenceCriterion> coupling_conv_crit = nullptr;
+    std::vector<std::unique_ptr<NumLib::ConvergenceCriterion>>
+        global_coupling_conv_criteria;
     unsigned max_coupling_iterations = 1;
     if (coupling_config)
     {
         max_coupling_iterations
             //! \ogs_file_param{prj__time_loop__global_process_coupling__max_iter}
             = coupling_config->getConfigParameter<unsigned>("max_iter");
-        coupling_conv_crit = NumLib::createConvergenceCriterion(
-            //! \ogs_file_param{prj__time_loop__global_process_coupling__convergence_criterion}
-            coupling_config->getConfigSubtree("convergence_criterion"));
+
+        auto const& coupling_convergence_criteria_config =
+            //! \ogs_file_param{prj__time_loop__global_process_coupling__convergence_criteria}
+            coupling_config->getConfigSubtree("convergence_criteria");
+
+        for (
+            auto coupling_convergence_criterion_config :
+            //! \ogs_file_param{prj__time_loop__global_process_coupling__convergence_criteria__convergence_criterion}
+            coupling_convergence_criteria_config.getConfigSubtreeList(
+                "convergence_criterion"))
+        {
+            global_coupling_conv_criteria.push_back(
+                NumLib::createConvergenceCriterion(
+                    coupling_convergence_criterion_config));
+        }
     }
 
     auto output =
@@ -343,6 +360,18 @@ std::unique_ptr<UncoupledProcessesTimeLoop> createUncoupledProcessesTimeLoop(
         //! \ogs_file_param{prj__time_loop__processes}
         config.getConfigSubtree("processes"), processes, nonlinear_solvers);
 
+    if (coupling_config)
+    {
+        if (global_coupling_conv_criteria.size() != per_process_data.size())
+        {
+            OGS_FATAL(
+                "The number of convergence criteria of the global staggered "
+                "coupling loop is not identical to the number of the "
+                "processes! Please check the element by tag "
+                "global_process_coupling in the project file.");
+        }
+    }
+
     const auto minmax_iter = std::minmax_element(
         per_process_data.begin(),
         per_process_data.end(),
@@ -359,7 +388,7 @@ std::unique_ptr<UncoupledProcessesTimeLoop> createUncoupledProcessesTimeLoop(
 
     return std::make_unique<UncoupledProcessesTimeLoop>(
         std::move(output), std::move(per_process_data), max_coupling_iterations,
-        std::move(coupling_conv_crit), start_time, end_time);
+        std::move(global_coupling_conv_criteria), start_time, end_time);
 }
 
 std::vector<GlobalVector*> setInitialConditions(
@@ -368,7 +397,7 @@ std::vector<GlobalVector*> setInitialConditions(
 {
     std::vector<GlobalVector*> process_solutions;
 
-    unsigned pcs_idx = 0;
+    int process_id = 0;
     for (auto& spd : per_process_data)
     {
         auto& pcs = spd->process;
@@ -380,10 +409,10 @@ std::vector<GlobalVector*> setInitialConditions(
         // append a solution vector of suitable size
         process_solutions.emplace_back(
             &NumLib::GlobalVectorProvider::provider.getVector(
-                ode_sys.getMatrixSpecifications()));
+                ode_sys.getMatrixSpecifications(process_id)));
 
-        auto& x0 = *process_solutions[pcs_idx];
-        pcs.setInitialConditions(pcs_idx, t0, x0);
+        auto& x0 = *process_solutions[process_id];
+        pcs.setInitialConditions(process_id, t0, x0);
         MathLib::LinAlg::finalizeAssembly(x0);
 
         time_disc.setInitialState(t0, x0);  // push IC
@@ -401,15 +430,15 @@ std::vector<GlobalVector*> setInitialConditions(
                 mat_strg);  // TODO: that might do duplicate work
         }
 
-        ++pcs_idx;
+        ++process_id;
     }
 
     return process_solutions;
 }
 
-bool solveOneTimeStepOneProcess(const unsigned process_index,
-                                GlobalVector& x, std::size_t const timestep,
-                                double const t, double const delta_t,
+bool solveOneTimeStepOneProcess(int const process_id, GlobalVector& x,
+                                std::size_t const timestep, double const t,
+                                double const delta_t,
                                 SingleProcessData const& process_data,
                                 Output& output_control)
 {
@@ -433,14 +462,19 @@ bool solveOneTimeStepOneProcess(const unsigned process_index,
 
     auto const post_iteration_callback = [&](unsigned iteration,
                                              GlobalVector const& x) {
-        output_control.doOutputNonlinearIteration(
-            process, process_index, process_data.process_output, timestep, t,
-            x, iteration);
+        output_control.doOutputNonlinearIteration(process, process_id,
+                                                  process_data.process_output,
+                                                  timestep, t, x, iteration);
     };
 
     bool nonlinear_solver_succeeded =
         nonlinear_solver.solve(x, post_iteration_callback);
 
+    if (nonlinear_solver_succeeded)
+    {
+        process.postNonLinearSolver(x, t, process_id);
+    }
+
     return nonlinear_solver_succeeded;
 }
 
@@ -448,7 +482,8 @@ UncoupledProcessesTimeLoop::UncoupledProcessesTimeLoop(
     std::unique_ptr<Output>&& output,
     std::vector<std::unique_ptr<SingleProcessData>>&& per_process_data,
     const unsigned global_coupling_max_iterations,
-    std::unique_ptr<NumLib::ConvergenceCriterion>&& global_coupling_conv_crit,
+    std::vector<std::unique_ptr<NumLib::ConvergenceCriterion>>&&
+        global_coupling_conv_crit,
     const double start_time, const double end_time)
     : _output(std::move(output)),
       _per_process_data(std::move(per_process_data)),
@@ -470,9 +505,10 @@ bool UncoupledProcessesTimeLoop::setCoupledSolutions()
     }
 
     _solutions_of_coupled_processes.reserve(_per_process_data.size());
-    for (unsigned pcs_idx = 0; pcs_idx < _per_process_data.size(); pcs_idx++)
+    for (unsigned process_id = 0; process_id < _per_process_data.size();
+         process_id++)
     {
-        auto const& x = *_process_solutions[pcs_idx];
+        auto const& x = *_process_solutions[process_id];
         _solutions_of_coupled_processes.emplace_back(x);
 
         // Create a vector to store the solution of the last coupling iteration
@@ -650,22 +686,24 @@ bool UncoupledProcessesTimeLoop::loop()
 {
     // initialize output, convergence criterion, etc.
     {
-        unsigned pcs_idx = 0;
+        unsigned process_id = 0;
         for (auto& spd : _per_process_data)
         {
             auto& pcs = spd->process;
-            _output->addProcess(pcs, pcs_idx);
+            _output->addProcess(pcs, process_id);
 
+            spd->process_id = process_id;
             setTimeDiscretizedODESystem(*spd);
 
             if (auto* conv_crit =
                     dynamic_cast<NumLib::ConvergenceCriterionPerComponent*>(
                         spd->conv_crit.get()))
             {
-                conv_crit->setDOFTable(pcs.getDOFTable(), pcs.getMesh());
+                conv_crit->setDOFTable(pcs.getDOFTable(process_id),
+                                       pcs.getMesh());
             }
 
-            ++pcs_idx;
+            ++process_id;
         }
     }
 
@@ -780,41 +818,41 @@ bool UncoupledProcessesTimeLoop::solveUncoupledEquationSystems(
     const double t, const double dt, const std::size_t timestep_id)
 {
     // TODO(wenqing): use process name
-    unsigned pcs_idx = 0;
+    unsigned process_id = 0;
     for (auto& spd : _per_process_data)
     {
         if (spd->skip_time_stepping)
         {
-            INFO("Process %u is skipped in the time stepping.", pcs_idx);
-            ++pcs_idx;
+            INFO("Process %u is skipped in the time stepping.", process_id);
+            ++process_id;
             continue;
         }
 
         BaseLib::RunTime time_timestep_process;
         time_timestep_process.start();
 
-        auto& x = *_process_solutions[pcs_idx];
+        auto& x = *_process_solutions[process_id];
         auto& pcs = spd->process;
-        pcs.preTimestep(x, t, dt, pcs_idx);
+        pcs.preTimestep(x, t, dt, process_id);
 
-        const auto nonlinear_solver_succeeded =
-            solveOneTimeStepOneProcess(pcs_idx, x, timestep_id, t, dt, *spd, *_output);
+        const auto nonlinear_solver_succeeded = solveOneTimeStepOneProcess(
+            process_id, x, timestep_id, t, dt, *spd, *_output);
         spd->nonlinear_solver_converged = nonlinear_solver_succeeded;
-        pcs.postTimestep(x);
+        pcs.postTimestep(x, process_id);
         pcs.computeSecondaryVariable(t, x);
 
-        INFO("[time] Solving process #%u took %g s in time step #%u ", pcs_idx,
-             time_timestep_process.elapsed(), timestep_id);
+        INFO("[time] Solving process #%u took %g s in time step #%u ",
+             process_id, time_timestep_process.elapsed(), timestep_id);
 
         if (!nonlinear_solver_succeeded)
         {
             ERR("The nonlinear solver failed in time step #%u at t = %g "
                 "s for process #%u.",
-                timestep_id, t, pcs_idx);
+                timestep_id, t, process_id);
 
             // save unsuccessful solution
-            _output->doOutputAlways(pcs, pcs_idx, spd->process_output, timestep_id, t,
-                                    x);
+            _output->doOutputAlways(pcs, process_id, spd->process_output,
+                                    timestep_id, t, x);
 
             if (!spd->timestepper->isSolutionErrorComputationNeeded())
             {
@@ -824,9 +862,10 @@ bool UncoupledProcessesTimeLoop::solveUncoupledEquationSystems(
             return false;
         }
 
-        _output->doOutput(pcs, pcs_idx, spd->process_output, timestep_id, t, x);
+        _output->doOutput(pcs, process_id, spd->process_output, timestep_id, t,
+                          x);
 
-        ++pcs_idx;
+        ++process_id;
     }  // end of for (auto& spd : _per_process_data)
 
     return true;
@@ -839,53 +878,63 @@ bool UncoupledProcessesTimeLoop::solveCoupledEquationSystemsByStaggeredScheme(
     if (_global_coupling_max_iterations != 0)
     {
         // Set the flag of the first iteration be true.
-        _global_coupling_conv_crit->preFirstIteration();
+        for (auto& conv_crit : _global_coupling_conv_crit)
+        {
+            conv_crit->preFirstIteration();
+        }
     }
+    auto resetCouplingConvergenceCtiteria = [&]() {
+        for (auto& conv_crit : _global_coupling_conv_crit)
+        {
+            conv_crit->reset();
+        }
+    };
+
     bool coupling_iteration_converged = true;
     for (unsigned global_coupling_iteration = 0;
          global_coupling_iteration < _global_coupling_max_iterations;
-         global_coupling_iteration++, _global_coupling_conv_crit->reset())
+         global_coupling_iteration++, resetCouplingConvergenceCtiteria())
     {
         // TODO(wenqing): use process name
         bool nonlinear_solver_succeeded = true;
         coupling_iteration_converged = true;
-        unsigned pcs_idx = 0;
+        unsigned process_id = 0;
         for (auto& spd : _per_process_data)
         {
             if (spd->skip_time_stepping)
             {
-                INFO("Process %u is skipped in the time stepping.", pcs_idx);
-                ++pcs_idx;
+                INFO("Process %u is skipped in the time stepping.", process_id);
+                ++process_id;
                 continue;
             }
 
             BaseLib::RunTime time_timestep_process;
             time_timestep_process.start();
 
-            auto& x = *_process_solutions[pcs_idx];
+            auto& x = *_process_solutions[process_id];
             if (global_coupling_iteration == 0)
             {
                 // Copy the solution of the previous time step to a vector that
                 // belongs to process. For some problems, both of the current
                 // solution and the solution of the previous time step are
                 // required for the coupling computation.
-                spd->process.preTimestep(x, t, dt, pcs_idx);
+                spd->process.preTimestep(x, t, dt, process_id);
             }
 
             CoupledSolutionsForStaggeredScheme coupled_solutions(
-                _solutions_of_coupled_processes, dt, pcs_idx);
+                _solutions_of_coupled_processes, dt, process_id);
 
             spd->process.setCoupledSolutionsForStaggeredScheme(
                 &coupled_solutions);
 
             const auto nonlinear_solver_succeeded = solveOneTimeStepOneProcess(
-                pcs_idx, x, timestep_id, t, dt, *spd, *_output);
+                process_id, x, timestep_id, t, dt, *spd, *_output);
             spd->nonlinear_solver_converged = nonlinear_solver_succeeded;
 
             INFO(
                 "[time] Solving process #%u took %g s in time step #%u "
                 " coupling iteration #%u",
-                pcs_idx, time_timestep_process.elapsed(), timestep_id,
+                process_id, time_timestep_process.elapsed(), timestep_id,
                 global_coupling_iteration);
 
             if (!nonlinear_solver_succeeded)
@@ -893,12 +942,11 @@ bool UncoupledProcessesTimeLoop::solveCoupledEquationSystemsByStaggeredScheme(
                 ERR("The nonlinear solver failed in time step #%u at t = %g "
                     "s"
                     " for process #%u.",
-                    timestep_id, t, pcs_idx);
+                    timestep_id, t, process_id);
 
                 // save unsuccessful solution
-                _output->doOutputAlways(spd->process, pcs_idx,
-                                        spd->process_output,
-                                        timestep_id, t, x);
+                _output->doOutputAlways(spd->process, process_id,
+                                        spd->process_output, timestep_id, t, x);
 
                 if (!spd->timestepper->isSolutionErrorComputationNeeded())
                 {
@@ -909,24 +957,21 @@ bool UncoupledProcessesTimeLoop::solveCoupledEquationSystemsByStaggeredScheme(
             }
 
             // Check the convergence of the coupling iteration
-            auto& x_old = *_solutions_of_last_cpl_iteration[pcs_idx];
-            MathLib::LinAlg::axpy(x_old, -1.0, x);
-            _global_coupling_conv_crit->checkResidual(x_old);
-            coupling_iteration_converged =
-                coupling_iteration_converged &&
-                _global_coupling_conv_crit->isSatisfied();
-
-            MathLib::LinAlg::copy(x, x_old);
-
-            if (coupling_iteration_converged)
+            auto& x_old = *_solutions_of_last_cpl_iteration[process_id];
+            if (global_coupling_iteration > 0)
             {
-                break;
+                MathLib::LinAlg::axpy(x_old, -1.0, x);  // save dx to x_old
+                _global_coupling_conv_crit[process_id]->checkDeltaX(x_old, x);
+                coupling_iteration_converged =
+                    coupling_iteration_converged &&
+                    _global_coupling_conv_crit[process_id]->isSatisfied();
             }
+            MathLib::LinAlg::copy(x, x_old);
 
-            ++pcs_idx;
+            ++process_id;
         }  // end of for (auto& spd : _per_process_data)
 
-        if (coupling_iteration_converged)
+        if (coupling_iteration_converged && global_coupling_iteration > 0)
         {
             break;
         }
@@ -945,20 +990,20 @@ bool UncoupledProcessesTimeLoop::solveCoupledEquationSystemsByStaggeredScheme(
             timestep_id, t);
     }
 
-    unsigned pcs_idx = 0;
+    unsigned process_id = 0;
     for (auto& spd : _per_process_data)
     {
         if (spd->skip_time_stepping)
         {
-            ++pcs_idx;
+            ++process_id;
             continue;
         }
         auto& pcs = spd->process;
-        auto& x = *_process_solutions[pcs_idx];
-        pcs.postTimestep(x);
+        auto& x = *_process_solutions[process_id];
+        pcs.postTimestep(x, process_id);
         pcs.computeSecondaryVariable(t, x);
 
-        ++pcs_idx;
+        ++process_id;
     }
 
     {
@@ -977,7 +1022,7 @@ void UncoupledProcessesTimeLoop::outputSolutions(
     unsigned timestep, const double t, OutputClass& output_object,
     OutputClassMember output_class_member) const
 {
-    unsigned pcs_idx = 0;
+    unsigned process_id = 0;
     for (auto& spd : _per_process_data)
     {
         auto& pcs = spd->process;
@@ -985,36 +1030,35 @@ void UncoupledProcessesTimeLoop::outputSolutions(
         // saved.
         if ((!spd->nonlinear_solver_converged) || spd->skip_time_stepping)
         {
-            ++pcs_idx;
+            ++process_id;
             continue;
         }
 
-        auto const& x = *_process_solutions[pcs_idx];
+        auto const& x = *_process_solutions[process_id];
 
         if (output_initial_condition)
         {
             pcs.preTimestep(x, _start_time,
-                            spd->timestepper->getTimeStep().dt(), pcs_idx);
+                            spd->timestepper->getTimeStep().dt(), process_id);
         }
         if (is_staggered_coupling)
         {
             CoupledSolutionsForStaggeredScheme coupled_solutions(
-                _solutions_of_coupled_processes, 0.0, pcs_idx);
+                _solutions_of_coupled_processes, 0.0, process_id);
 
             spd->process.setCoupledSolutionsForStaggeredScheme(
                 &coupled_solutions);
             spd->process.setCoupledTermForTheStaggeredSchemeToLocalAssemblers();
-            (output_object.*output_class_member)(pcs, pcs_idx,
-                                                 spd->process_output,
-                                                 timestep, t, x);
+            (output_object.*output_class_member)(
+                pcs, process_id, spd->process_output, timestep, t, x);
         }
         else
         {
             (output_object.*output_class_member)(
-                pcs, pcs_idx, spd->process_output, timestep, t, x);
+                pcs, process_id, spd->process_output, timestep, t, x);
         }
 
-        ++pcs_idx;
+        ++process_id;
     }
 }
 
diff --git a/ProcessLib/UncoupledProcessesTimeLoop.h b/ProcessLib/UncoupledProcessesTimeLoop.h
index e26ea2d986f2e07566d6860cb019a37e30f7c148..ab83fe3144f5472b4949e87ab250ec1eaef71267 100644
--- a/ProcessLib/UncoupledProcessesTimeLoop.h
+++ b/ProcessLib/UncoupledProcessesTimeLoop.h
@@ -39,7 +39,7 @@ public:
         std::unique_ptr<Output>&& output,
         std::vector<std::unique_ptr<SingleProcessData>>&& per_process_data,
         const unsigned global_coupling_max_iterations,
-        std::unique_ptr<NumLib::ConvergenceCriterion>&&
+        std::vector<std::unique_ptr<NumLib::ConvergenceCriterion>>&&
             global_coupling_conv_crit,
         const double start_time, const double end_time);
 
@@ -71,8 +71,9 @@ private:
 
     /// Maximum iterations of the global coupling.
     const unsigned _global_coupling_max_iterations;
-    /// Convergence criteria of the global coupling iterations.
-    std::unique_ptr<NumLib::ConvergenceCriterion> _global_coupling_conv_crit;
+    /// Convergence criteria of processes for the global coupling iterations.
+    std::vector<std::unique_ptr<NumLib::ConvergenceCriterion>>
+        _global_coupling_conv_crit;
 
     /**
      *  Vector of solutions of the coupled processes.
diff --git a/ProcessLib/VectorMatrixAssembler.cpp b/ProcessLib/VectorMatrixAssembler.cpp
index 8021402d73cd43855bc62cc648d813b4667d93f8..e5b77645e5207b8da8ec167e0e0153da22e7b481 100644
--- a/ProcessLib/VectorMatrixAssembler.cpp
+++ b/ProcessLib/VectorMatrixAssembler.cpp
@@ -10,6 +10,7 @@
 #include "VectorMatrixAssembler.h"
 
 #include <cassert>
+#include <functional>  // for std::reference_wrapper.
 
 #include "NumLib/DOF/DOFTableUtil.h"
 #include "MathLib/LinAlg/Eigen/EigenMapTools.h"
@@ -39,12 +40,22 @@ void VectorMatrixAssembler::preAssemble(
 
 void VectorMatrixAssembler::assemble(
     const std::size_t mesh_item_id, LocalAssemblerInterface& local_assembler,
-    const NumLib::LocalToGlobalIndexMap& dof_table, const double t,
-    const GlobalVector& x, GlobalMatrix& M, GlobalMatrix& K, GlobalVector& b,
-    const CoupledSolutionsForStaggeredScheme* cpl_xs)
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>> const&
+        dof_tables,
+    const double t, const GlobalVector& x, GlobalMatrix& M, GlobalMatrix& K,
+    GlobalVector& b, CoupledSolutionsForStaggeredScheme const* const cpl_xs)
 {
-    auto const indices = NumLib::getIndices(mesh_item_id, dof_table);
+    std::vector<std::vector<GlobalIndexType>> indices_of_processes;
+    indices_of_processes.reserve(dof_tables.size());
+    for (std::size_t i = 0; i < dof_tables.size(); i++)
+    {
+        indices_of_processes.emplace_back(
+            NumLib::getIndices(mesh_item_id, dof_tables[i].get()));
+    }
 
+    auto const& indices = (cpl_xs == nullptr)
+                              ? indices_of_processes[0]
+                              : indices_of_processes[cpl_xs->process_id];
     _local_M_data.clear();
     _local_K_data.clear();
     _local_b_data.clear();
@@ -57,16 +68,18 @@ void VectorMatrixAssembler::assemble(
     }
     else
     {
-        auto local_coupled_xs0 = getPreviousLocalSolutions(*cpl_xs, indices);
-        auto local_coupled_xs = getCurrentLocalSolutions(*cpl_xs, indices);
+        auto local_coupled_xs0 =
+            getPreviousLocalSolutions(*cpl_xs, indices_of_processes);
+        auto local_coupled_xs =
+            getCurrentLocalSolutions(*cpl_xs, indices_of_processes);
 
         ProcessLib::LocalCoupledSolutions local_coupled_solutions(
             cpl_xs->dt, cpl_xs->process_id, std::move(local_coupled_xs0),
             std::move(local_coupled_xs));
 
-        local_assembler.assembleWithCoupledTerm(t, _local_M_data, _local_K_data,
-                                                _local_b_data,
-                                                local_coupled_solutions);
+        local_assembler.assembleForStaggeredScheme(t, _local_M_data,
+                                                   _local_K_data, _local_b_data,
+                                                   local_coupled_solutions);
     }
 
     auto const num_r_c = indices.size();
@@ -92,12 +105,24 @@ void VectorMatrixAssembler::assemble(
 
 void VectorMatrixAssembler::assembleWithJacobian(
     std::size_t const mesh_item_id, LocalAssemblerInterface& local_assembler,
-    NumLib::LocalToGlobalIndexMap const& dof_table, const double t,
-    GlobalVector const& x, GlobalVector const& xdot, const double dxdot_dx,
-    const double dx_dx, GlobalMatrix& M, GlobalMatrix& K, GlobalVector& b,
-    GlobalMatrix& Jac, const CoupledSolutionsForStaggeredScheme* cpl_xs)
+    std::vector<std::reference_wrapper<NumLib::LocalToGlobalIndexMap>> const&
+        dof_tables,
+    const double t, GlobalVector const& x, GlobalVector const& xdot,
+    const double dxdot_dx, const double dx_dx, GlobalMatrix& M, GlobalMatrix& K,
+    GlobalVector& b, GlobalMatrix& Jac,
+    CoupledSolutionsForStaggeredScheme const* const cpl_xs)
 {
-    auto const indices = NumLib::getIndices(mesh_item_id, dof_table);
+    std::vector<std::vector<GlobalIndexType>> indices_of_processes;
+    indices_of_processes.reserve(dof_tables.size());
+    for (std::size_t i = 0; i < dof_tables.size(); i++)
+    {
+        indices_of_processes.emplace_back(
+            NumLib::getIndices(mesh_item_id, dof_tables[i].get()));
+    }
+
+    auto const& indices = (cpl_xs == nullptr)
+                              ? indices_of_processes[0]
+                              : indices_of_processes[cpl_xs->process_id];
     auto const local_xdot = xdot.get(indices);
 
     _local_M_data.clear();
@@ -114,14 +139,16 @@ void VectorMatrixAssembler::assembleWithJacobian(
     }
     else
     {
-        auto local_coupled_xs0 = getPreviousLocalSolutions(*cpl_xs, indices);
-        auto local_coupled_xs = getCurrentLocalSolutions(*cpl_xs, indices);
+        auto local_coupled_xs0 =
+            getPreviousLocalSolutions(*cpl_xs, indices_of_processes);
+        auto local_coupled_xs =
+            getCurrentLocalSolutions(*cpl_xs, indices_of_processes);
 
         ProcessLib::LocalCoupledSolutions local_coupled_solutions(
             cpl_xs->dt, cpl_xs->process_id, std::move(local_coupled_xs0),
             std::move(local_coupled_xs));
 
-        _jacobian_assembler->assembleWithJacobianAndCoupling(
+        _jacobian_assembler->assembleWithJacobianForStaggeredScheme(
             local_assembler, t, local_xdot, dxdot_dx, dx_dx, _local_M_data,
             _local_K_data, _local_b_data, _local_Jac_data,
             local_coupled_solutions);
diff --git a/ProcessLib/VectorMatrixAssembler.h b/ProcessLib/VectorMatrixAssembler.h
index 3e3a23c43b88dfe10fe7d2005308336057036a63..69ef7eec01a9270aa14d1fe549b7f646ee79235f 100644
--- a/ProcessLib/VectorMatrixAssembler.h
+++ b/ProcessLib/VectorMatrixAssembler.h
@@ -44,22 +44,24 @@ public:
     //! \remark Jacobian is not assembled here, see assembleWithJacobian().
     void assemble(std::size_t const mesh_item_id,
                   LocalAssemblerInterface& local_assembler,
-                  NumLib::LocalToGlobalIndexMap const& dof_table,
+                  std::vector<std::reference_wrapper<
+                      NumLib::LocalToGlobalIndexMap>> const& dof_tables,
                   double const t, GlobalVector const& x, GlobalMatrix& M,
                   GlobalMatrix& K, GlobalVector& b,
-                  const CoupledSolutionsForStaggeredScheme* cpl_xs);
+                  CoupledSolutionsForStaggeredScheme const* const cpl_xs);
 
     //! Assembles \c M, \c K, \c b, and the Jacobian \c Jac of the residual.
     //! \note The Jacobian must be assembled.
-    void assembleWithJacobian(std::size_t const mesh_item_id,
-                              LocalAssemblerInterface& local_assembler,
-                              NumLib::LocalToGlobalIndexMap const& dof_table,
-                              const double t, GlobalVector const& x,
-                              GlobalVector const& xdot, const double dxdot_dx,
-                              const double dx_dx, GlobalMatrix& M,
-                              GlobalMatrix& K, GlobalVector& b,
-                              GlobalMatrix& Jac,
-                              const CoupledSolutionsForStaggeredScheme* cpl_xs);
+    void assembleWithJacobian(
+        std::size_t const mesh_item_id,
+        LocalAssemblerInterface& local_assembler,
+        std::vector<
+            std::reference_wrapper<NumLib::LocalToGlobalIndexMap>> const&
+            dof_tables,
+        const double t, GlobalVector const& x, GlobalVector const& xdot,
+        const double dxdot_dx, const double dx_dx, GlobalMatrix& M,
+        GlobalMatrix& K, GlobalVector& b, GlobalMatrix& Jac,
+        CoupledSolutionsForStaggeredScheme const* const cpl_xs);
 
 private:
     // temporary data only stored here in order to avoid frequent memory
diff --git a/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/InjectionProduction1D.prj b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/InjectionProduction1D.prj
new file mode 100644
index 0000000000000000000000000000000000000000..f31d7d4408ee7da140fbc3769f457ae85fb5f6ba
--- /dev/null
+++ b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/InjectionProduction1D.prj
@@ -0,0 +1,294 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<OpenGeoSysProject>
+    <mesh>mesh.vtu</mesh>
+    <geometry>bar.gml</geometry>
+    <processes>
+        <process>
+            <name>InjectionProduction1D</name>
+            <type>HYDRO_MECHANICS</type>
+            <coupling_scheme>staggered</coupling_scheme>
+            <integration_order>3</integration_order>
+            <dimension>2</dimension>
+            <constitutive_relation>
+                <type>LinearElasticIsotropic</type>
+                <youngs_modulus>E</youngs_modulus>
+                <poissons_ratio>nu</poissons_ratio>
+            </constitutive_relation>
+            <intrinsic_permeability>k</intrinsic_permeability>
+            <specific_storage>S</specific_storage>
+            <fluid_viscosity>mu</fluid_viscosity>
+            <fluid_density>rho_f</fluid_density>
+            <biot_coefficient>alpha</biot_coefficient>
+            <porosity>phi</porosity>
+            <solid_density>rho_s</solid_density>
+            <process_variables>
+                <pressure>pressure</pressure>
+                <displacement>displacement</displacement>
+            </process_variables>
+            <secondary_variables>
+                <secondary_variable type="static" internal_name="sigma_xx" output_name="sigma_xx"/>
+                <secondary_variable type="static" internal_name="sigma_yy" output_name="sigma_yy"/>
+                <secondary_variable type="static" internal_name="sigma_zz" output_name="sigma_zz"/>
+                <secondary_variable type="static" internal_name="sigma_xy" output_name="sigma_xy"/>
+                <secondary_variable type="static" internal_name="epsilon_xx" output_name="epsilon_xx"/>
+                <secondary_variable type="static" internal_name="epsilon_yy" output_name="epsilon_yy"/>
+                <secondary_variable type="static" internal_name="epsilon_zz" output_name="epsilon_zz"/>
+                <secondary_variable type="static" internal_name="epsilon_xy" output_name="epsilon_xy"/>
+                <secondary_variable type="static" internal_name="velocity" output_name="velocity"/>
+            </secondary_variables>
+            <specific_body_force>0 0</specific_body_force>
+        </process>
+    </processes>
+    <time_loop>
+        <global_process_coupling>
+            <max_iter> 6 </max_iter>
+            <convergence_criteria>
+                <!-- convergence criterion for the first process -->
+                <convergence_criterion>
+                    <type>DeltaX</type>
+                    <norm_type>NORM2</norm_type>
+                    <reltol>1.e-10</reltol>
+                </convergence_criterion>
+                <!-- convergence criterion for the second process -->
+                <convergence_criterion>
+                    <type>DeltaX</type>
+                    <norm_type>NORM2</norm_type>
+                    <reltol>1.e-10</reltol>
+                </convergence_criterion>
+            </convergence_criteria>
+        </global_process_coupling>
+
+        <processes>
+            <!--For the equations of pressure-->
+            <process ref="InjectionProduction1D">
+                <nonlinear_solver>basic_newton_p</nonlinear_solver>
+                <convergence_criterion>
+                    <type>DeltaX</type>
+                    <norm_type>NORM2</norm_type>
+                    <abstol>1e-6</abstol>
+                </convergence_criterion>
+                <time_discretization>
+                    <type>BackwardEuler</type>
+                </time_discretization>
+                <output>
+                    <variables>
+                        <variable>pressure</variable>
+                        <variable>velocity</variable>
+                    </variables>
+                </output>
+                <time_stepping>
+                    <type>FixedTimeStepping</type>
+                    <t_initial>0</t_initial>
+                    <t_end>8.64e6</t_end>
+                    <timesteps>
+                        <pair>
+                            <repeat>100</repeat>
+                            <delta_t>8.64e4</delta_t>
+                        </pair>
+                    </timesteps>
+                </time_stepping>
+            </process>
+            <!--For the equations of deformation-->
+            <process ref="InjectionProduction1D">
+                <nonlinear_solver>basic_newton_u</nonlinear_solver>
+                <convergence_criterion>
+                    <type>DeltaX</type>
+                    <norm_type>NORM2</norm_type>
+                    <abstol>1e-6</abstol>
+                </convergence_criterion>
+                <time_discretization>
+                    <type>BackwardEuler</type>
+                </time_discretization>
+                <output>
+                    <variables>
+                        <variable>displacement</variable>
+                        <variable>sigma_xx</variable>
+                        <variable>sigma_yy</variable>
+                        <variable>sigma_zz</variable>
+                        <variable>sigma_xy</variable>
+                        <variable>epsilon_xx</variable>
+                        <variable>epsilon_yy</variable>
+                        <variable>epsilon_zz</variable>
+                        <variable>epsilon_xy</variable>
+                    </variables>
+                </output>
+                <time_stepping>
+                    <type>FixedTimeStepping</type>
+                    <t_initial>0</t_initial>
+                    <t_end>8.64e6</t_end>
+                    <timesteps>
+                        <pair>
+                            <repeat>100</repeat>
+                            <delta_t>8.64e4</delta_t>
+                        </pair>
+                    </timesteps>
+                </time_stepping>
+            </process>            
+        </processes>
+        <output>
+            <type>VTK</type>
+            <prefix>InjectionProduction1D</prefix>
+            <timesteps>
+                <pair>
+                    <repeat>1</repeat>
+                    <each_steps>1</each_steps>
+                </pair>
+                <pair>
+                    <repeat>1</repeat>
+                    <each_steps>19</each_steps>
+                </pair>
+                <pair>
+                    <repeat>1</repeat>
+                    <each_steps>100</each_steps>
+                </pair>
+            </timesteps>
+        </output>
+    </time_loop>
+    <parameters>
+        <!-- Mechanics -->
+        <parameter>
+            <name>E</name>
+            <type>Constant</type>
+            <value>5.0e8</value>
+        </parameter>
+        <parameter>
+            <name>nu</name>
+            <type>Constant</type>
+            <value>0.3</value>
+        </parameter>
+        <!-- Model parameters -->
+        <parameter>
+            <name>k</name>
+            <type>Constant</type>
+            <value>4.9346165e-11</value>
+        </parameter>
+        <parameter>
+            <name>S</name>
+            <type>Constant</type>
+            <value>1e-5</value>
+        </parameter>
+        <parameter>
+            <name>mu</name>
+            <type>Constant</type>
+            <value>1e-3</value>
+        </parameter>
+        <parameter>
+            <name>alpha</name>
+            <type>Constant</type>
+            <value>1</value>
+        </parameter>
+        <parameter>
+            <name>phi</name>
+            <type>Constant</type>
+            <value>0.3</value>
+        </parameter>
+        <parameter>
+            <name>rho_s</name>
+            <type>Constant</type>
+            <value>0.0</value>
+        </parameter>
+        <parameter>
+            <name>rho_f</name>
+            <type>Constant</type>
+            <value>1.0e3</value>
+        </parameter>
+        <parameter>
+            <name>displacement0</name>
+            <type>Constant</type>
+            <values>0 0</values>
+        </parameter>
+        <parameter>
+            <name>pressure_ic</name>
+            <type>Constant</type>
+            <values>0.</values>
+        </parameter>
+        <parameter>
+            <name>dirichlet0</name>
+            <type>Constant</type>
+            <value>0</value>
+        </parameter>
+        <parameter>
+            <name>overburden</name>
+            <type>Constant</type>
+            <value>-2.125e6</value>
+        </parameter>
+    </parameters>
+    <process_variables>
+        <process_variable>
+            <name>pressure</name>
+            <components>1</components>
+            <order>1</order>
+            <initial_condition>pressure_ic</initial_condition>
+        </process_variable>
+        <process_variable>
+            <name>displacement</name>
+            <components>2</components>
+            <order>2</order>
+            <initial_condition>displacement0</initial_condition>
+            <boundary_conditions>
+                <boundary_condition>
+                    <geometrical_set>bar</geometrical_set>
+                    <geometry>left</geometry>
+                    <type>Dirichlet</type>
+                    <component>0</component>
+                    <parameter>dirichlet0</parameter>
+                </boundary_condition>
+                <boundary_condition>
+                    <geometrical_set>bar</geometrical_set>
+                    <geometry>right</geometry>
+                    <type>Dirichlet</type>
+                    <component>0</component>
+                    <parameter>dirichlet0</parameter>
+                </boundary_condition>
+                <boundary_condition>
+                    <geometrical_set>bar</geometrical_set>
+                    <geometry>bottom</geometry>
+                    <type>Dirichlet</type>
+                    <component>1</component>
+                    <parameter>dirichlet0</parameter>
+                </boundary_condition>
+                <boundary_condition>
+                    <geometrical_set>bar</geometrical_set>
+                    <geometry>top</geometry>
+                    <type>Neumann</type>
+                    <component>1</component>
+                    <parameter>overburden</parameter>
+                </boundary_condition>
+            </boundary_conditions>
+        </process_variable>
+    </process_variables>
+    <nonlinear_solvers>
+        <nonlinear_solver>
+            <name>basic_newton_p</name>
+            <type>Newton</type>
+            <max_iter>50</max_iter>
+            <linear_solver>linear_solver_p</linear_solver>
+        </nonlinear_solver>
+        <nonlinear_solver>
+            <name>basic_newton_u</name>
+            <type>Newton</type>
+            <max_iter>50</max_iter>
+            <linear_solver>linear_solver_u</linear_solver>
+        </nonlinear_solver>
+    </nonlinear_solvers>
+    <linear_solvers>
+        <linear_solver>
+            <name>linear_solver_p</name>
+            <eigen>
+                <solver_type>CG</solver_type>
+                <precon_type>DIAGONAL</precon_type>
+                <max_iteration_step>10000</max_iteration_step>
+                <error_tolerance>1e-10</error_tolerance>
+            </eigen>
+        </linear_solver>
+        <linear_solver>
+            <name>linear_solver_u</name>
+            <eigen>
+                <solver_type>BiCGSTAB</solver_type>
+                <precon_type>ILUT</precon_type>
+                <max_iteration_step>10000</max_iteration_step>
+                <error_tolerance>1e-16</error_tolerance>
+            </eigen>
+        </linear_solver>
+    </linear_solvers>
+</OpenGeoSysProject>
diff --git a/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu
new file mode 100644
index 0000000000000000000000000000000000000000..6c94bd2950dc5c7ed2b5b763f5f152744ced38dd
--- /dev/null
+++ b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/InjectionProduction1D_Mono_pcs_0_ts_100_t_8640000.000000.vtu
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d227aaa03f90d40b28d1704f593f65abb04e21542b90e9f40b82c240323a6416
+size 9525
diff --git a/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/InjectionProduction1DMono.prj b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/InjectionProduction1DMono.prj
new file mode 100644
index 0000000000000000000000000000000000000000..08315700ec8f30fbce67f3e0e159c8b444228b51
--- /dev/null
+++ b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/InjectionProduction1DMono.prj
@@ -0,0 +1,248 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<OpenGeoSysProject>
+    <mesh>mesh.vtu</mesh>
+    <geometry>bar.gml</geometry>
+    <processes>
+        <process>
+            <name>InjectionProduction1D</name>
+            <type>HYDRO_MECHANICS</type>
+            <integration_order>3</integration_order>
+            <dimension>2</dimension>
+            <constitutive_relation>
+                <type>LinearElasticIsotropic</type>
+                <youngs_modulus>E</youngs_modulus>
+                <poissons_ratio>nu</poissons_ratio>
+            </constitutive_relation>
+            <intrinsic_permeability>k</intrinsic_permeability>
+            <specific_storage>S</specific_storage>
+            <fluid_viscosity>mu</fluid_viscosity>
+            <fluid_density>rho_f</fluid_density>
+            <biot_coefficient>alpha</biot_coefficient>
+            <porosity>phi</porosity>
+            <solid_density>rho_s</solid_density>
+            <process_variables>
+                <pressure>pressure</pressure>
+                <displacement>displacement</displacement>
+            </process_variables>
+            <secondary_variables>
+                <secondary_variable type="static" internal_name="sigma_xx" output_name="sigma_xx"/>
+                <secondary_variable type="static" internal_name="sigma_yy" output_name="sigma_yy"/>
+                <secondary_variable type="static" internal_name="sigma_zz" output_name="sigma_zz"/>
+                <secondary_variable type="static" internal_name="sigma_xy" output_name="sigma_xy"/>
+                <secondary_variable type="static" internal_name="epsilon_xx" output_name="epsilon_xx"/>
+                <secondary_variable type="static" internal_name="epsilon_yy" output_name="epsilon_yy"/>
+                <secondary_variable type="static" internal_name="epsilon_zz" output_name="epsilon_zz"/>
+                <secondary_variable type="static" internal_name="epsilon_xy" output_name="epsilon_xy"/>
+                <secondary_variable type="static" internal_name="velocity" output_name="velocity"/>
+            </secondary_variables>
+            <specific_body_force>0 0</specific_body_force>
+        </process>
+    </processes>
+    <time_loop>
+        <processes>
+            <!--For the equations of deformation-->
+            <process ref="InjectionProduction1D">
+                <nonlinear_solver>basic_newton_u</nonlinear_solver>
+                <convergence_criterion>
+                    <type>DeltaX</type>
+                    <norm_type>NORM2</norm_type>
+                    <abstol>1e-6</abstol>
+                </convergence_criterion>
+                <time_discretization>
+                    <type>BackwardEuler</type>
+                </time_discretization>
+                <output>
+                    <variables>
+                        <variable>displacement</variable>
+ 			<variable>pressure</variable>
+                        <variable>sigma_xx</variable>
+                        <variable>sigma_yy</variable>
+                        <variable>sigma_zz</variable>
+                        <variable>sigma_xy</variable>
+                        <variable>epsilon_xx</variable>
+                        <variable>epsilon_yy</variable>
+                        <variable>epsilon_zz</variable>
+                        <variable>epsilon_xy</variable>
+ 			<variable>velocity</variable>
+                    </variables>
+                </output>
+                <time_stepping>
+                    <type>FixedTimeStepping</type>
+                    <t_initial>0</t_initial>
+                    <t_end>8.64e6</t_end>
+                    <timesteps>
+                        <pair>
+                            <repeat>100</repeat>
+                            <delta_t>8.64e4</delta_t>
+                        </pair>
+                    </timesteps>
+                </time_stepping>
+            </process>            
+        </processes>
+        <output>
+            <type>VTK</type>
+            <prefix>InjectionProduction1D_Mono</prefix>
+            <timesteps>
+                <pair>
+                    <repeat>1</repeat>
+                    <each_steps>1</each_steps>
+                </pair>
+                <pair>
+                    <repeat>1</repeat>
+                    <each_steps>19</each_steps>
+                </pair>
+                <pair>
+                    <repeat>1</repeat>
+                    <each_steps>100</each_steps>
+                </pair>
+            </timesteps>
+        </output>
+    </time_loop>
+    <parameters>
+        <!-- Mechanics -->
+        <parameter>
+            <name>E</name>
+            <type>Constant</type>
+            <value>5.0e8</value>
+        </parameter>
+        <parameter>
+            <name>nu</name>
+            <type>Constant</type>
+            <value>0.3</value>
+        </parameter>
+        <!-- Model parameters -->
+        <parameter>
+            <name>k</name>
+            <type>Constant</type>
+            <value>4.9346165e-11</value>
+        </parameter>
+        <parameter>
+            <name>S</name>
+            <type>Constant</type>
+            <value>1e-5</value>
+        </parameter>
+        <parameter>
+            <name>mu</name>
+            <type>Constant</type>
+            <value>1e-3</value>
+        </parameter>
+        <parameter>
+            <name>alpha</name>
+            <type>Constant</type>
+            <value>1</value>
+        </parameter>
+        <parameter>
+            <name>phi</name>
+            <type>Constant</type>
+            <value>0.3</value>
+        </parameter>
+        <parameter>
+            <name>rho_s</name>
+            <type>Constant</type>
+            <value>0.0</value>
+        </parameter>
+        <parameter>
+            <name>rho_f</name>
+            <type>Constant</type>
+            <value>1.0e3</value>
+        </parameter>
+        <parameter>
+            <name>displacement0</name>
+            <type>Constant</type>
+            <values>0 0</values>
+        </parameter>
+        <parameter>
+            <name>pressure_ic</name>
+            <type>Constant</type>
+            <values>0.</values>
+        </parameter>
+        <parameter>
+            <name>dirichlet0</name>
+            <type>Constant</type>
+            <value>0</value>
+        </parameter>
+        <parameter>
+            <name>oberburden</name>
+            <type>Constant</type>
+            <value>-2.125e6</value>
+        </parameter>
+    </parameters>
+    <process_variables>
+        <process_variable>
+            <name>pressure</name>
+            <components>1</components>
+            <order>1</order>
+            <initial_condition>pressure_ic</initial_condition>
+        </process_variable>
+        <process_variable>
+            <name>displacement</name>
+            <components>2</components>
+            <order>2</order>
+            <initial_condition>displacement0</initial_condition>
+            <boundary_conditions>
+                <boundary_condition>
+                    <geometrical_set>bar</geometrical_set>
+                    <geometry>left</geometry>
+                    <type>Dirichlet</type>
+                    <component>0</component>
+                    <parameter>dirichlet0</parameter>
+                </boundary_condition>
+                <boundary_condition>
+                    <geometrical_set>bar</geometrical_set>
+                    <geometry>right</geometry>
+                    <type>Dirichlet</type>
+                    <component>0</component>
+                    <parameter>dirichlet0</parameter>
+                </boundary_condition>
+                <boundary_condition>
+                    <geometrical_set>bar</geometrical_set>
+                    <geometry>bottom</geometry>
+                    <type>Dirichlet</type>
+                    <component>1</component>
+                    <parameter>dirichlet0</parameter>
+                </boundary_condition>
+                <boundary_condition>
+                    <geometrical_set>bar</geometrical_set>
+                    <geometry>top</geometry>
+                    <type>Neumann</type>
+                    <component>1</component>
+                    <parameter>oberburden</parameter>
+                </boundary_condition>
+            </boundary_conditions>
+        </process_variable>
+    </process_variables>
+    <nonlinear_solvers>
+        <nonlinear_solver>
+            <name>basic_newton_p</name>
+            <type>Newton</type>
+            <max_iter>50</max_iter>
+            <linear_solver>linear_solver_p</linear_solver>
+        </nonlinear_solver>
+        <nonlinear_solver>
+            <name>basic_newton_u</name>
+            <type>Newton</type>
+            <max_iter>50</max_iter>
+            <linear_solver>linear_solver_u</linear_solver>
+        </nonlinear_solver>
+    </nonlinear_solvers>
+    <linear_solvers>
+        <linear_solver>
+            <name>linear_solver_p</name>
+            <eigen>
+                <solver_type>CG</solver_type>
+                <precon_type>DIAGONAL</precon_type>
+                <max_iteration_step>10000</max_iteration_step>
+                <error_tolerance>1e-10</error_tolerance>
+            </eigen>
+        </linear_solver>
+        <linear_solver>
+            <name>linear_solver_u</name>
+            <eigen>
+                <solver_type>BiCGSTAB</solver_type>
+                <precon_type>ILUT</precon_type>
+                <max_iteration_step>10000</max_iteration_step>
+                <error_tolerance>1e-16</error_tolerance>
+            </eigen>
+        </linear_solver>
+    </linear_solvers>
+</OpenGeoSysProject>
diff --git a/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu
new file mode 100644
index 0000000000000000000000000000000000000000..29a5bce11e77cda2fe8508405ebb6dd35bffb678
--- /dev/null
+++ b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/InjectionProduction1D_pcs_1_ts_100_t_8640000.000000.vtu
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:532cb10c716f1f4253ab3637e8109d6f5e15ddee1e6b7d55d3bd533fea74886d
+size 9562
diff --git a/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/bar.gml b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/bar.gml
new file mode 100644
index 0000000000000000000000000000000000000000..3fefc7023a17f36aacddd41908416b312729a9de
--- /dev/null
+++ b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/bar.gml
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a01e45480094978e75ac805044d7f57ae56db9a9a6368ec7f4addda05fd10816
+size 1007
diff --git a/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/mesh.vtu b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/mesh.vtu
new file mode 100644
index 0000000000000000000000000000000000000000..7c0caf12a33726da3aa6213e2c83783cd79dd6ab
--- /dev/null
+++ b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/RerefenceSolutionByMonolithicScheme/mesh.vtu
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e053430e4ab5cf4670cbf3f656f51f117cfab2cdb9ed04f6ca82d93b2b3e4bd6
+size 2777
diff --git a/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/bar.gml b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/bar.gml
new file mode 100644
index 0000000000000000000000000000000000000000..3fefc7023a17f36aacddd41908416b312729a9de
--- /dev/null
+++ b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/bar.gml
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a01e45480094978e75ac805044d7f57ae56db9a9a6368ec7f4addda05fd10816
+size 1007
diff --git a/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/mesh.vtu b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/mesh.vtu
new file mode 100644
index 0000000000000000000000000000000000000000..7c0caf12a33726da3aa6213e2c83783cd79dd6ab
--- /dev/null
+++ b/Tests/Data/HydroMechanics/StaggeredScheme/InjectionProduction1D/mesh.vtu
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e053430e4ab5cf4670cbf3f656f51f117cfab2cdb9ed04f6ca82d93b2b3e4bd6
+size 2777
diff --git a/Tests/Data/Parabolic/HT/StaggeredCoupling/ConstViscosity/square_5500x5500.vtu b/Tests/Data/Parabolic/HT/StaggeredCoupling/ConstViscosity/square_5500x5500.vtu
index 3df5c5201c9e653cb43c9809d103ffff3390aacc..daa22a7fd61f65b64d5e4f7e8048a414d0fa0af0 100644
--- a/Tests/Data/Parabolic/HT/StaggeredCoupling/ConstViscosity/square_5500x5500.vtu
+++ b/Tests/Data/Parabolic/HT/StaggeredCoupling/ConstViscosity/square_5500x5500.vtu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6f5d32a50ce7d3eddd0f4f4b1f33de2230d2e49a24aa1b55511222e3cf038f68
+oid sha256:5938bdfce5fdd7675654514a0e2c26ff1b8fbdf419e02cef8ffc48da15f13aa2
 size 138816
diff --git a/Tests/Data/Parabolic/HT/StaggeredCoupling/ConstViscosity/square_5500x5500_staggered_scheme.prj b/Tests/Data/Parabolic/HT/StaggeredCoupling/ConstViscosity/square_5500x5500_staggered_scheme.prj
index fe303865d284a2b490b21ad3db57d5d5b100fa6a..eee8f7a9319ab9f3ad3ebb8ae15936c96a7ba851 100644
--- a/Tests/Data/Parabolic/HT/StaggeredCoupling/ConstViscosity/square_5500x5500_staggered_scheme.prj
+++ b/Tests/Data/Parabolic/HT/StaggeredCoupling/ConstViscosity/square_5500x5500_staggered_scheme.prj
@@ -57,11 +57,20 @@
     <time_loop>
         <global_process_coupling>
             <max_iter> 6 </max_iter>
-            <convergence_criterion>
-                <type>Residual</type>
-                <norm_type>NORM2</norm_type>
-                <reltol>1.e-10</reltol>
-            </convergence_criterion>
+            <convergence_criteria>
+                <!-- convergence criterion for the first process -->
+                <convergence_criterion>
+                    <type>DeltaX</type>
+                    <norm_type>NORM2</norm_type>
+                    <reltol>1.e-14</reltol>
+                </convergence_criterion>
+                <!-- convergence criterion for the second process -->
+                <convergence_criterion>
+                    <type>DeltaX</type>
+                    <norm_type>NORM2</norm_type>
+                    <reltol>1.e-14</reltol>
+                </convergence_criterion>
+            </convergence_criteria>
         </global_process_coupling>
 
         <processes>
diff --git a/Tests/NumLib/ODEs.h b/Tests/NumLib/ODEs.h
index 099e7150ace33633c9be9d9ef4fdb283a9615bf3..cd12d81fd52e00347a5b25fca1cede671843fc3b 100644
--- a/Tests/NumLib/ODEs.h
+++ b/Tests/NumLib/ODEs.h
@@ -38,9 +38,9 @@ public:
     }
 
     void assembleWithJacobian(const double t, GlobalVector const& x_curr,
-                              GlobalVector const& /*xdot*/, const double dxdot_dx,
-                              const double dx_dx, GlobalMatrix& M,
-                              GlobalMatrix& K, GlobalVector& b,
+                              GlobalVector const& /*xdot*/,
+                              const double dxdot_dx, const double dx_dx,
+                              GlobalMatrix& M, GlobalMatrix& K, GlobalVector& b,
                               GlobalMatrix& Jac) override
     {
         namespace LinAlg = MathLib::LinAlg;
@@ -51,13 +51,15 @@ public:
         LinAlg::finalizeAssembly(M);
         LinAlg::copy(M, Jac);
         LinAlg::scale(Jac, dxdot_dx);
-        if (dx_dx != 0.0) {
+        if (dx_dx != 0.0)
+        {
             LinAlg::finalizeAssembly(K);
             LinAlg::axpy(Jac, dx_dx, K);
         }
     }
 
-    MathLib::MatrixSpecifications getMatrixSpecifications() const override
+    MathLib::MatrixSpecifications getMatrixSpecifications(
+        const int /*process_id*/) const override
     {
         return { N, N, nullptr, nullptr };
     }
@@ -140,7 +142,8 @@ public:
         }
     }
 
-    MathLib::MatrixSpecifications getMatrixSpecifications() const override
+    MathLib::MatrixSpecifications getMatrixSpecifications(
+        const int /*process_id*/) const override
     {
         return { N, N, nullptr, nullptr };
     }
@@ -266,7 +269,8 @@ public:
         // INFO("Det J: %e <<<", J.determinant());
     }
 
-    MathLib::MatrixSpecifications getMatrixSpecifications() const override
+    MathLib::MatrixSpecifications getMatrixSpecifications(
+        const int /*process_id*/) const override
     {
         return { N, N, nullptr, nullptr };
     }
diff --git a/Tests/NumLib/TestODEInt.cpp b/Tests/NumLib/TestODEInt.cpp
index b14c79d00acae3d14d8a3d4ae0f3eb6b678ddc72..396ab73532d554a052311e765d6d082e30243870 100644
--- a/Tests/NumLib/TestODEInt.cpp
+++ b/Tests/NumLib/TestODEInt.cpp
@@ -68,8 +68,9 @@ public:
 
         Solution sol;
 
+        const int process_id = 0;
         NumLib::TimeDiscretizedODESystem<ODE_::ODETag, NLTag>
-                ode_sys(ode, timeDisc);
+                ode_sys(process_id, ode, timeDisc);
 
         auto linear_solver = createLinearSolver();
         auto conv_crit = std::make_unique<NumLib::ConvergenceCriterionDeltaX>(
@@ -90,7 +91,7 @@ public:
              delta_t);
 
         // initial condition
-        GlobalVector x0(ode.getMatrixSpecifications().nrows);
+        GlobalVector x0(ode.getMatrixSpecifications(process_id).nrows);
         ODET::setIC(x0);
 
         sol.ts.push_back(t0);
diff --git a/web/content/docs/benchmarks/hydro-mechanics/InjectionProduction.md b/web/content/docs/benchmarks/hydro-mechanics/InjectionProduction.md
new file mode 100644
index 0000000000000000000000000000000000000000..60e79510b25bd48612e21a964dd7d91e52151c2f
--- /dev/null
+++ b/web/content/docs/benchmarks/hydro-mechanics/InjectionProduction.md
@@ -0,0 +1,93 @@
++++
+date = "2017-12-29"
+title = "StaggeredScheme"
+weight = 151
+project = "HydroMechanics/StaggeredScheme/InjectionProduction1D/InjectionProduction1D.prj"
+author = "Wenqing Wang"
+
+[menu]
+  [menu.benchmarks]
+    parent = "hydro-mechanics"
+
++++
+
+{{< data-link >}}
+
+---
+#     Consolidation example based on fluid injection and production    application
+
+{{< img src="../InjectionProduction.png" >}}
+
+Based on one of  the examples about injection and
+production well that is present by Kim et al. \cite kimTchJua2011, this benchmark
+ is used to verify the staggered scheme, which
+  is implemented in OGS-6 for modelling of the coupled hydro-mechanical (HM)
+ processes in the porous media. The problem of the example is defined
+ in a 10 x 150 m  <span class="math inline"><em></em><sup>2</sup></span>
+domain (as shown in the figure). The deformation is solved under the plane strain
+ assumption. Initially, the pore pressure and the
+ stress are set to zero. On all boundaries no fluid flux condition is
+ applied for the hydraulic process. On the lateral
+and the bottom boundaries, zero normal displacement is prescribed. On
+the top surface a vertical traction boundary condition of 2.125 MP is
+applied. The gravity related terms are neglected in both: the Darcy velocity
+ and the momentum balance equation.
+
+The material properties are shown in the following table.
+<table>
+<caption>Material properties</caption>
+<thead>
+<tr class="header">
+<th align="left">Property</th>
+<th align="left">Value</th>
+<th align="left">Unit</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td align="left">Water density</td>
+<td align="left">1,000</td>
+<td align="left">kg/m<span class="math inline"><em></em><sup>3</sup></span></td>
+</tr>
+<tr class="even">
+<td align="left">Porosity</td>
+<td align="left">0.3</td>
+<td align="left">–</td>
+</tr>
+<tr class="odd">
+<td align="left">Viscosity</td>
+<td align="left"><span class="math inline">10<sup>−3</sup></span></td>
+<td align="left">Pa<span class="math inline">â‹…</span>s</td>
+</tr>
+<tr class="even">
+<td align="left">Specific storage</td>
+<td align="left"><span class="math inline">10<sup>−4</sup></span></td>
+<td align="left">m <span class="math inline"><em></em><sup>−1</sup></span></td>
+</tr>
+<tr class="odd">
+<td align="left">Intrinsic permeability</td>
+<td align="left"><span class="math inline">4.9346165<em>e</em> × 10<sup>−11</sup></span></td>
+<td align="left">m<span class="math inline"><em></em><sup>2</sup></span></td>
+</tr>
+<tr class="even">
+<td align="left">Young’s modulus</td>
+<td align="left"><span class="math inline">5 × 10<sup>8</sup></span></td>
+<td align="left">Pa</td>
+</tr>
+<tr class="odd">
+<td align="left">Poisson ratio</td>
+<td align="left">0.3</td>
+<td align="left">–</td>
+</tr>
+</tbody>
+</table>
+
+The time duration is 8.64 <span class="math inline">â‹…10<sup>6</sup></span> s, 
+and the time step size is 8.64 <span class="math inline">â‹…10<sup>4</sup></span> s.
+ For the verification the example is also solved by the monolithic
+scheme. The displacement solution at the last
+time step is shown in the enclosed figure too.
+
+## References
+
+{{< bib id="kimTchJua2011" >}}
diff --git a/web/content/docs/benchmarks/hydro-mechanics/InjectionProduction.png b/web/content/docs/benchmarks/hydro-mechanics/InjectionProduction.png
new file mode 100644
index 0000000000000000000000000000000000000000..0ff9f1d15c1bfd1e2046802b0ea15230e70a78f8
--- /dev/null
+++ b/web/content/docs/benchmarks/hydro-mechanics/InjectionProduction.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53c20caf2b381e6b504fdb5ebfa79b6d18c50f1e5f1d68ef6e07784e4ab025d2
+size 15884