diff --git a/ProcessLib/Assembly/MatrixAssemblyStats.h b/ProcessLib/Assembly/MatrixAssemblyStats.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ebc066effd64d388e8172686734237a38c2149d
--- /dev/null
+++ b/ProcessLib/Assembly/MatrixAssemblyStats.h
@@ -0,0 +1,135 @@
+/**
+ * \file
+ * \copyright
+ * Copyright (c) 2012-2023, OpenGeoSys Community (http://www.opengeosys.org)
+ *            Distributed under a Modified BSD License.
+ *              See accompanying file LICENSE.txt or
+ *              http://www.opengeosys.org/project/license
+ *
+ */
+
+#pragma once
+
+#include <memory>
+
+#include "BaseLib/Logging.h"
+
+namespace ProcessLib::Assembly
+{
+struct Stats
+{
+    std::size_t count = 0;
+    std::size_t count_nonzero = 0;
+    std::size_t count_global = 0;
+
+    Stats& operator+=(Stats const& other)
+    {
+        count += other.count;
+        count_nonzero += other.count_nonzero;
+        count_global += other.count_global;
+
+        return *this;
+    }
+
+    void print(std::string const& matrix_or_vector_name) const
+    {
+        DBUG("Stats [{}]: {} elements added to the matrix cache.",
+             matrix_or_vector_name,
+             count);
+        DBUG("Stats [{}]: {} nonzero elements added to the matrix cache.",
+             matrix_or_vector_name,
+             count_nonzero);
+        DBUG("Stats [{}]: {} elements added to the global matrix.",
+             matrix_or_vector_name,
+             count_global);
+    }
+};
+
+struct MultiStats
+{
+    Stats M;
+    Stats K;
+    Stats b;
+    Stats Jac;
+
+    MultiStats& operator+=(MultiStats const& other)
+    {
+        M += other.M;
+        K += other.K;
+        b += other.b;
+        Jac += other.Jac;
+
+        return *this;
+    }
+
+    void print() const
+    {
+        M.print("M");
+        K.print("K");
+        b.print("b");
+        Jac.print("J");
+    }
+};
+
+template <typename Data>
+class CumulativeStats
+    : public std::enable_shared_from_this<CumulativeStats<Data>>
+{
+    using Base = std::enable_shared_from_this<CumulativeStats<Data>>;
+
+public:
+    Data data;
+
+    static std::shared_ptr<CumulativeStats<Data>> create()
+    {
+        return std::shared_ptr<CumulativeStats<Data>>(
+            new CumulativeStats<Data>());
+    }
+
+    // Could return unique_ptr, but shared_ptr is more consistent with the
+    // create() method.
+    std::shared_ptr<CumulativeStats<Data>> clone()
+    {
+        return std::make_shared<CumulativeStats<Data>>(*this);
+    }
+
+    CumulativeStats(CumulativeStats<Data> const& other) = delete;
+
+    CumulativeStats(CumulativeStats<Data>& other)
+        : Base{other},
+          data{},
+          parent_{other.parent_ ? other.parent_ : other.shared_from_this()},
+          parent_mutex_{other.parent_mutex_}
+    {
+    }
+
+    CumulativeStats(CumulativeStats<Data>&& other)
+        : parent_{std::move(other.parent_)},
+          parent_mutex_{std::move(other.parent_mutex_)}
+    {
+        std::swap(data, other.data);
+    }
+
+    ~CumulativeStats()
+    {
+        if (!parent_)
+        {
+            return;
+        }
+
+        std::lock_guard<std::mutex> const lock(*parent_mutex_);
+
+        DBUG("Adding cumulative stats to parent.");
+
+        parent_->data += data;
+    }
+
+    void print() const { data.print(); }
+
+private:
+    CumulativeStats() : parent_mutex_{std::make_shared<std::mutex>()} {}
+
+    std::shared_ptr<CumulativeStats<Data>> parent_;
+    std::shared_ptr<std::mutex> parent_mutex_;
+};
+}  // namespace ProcessLib::Assembly
diff --git a/ProcessLib/Assembly/MatrixElementCache.h b/ProcessLib/Assembly/MatrixElementCache.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca2e2cb1d68d797fb9bcf3ce8fa90ad09797e19c
--- /dev/null
+++ b/ProcessLib/Assembly/MatrixElementCache.h
@@ -0,0 +1,308 @@
+/**
+ * \file
+ * \copyright
+ * Copyright (c) 2012-2023, OpenGeoSys Community (http://www.opengeosys.org)
+ *            Distributed under a Modified BSD License.
+ *              See accompanying file LICENSE.txt or
+ *              http://www.opengeosys.org/project/license
+ *
+ */
+
+#pragma once
+
+#include <algorithm>
+
+#include "MathLib/LinAlg/Eigen/EigenMapTools.h"
+#include "MathLib/LinAlg/GlobalMatrixVectorTypes.h"
+#include "ProcessLib/Assembly/MatrixAssemblyStats.h"
+
+namespace ProcessLib::Assembly
+{
+namespace detail
+{
+#ifdef USE_PETSC
+inline GlobalIndexType transformToNonGhostIndex(GlobalIndexType const i,
+                                                GlobalIndexType const n)
+{
+    if (i == -n)
+    {
+        return 0;
+    }
+    else
+    {
+        return std::abs(i);
+    }
+}
+#else
+inline GlobalIndexType transformToNonGhostIndex(GlobalIndexType const i,
+                                                GlobalIndexType const /*n*/)
+{
+    return i;
+}
+#endif
+}  // namespace detail
+
+template <std::size_t Dim>
+struct MatrixElementCacheEntry
+{
+    MatrixElementCacheEntry() = default;
+    MatrixElementCacheEntry(MatrixElementCacheEntry const&) = default;
+    MatrixElementCacheEntry(MatrixElementCacheEntry&&) = default;
+
+    MatrixElementCacheEntry(std::array<GlobalIndexType, Dim> const& indices,
+                            double value)
+        : indices{indices}, value{value}
+    {
+    }
+
+    MatrixElementCacheEntry& operator=(MatrixElementCacheEntry const&) =
+        default;
+    MatrixElementCacheEntry& operator=(MatrixElementCacheEntry&&) = default;
+
+    static bool compareIndices(MatrixElementCacheEntry<Dim> const& a,
+                               MatrixElementCacheEntry<Dim> const& b)
+    {
+        auto const& ia = a.indices;
+        auto const& ib = b.indices;
+
+        return std::lexicographical_compare(ia.begin(), ia.end(), ib.begin(),
+                                            ib.end());
+    }
+
+    bool indicesEqualTo(MatrixElementCacheEntry<Dim> const& other) const
+    {
+        return std::equal(indices.begin(), indices.end(),
+                          other.indices.begin());
+    }
+
+    std::array<GlobalIndexType, Dim> indices;
+    double value;
+};
+
+template <std::size_t Dim>
+class ConcurrentMatrixView
+{
+    static_assert(Dim == 1 || Dim == 2);
+    using GlobalMatOrVec =
+        std::conditional_t<Dim == 1, GlobalVector, GlobalMatrix>;
+
+public:
+    explicit ConcurrentMatrixView(GlobalMatOrVec& mat_or_vec)
+        : mat_or_vec_{mat_or_vec}
+    {
+    }
+
+    void add(std::vector<MatrixElementCacheEntry<Dim>> const& entries)
+    {
+        std::lock_guard<std::mutex> const lock(mutex_);
+
+        if constexpr (Dim == 2)
+        {
+            auto const n_cols = mat_or_vec_.getNumberOfColumns();
+
+            // TODO would be more efficient if our global matrix and vector
+            // implementations supported batch addition of matrix elements with
+            // arbitrary indices (not restricted to (n x m) shaped submatrices).
+            for (auto const [rc, value] : entries)
+            {
+                auto const [r, c] = rc;
+
+                assert(r >= 0);  // rows must not be ghost indices
+                auto const c_no_ghost =
+                    detail::transformToNonGhostIndex(c, n_cols);
+
+                mat_or_vec_.add(r, c_no_ghost, value);
+            }
+        }
+        else
+        {
+            // TODO batch addition would be more efficient. That needs the
+            // refactoring of the matrix element cache.
+            for (auto const [r, value] : entries)
+            {
+                mat_or_vec_.add(r.front(), value);
+            }
+        }
+    }
+
+private:
+    std::mutex mutex_;
+    GlobalMatOrVec& mat_or_vec_;
+};
+
+explicit ConcurrentMatrixView(GlobalVector&)->ConcurrentMatrixView<1>;
+explicit ConcurrentMatrixView(GlobalMatrix&)->ConcurrentMatrixView<2>;
+
+template <std::size_t Dim>
+class MatrixElementCache final
+{
+    static_assert(Dim == 1 || Dim == 2);
+    using GlobalMatView = ConcurrentMatrixView<Dim>;
+
+    static constexpr std::size_t cache_capacity = 1'000'000;
+
+public:
+    MatrixElementCache(GlobalMatView& mat_or_vec, Stats& stats)
+        : mat_or_vec_(mat_or_vec), stats_(stats)
+    {
+        cache_.reserve(cache_capacity);
+    }
+
+    void add(std::vector<double> const& local_data,
+             std::vector<GlobalIndexType> const& indices)
+    {
+        addToCache(local_data, indices);
+    }
+
+    ~MatrixElementCache() { addToGlobal(); }
+
+private:
+    void addToCache(std::vector<double> const& values,
+                    std::vector<GlobalIndexType> const& indices)
+    {
+        if (values.empty())
+        {
+            return;
+        }
+
+        ensureEnoughSpace(values.size());
+
+        addToCacheImpl(values, indices,
+                       std::integral_constant<std::size_t, Dim>{});
+    }
+
+    // Overload for vectors.
+    void addToCacheImpl(std::vector<double> const& values,
+                        std::vector<GlobalIndexType> const& indices,
+                        std::integral_constant<std::size_t, 1>)
+    {
+        auto const num_r_c = indices.size();
+
+        for (std::size_t r_local = 0; r_local < num_r_c; ++r_local)
+        {
+            ++stats_.count;
+            auto const value = values[r_local];
+
+            if (value == 0)
+            {
+                continue;
+            }
+            else
+            {
+                ++stats_.count_nonzero;
+            }
+
+            auto const r_global = indices[r_local];
+            cache_.emplace_back(std::array{r_global}, value);
+        }
+    }
+
+    // Overload for matrices.
+    void addToCacheImpl(std::vector<double> const& values,
+                        std::vector<GlobalIndexType> const& indices,
+                        std::integral_constant<std::size_t, 2>)
+    {
+        auto const num_r_c = indices.size();
+
+        // Note: There is an implicit storage order assumption, here!
+        auto const local_mat = MathLib::toMatrix(values, num_r_c, num_r_c);
+
+        for (std::size_t r_local = 0; r_local < num_r_c; ++r_local)
+        {
+            auto const r_global = indices[r_local];
+
+            for (std::size_t c_local = 0; c_local < num_r_c; ++c_local)
+            {
+                ++stats_.count;
+                auto const value = local_mat(r_local, c_local);
+
+                // TODO skipping zero values sometimes does not work together
+                // with the Eigen SparseLU linear solver. See also
+                // https://gitlab.opengeosys.org/ogs/ogs/-/merge_requests/4556#note_125561
+#if 0
+                if (value == 0)
+                {
+                    continue;
+                }
+#endif
+                ++stats_.count_nonzero;
+
+                auto const c_global = indices[c_local];
+                cache_.emplace_back(std::array{r_global, c_global}, value);
+            }
+        }
+    }
+
+    void ensureEnoughSpace(std::size_t const space_needed)
+    {
+        auto const size_initial = cache_.size();
+        auto const cap_initial = cache_.capacity();
+
+        if (size_initial + space_needed <= cap_initial)
+        {
+            return;
+        }
+
+        addToGlobal();
+
+        // ensure again that there is enough capacity (corner case, if initial
+        // capacity is too small because of whatever arcane reason)
+        auto const size_after = cache_.size();
+        auto const cap_after = cache_.capacity();
+
+        if (size_after + space_needed > cap_after)
+        {
+            cache_.reserve(size_after + 2 * space_needed);
+        }
+    }
+
+    void addToGlobal()
+    {
+        std::sort(cache_.begin(), cache_.end(),
+                  MatrixElementCacheEntry<Dim>::compareIndices);
+
+        mat_or_vec_.add(cache_);
+        stats_.count_global += cache_.size();
+        cache_.clear();
+    }
+
+    std::vector<MatrixElementCacheEntry<Dim>> cache_;
+    GlobalMatView& mat_or_vec_;
+    Stats& stats_;
+};
+
+class MultiMatrixElementCache final
+{
+    using GlobalMatrixView = ConcurrentMatrixView<2>;
+    using GlobalVectorView = ConcurrentMatrixView<1>;
+
+public:
+    MultiMatrixElementCache(GlobalMatrixView& M, GlobalMatrixView& K,
+                            GlobalVectorView& b, GlobalMatrixView& Jac,
+                            MultiStats& stats)
+        : cache_M_(M, stats.M),
+          cache_K_(K, stats.K),
+          cache_b_(b, stats.b),
+          cache_Jac_(Jac, stats.Jac)
+    {
+    }
+
+    void add(std::vector<double> const& local_M_data,
+             std::vector<double> const& local_K_data,
+             std::vector<double> const& local_b_data,
+             std::vector<double> const& local_Jac_data,
+             std::vector<GlobalIndexType> const& indices)
+    {
+        cache_M_.add(local_M_data, indices);
+        cache_K_.add(local_K_data, indices);
+        cache_b_.add(local_b_data, indices);
+        cache_Jac_.add(local_Jac_data, indices);
+    }
+
+private:
+    MatrixElementCache<2> cache_M_;
+    MatrixElementCache<2> cache_K_;
+    MatrixElementCache<1> cache_b_;
+    MatrixElementCache<2> cache_Jac_;
+};
+}  // namespace ProcessLib::Assembly