diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b13842f6..f98e65d0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,9 +2,9 @@ name: build on: push: - branches: [ unstable ] + branches: [ unstable, '[0-9]+.[0-9]+.x' ] pull_request: - branches: [ unstable ] + branches: [ unstable, '[0-9]+.[0-9]+.x' ] workflow_call: workflow_dispatch: @@ -25,107 +25,100 @@ jobs: strategy: fail-fast: false matrix: - include: - - {os: ubuntu-22.04, cc: gcc-12, cxx: g++-12, doc: OFF} - - {os: ubuntu-22.04, cc: clang-15, cxx: clang++-15, doc: ON} - - {os: macos-12, cc: gcc-12, cxx: g++-12, doc: OFF} - - {os: macos-12, cc: clang, cxx: clang++, doc: OFF} + os: [ubuntu-24.04, macos-15] + cxx: [g++, clang++] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - - uses: actions/cache/restore@v3 + - uses: actions/cache/restore@v4 with: path: ${{ env.CCACHE_DIR }} - key: ccache-${{ matrix.os }}-${{ matrix.cc }}-${{ github.run_id }} - restore-keys: - ccache-${{ matrix.os }}-${{ matrix.cc }}- + key: ccache-${{ matrix.os }}-${{ matrix.cxx }}-${{ github.run_id }}-${{ github.run_attempt }} + restore-keys: | + ccache-${{ matrix.os }}-${{ matrix.cxx }}- + + - name: Set cxx variables + run: | + if [[ ${{ matrix.os }} == 'macos-15' && ${{ matrix.cxx }} == 'g++' ]]; then + echo "CXX=g++-15" >> $GITHUB_ENV + else + echo "CXX=${{ matrix.cxx }}" >> $GITHUB_ENV + fi - name: Install ubuntu dependencies - if: matrix.os == 'ubuntu-22.04' + if: ${{ contains(matrix.os, 'ubuntu') }} run: > sudo apt-get update && sudo apt-get install lsb-release wget software-properties-common && - wget -O /tmp/llvm.sh https://apt.llvm.org/llvm.sh && sudo chmod +x /tmp/llvm.sh && sudo /tmp/llvm.sh 15 && sudo apt-get install ccache - clang-15 - g++-12 + cmake + ninja-build + clang + clang-tools + g++ gfortran hdf5-tools - libblas-dev - libclang-15-dev - libc++-15-dev - libc++abi-15-dev - libomp-15-dev + libboost-dev + libclang-dev + libc++-dev + libc++abi-dev + libomp-dev libfftw3-dev libgfortran5 libgmp-dev libhdf5-dev - liblapack-dev + libopenblas-dev libopenmpi-dev openmpi-bin openmpi-common openmpi-doc - python3-clang-15 + python3-clang python3-dev python3-mako python3-mpi4py python3-numpy python3-pip python3-scipy - python3-sphinx - python3-nbsphinx + python3-ipython - - name: Install homebrew dependencies - if: matrix.os == 'macos-12' + - name: Set up virtualenv run: | - brew install ccache gcc@12 llvm hdf5 open-mpi openblas - mkdir $HOME/.venv - python3 -m venv $HOME/.venv/my_python + mkdir -p $HOME/.venv + python3 -m venv --system-site-packages $HOME/.venv/my_python source $HOME/.venv/my_python/bin/activate - pip install mako numpy scipy mpi4py - pip install -r requirements.txt echo "VIRTUAL_ENV=$VIRTUAL_ENV" >> $GITHUB_ENV echo "PATH=$PATH" >> $GITHUB_ENV - - name: Build doxygen - if: matrix.doc == 'ON' - env: - CC: ${{ matrix.cc }} - CXX: ${{ matrix.cxx }} - LIBRARY_PATH: /usr/local/opt/llvm/lib + - name: Install homebrew dependencies + if: ${{ contains(matrix.os, 'macos') }} run: | - cd $HOME - git clone https://github.com/doxygen/doxygen.git - cd doxygen - git checkout 0a7e79813 - mkdir build - cd build - cmake .. 
-Duse_libclang=ON -Dstatic_libclang=ON -Duse_libc++=OFF -DLLVM_ROOT=/usr/lib/llvm-15/lib/cmake/llvm -DClang_ROOT=/usr/lib/llvm-15/lib/cmake/clang
-          make -j 2 VERBOSE=1
-          cp bin/doxygen /usr/local/bin/doxygen
+          brew update
+          brew install ccache gcc llvm hdf5 open-mpi openblas doxygen
+          pip install mako numpy scipy mpi4py
+          pip install -r requirements.txt
+          echo "PATH=$(brew --prefix llvm)/bin:$(brew --prefix gcc)/bin:$PATH" >> $GITHUB_ENV
+          echo "PYTHONPATH=$(brew --prefix llvm)/lib/python3.13/site-packages" >> $GITHUB_ENV
+          echo "SDKROOT=$(xcrun --show-sdk-path)" >> $GITHUB_ENV
+          echo "LDFLAGS=-L$(brew --prefix llvm)/lib/c++ -L/opt/homebrew/opt/llvm/lib/unwind -lunwind" >> $GITHUB_ENV

-      - name: add clang cxxflags
-        if: ${{ contains(matrix.cxx, 'clang') }}
+      - name: Add clang CXXFLAGS
+        if: ${{ matrix.cxx == 'clang++' }}
         run: |
-          echo "PATH=/usr/local/opt/llvm/bin:$PATH" >> $GITHUB_ENV
           echo "CXXFLAGS=-stdlib=libc++" >> $GITHUB_ENV

       - name: Build mpi
         env:
-          CC: ${{ matrix.cc }}
-          CXX: ${{ matrix.cxx }}
-          LIBRARY_PATH: /usr/local/opt/llvm/lib
+          BUILD_DOXYGEN_DOCS: ${{ matrix.os == 'macos-15' && matrix.cxx == 'clang++' && 'ON' || 'OFF' }}
         run: |
-          mkdir build && cd build && cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/install -DBuild_Documentation=${{ matrix.doc }}
-          make -j2 || make -j1 VERBOSE=1
+          cmake -S . -B build -G Ninja -DCMAKE_INSTALL_PREFIX=$HOME/install -DBuild_Documentation=$BUILD_DOXYGEN_DOCS
+          cmake --build build --verbose

       - name: Test mpi
         env:
-          DYLD_FALLBACK_LIBRARY_PATH: /usr/local/opt/llvm/lib
           OPENBLAS_NUM_THREADS: "1"
         run: |
           cd build
@@ -135,14 +128,14 @@ jobs:
         if: always()
         run: ccache -sv

-      - uses: actions/cache/save@v3
+      - uses: actions/cache/save@v4
         if: always()
         with:
           path: ${{ env.CCACHE_DIR }}
-          key: ccache-${{ matrix.os }}-${{ matrix.cc }}-${{ github.run_id }}
+          key: ccache-${{ matrix.os }}-${{ matrix.cxx }}-${{ github.run_id }}-${{ github.run_attempt }}

       - name: Deploy documentation
-        if: matrix.doc == 'ON' && github.ref == 'refs/heads/unstable'
+        if: matrix.os == 'macos-15' && matrix.cxx == 'clang++' && github.ref == 'refs/heads/unstable'
         uses: JamesIves/github-pages-deploy-action@v4
         with:
           folder: build/doc/html
diff --git a/.github/workflows/build_multi_node.yml b/.github/workflows/build_multi_node.yml
new file mode 100644
index 00000000..0ebf4c30
--- /dev/null
+++ b/.github/workflows/build_multi_node.yml
@@ -0,0 +1,88 @@
+name: build_multi_node
+
+on:
+  push:
+    branches: [ unstable, '[0-9]+.[0-9]+.x' ]
+  pull_request:
+    branches: [ unstable, '[0-9]+.[0-9]+.x' ]
+  workflow_call:
+  workflow_dispatch:
+
+env:
+  CMAKE_C_COMPILER_LAUNCHER: ccache
+  CMAKE_CXX_COMPILER_LAUNCHER: ccache
+  CCACHE_COMPILERCHECK: content
+  CCACHE_BASEDIR: ${{ github.workspace }}
+  CCACHE_DIR: ${{ github.workspace }}/.ccache
+  CCACHE_MAXSIZE: 500M
+  CCACHE_SLOPPINESS: pch_defines,time_macros,include_file_mtime,include_file_ctime
+  CCACHE_COMPRESS: "1"
+  CCACHE_COMPRESSLEVEL: "1"
+
+jobs:
+  build_multi_node:
+
+    strategy:
+      fail-fast: false
+
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/cache/restore@v4
+        with:
+          path: ${{ env.CCACHE_DIR }}
+          key: ccache-multi-node-${{ github.run_id }}-${{ github.run_attempt }}
+          restore-keys: |
+            ccache-multi-node-
+
+      - name: Install ubuntu dependencies
+        run: >
+          sudo apt-get update && sudo apt-get install ccache
+
+      - name: Build and start Docker Compose
+        run: |
+          docker compose build
+          docker compose up -d
+        working-directory: .github/workflows/docker
+
+      - name: Compile MPI 
inside the container + run: | + docker exec -t -u runner -w ${{ github.workspace }} docker-vm-1 /bin/bash -euxc ' + cmake -S . -B build -DCMAKE_INSTALL_PREFIX=$HOME/install -DBuild_Documentation=Off + cmake --build build/ -j2 + ' + + - name: Run tests inside the container + run: | + docker exec -t -u runner -w ${{ github.workspace }} docker-vm-1 /bin/bash -euxc ' + export CTEST_OUTPUT_ON_FAILURE=1 + cmake --build build/ --target test + ' + + - name: Run multi-node window tests + run: | + docker exec -t -u runner -w ${{ github.workspace }} docker-vm-1 /bin/bash -euxc ' + cat <> /etc/ssh/sshd_config.d/99-insecure.conf \ + && echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config.d/99-insecure.conf \ + && echo "PermitEmptyPasswords yes" >> /etc/ssh/sshd_config.d/99-insecure.conf \ + && echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config.d/99-insecure.conf \ + && echo "LogLevel ERROR" >> /etc/ssh/ssh_config.d/99-insecure.conf +ENV OMPI_MCA_btl_vader_single_copy_mechanism=none +ENV OMPI_MCA_osc=sm,pt2pt +ENV OMPI_MCA_rmaps_base_oversubscribe=yes +CMD ["/usr/sbin/sshd", "-D"] diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml new file mode 100644 index 00000000..39180ff7 --- /dev/null +++ b/.github/workflows/docker/docker-compose.yml @@ -0,0 +1,20 @@ +services: + vm: + build: . + deploy: + mode: replicated + replicas: 3 + environment: + CMAKE_C_COMPILER_LAUNCHER: ${CMAKE_C_COMPILER_LAUNCHER:-ccache} + CMAKE_CXX_COMPILER_LAUNCHER: ${CMAKE_CXX_COMPILER_LAUNCHER:-ccache} + CCACHE_COMPILERCHECK: ${CCACHE_COMPILERCHECK:-content} + CCACHE_BASEDIR: ${CCACHE_BASEDIR} + CCACHE_DIR: ${CCACHE_DIR:-${CCACHE_BASEDIR}/.ccache} + CCACHE_MAXSIZE: ${CCACHE_MAXSIZE:-500M} + CCACHE_SLOPPINESS: ${CCACHE_SLOPPINESS:-pch_defines,time_macros,include_file_mtime,include_file_ctime} + CCACHE_COMPRESS: ${CCACHE_COMPRESS:-1} + CCACHE_COMPRESSLEVEL: ${CCACHE_COMPRESSLEVEL:-1} + stdin_open: true + tty: true + volumes: + - ${CCACHE_BASEDIR}:${CCACHE_BASEDIR}:z diff --git a/.gitignore b/.gitignore index 3eb5d7e7..601343ae 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ compile_commands.json doc/_autosummary doc/cpp2rst_generated doc/html +.claude +build/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 1fe878ee..60c6cd3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,10 +15,7 @@ # Authors: Philipp Dumitrescu, Olivier Parcollet, Dylan Simon, Nils Wentzell cmake_minimum_required(VERSION 3.20 FATAL_ERROR) -cmake_policy(VERSION 3.20) -if(POLICY CMP0144) - cmake_policy(SET CMP0144 NEW) -endif() +cmake_policy(VERSION ${CMAKE_VERSION}) # ############ # Define Project diff --git a/Dockerfile.build b/Dockerfile.build deleted file mode 100644 index ca53cecf..00000000 --- a/Dockerfile.build +++ /dev/null @@ -1,21 +0,0 @@ -# See packaging for various base options -FROM flatironinstitute/triqs:base -ARG APPNAME=mpi - -RUN useradd -u 990 -m build - -ENV SRC=/src \ - BUILD=/home/build \ - INSTALL=/usr/local \ - PYTHONPATH=/usr/local/lib/python$PYTHON_VERSION/site-packages \ - CMAKE_PREFIX_PATH=/usr/lib/cmake/$APPNAME - -COPY --chown=build . $SRC/$APPNAME -WORKDIR $BUILD/$APPNAME -RUN chown build . 
-USER build
-ARG BUILD_ID
-ARG CMAKE_ARGS
-RUN cmake $SRC/$APPNAME -DCMAKE_INSTALL_PREFIX=$INSTALL -DCLANG_OPT="$CXXFLAGS" $CMAKE_ARGS && make -j4 || make -j1 VERBOSE=1
-USER root
-RUN make install
diff --git a/README.md b/README.md
index 5f4d1f3a..9a5d1783 100644
--- a/README.md
+++ b/README.md
@@ -7,3 +7,13 @@ mpi is a high-level C++ interface to the [Message Passing Interface](https://en.

 A reference documentation based on [Doxygen](https://www.doxygen.nl) is provided at [triqs.github.io/mpi](https://triqs.github.io/mpi).
 For usage examples we refer the users to our [tests](https://github.com/TRIQS/mpi/tree/unstable/test/c++).
+
+## Support
+
+[image: Flatiron Center for Computational Quantum Physics logo]
+
+TRIQS/mpi is supported by the Flatiron Institute, a division of the Simons Foundation.
+
diff --git a/c++/mpi/CMakeLists.txt b/c++/mpi/CMakeLists.txt
index 1ab36807..f6b11b27 100644
--- a/c++/mpi/CMakeLists.txt
+++ b/c++/mpi/CMakeLists.txt
@@ -26,6 +26,7 @@ target_link_libraries(${PROJECT_NAME}_c INTERFACE itertools::itertools_c)

 message(STATUS "-------- MPI detection -------------")
 set(MPI_CXX_SKIP_MPICXX TRUE CACHE BOOL "If true, the MPI-2 C++ bindings are disabled using definitions.")
+set(MPI_DETERMINE_LIBRARY_VERSION TRUE)
 find_package(MPI REQUIRED COMPONENTS CXX)

 # Create an interface target
@@ -51,11 +52,11 @@ if(NOT MPIEXEC_EXECUTABLE)
   set(MPIEXEC_EXECUTABLE ${MPIEXEC} CACHE FILENAME "MPI Executable")
 endif()

-# Compatibility to Open-MPI 3.0.0: check whether MPI executable has option --oversubscribe and add it
-execute_process(COMMAND ${MPIEXEC_EXECUTABLE} --oversubscribe ${MPIEXEC_NUMPROC_FLAG} 4 ${MPIEXEC_PREFLAGS} ls ${MPIEXEC_POSTFLAGS} RESULT_VARIABLE HAS_NO_OVERSUBSCRIBE OUTPUT_QUIET ERROR_QUIET)
-if(NOT HAS_NO_OVERSUBSCRIBE)
+# Open-MPI 3.0+ requires --oversubscribe flag
+if(MPI_CXX_LIBRARY_VERSION_STRING MATCHES "Open MPI v([0-9]+)" AND CMAKE_MATCH_1 GREATER_EQUAL 3)
+  message(STATUS "MPI Version: ${MPI_CXX_LIBRARY_VERSION_STRING}")
   list(APPEND MPIEXEC_PREFLAGS --oversubscribe)
-  set(MPIEXEC_PREFLAGS ${MPIEXEC_PREFLAGS} CACHE STRING "These flags will be directly before the executable that is being run by mpiexec." FORCE)
+  set(MPIEXEC_PREFLAGS ${MPIEXEC_PREFLAGS} CACHE STRING "Flags to pass to mpiexec directly before the executable to run." FORCE)
 endif()

 # ========= Static Analyzer Checks ==========
diff --git a/c++/mpi/array.hpp b/c++/mpi/array.hpp
new file mode 100644
index 00000000..cd4a7b5a
--- /dev/null
+++ b/c++/mpi/array.hpp
@@ -0,0 +1,106 @@
+// Copyright (c) 2019-2024 Simons Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Olivier Parcollet, Nils Wentzell
+
+/**
+ * @file
+ * @brief Provides an MPI broadcast and reduce for `std::array`.
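+ * @details The implementations simply forward to the generic range based functions in ranges.hpp
+ * (mpi::broadcast_range and mpi::reduce_range).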
+ */

+#pragma once
+
+#include "./communicator.hpp"
+#include "./ranges.hpp"
+#include "./utils.hpp"
+
+#include <mpi.h>
+
+#include <array>
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+
+namespace mpi {
+
+  /**
+   * @addtogroup coll_comm
+   * @{
+   */
+
+  /**
+   * @brief Implementation of an MPI broadcast for a `std::array`.
+   *
+   * @details It calls mpi::broadcast_range with the given array.
+   *
+   * @tparam T Value type of the array.
+   * @tparam N Size of the array.
+   * @param arr `std::array` to broadcast (into).
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   */
+  template <typename T, std::size_t N> void mpi_broadcast(std::array<T, N> &arr, communicator c = {}, int root = 0) { broadcast_range(arr, c, root); }
+
+  /**
+   * @brief Implementation of an MPI reduce for a `std::array`.
+   *
+   * @details It constructs the output array with its value type equal to the return type of `reduce(std::declval<T>())`
+   * and calls mpi::reduce_range with the input and constructed output array.
+   *
+   * Note that the output array will always have the same size as the input array, regardless of whether the rank
+   * receives the reduced data or not.
+   *
+   * @tparam T Value type of the array.
+   * @tparam N Size of the array.
+   * @param arr `std::array` to reduce.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the reduction.
+   * @param op `MPI_Op` used in the reduction.
+   * @return `std::array` containing the result of the reduction.
+   */
+  template <typename T, std::size_t N>
+  auto mpi_reduce(std::array<T, N> const &arr, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) {
+    using value_t = std::remove_cvref_t<decltype(reduce(std::declval<T>()))>;
+    std::array<value_t, N> res{};
+    reduce_range(arr, res, c, root, all, op);
+    return res;
+  }
+
+  /**
+   * @brief Implementation of an MPI reduce for a `std::array` that reduces directly into an existing output array.
+   *
+   * @details It calls mpi::reduce_range with the input and output array. The output array must be the same size as the
+   * input array on receiving ranks.
+   *
+   * @tparam T1 Value type of the array to be reduced.
+   * @tparam N1 Size of the array to be reduced.
+   * @tparam T2 Value type of the array to be reduced into.
+   * @tparam N2 Size of the array to be reduced into.
+   * @param arr_in `std::array` to reduce.
+   * @param arr_out `std::array` to reduce into.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the reduction.
+   * @param op `MPI_Op` used in the reduction.
+   */
+  template <typename T1, std::size_t N1, typename T2, std::size_t N2>
+  void mpi_reduce_into(std::array<T1, N1> const &arr_in, std::array<T2, N2> &arr_out, communicator c = {}, int root = 0, bool all = false,
+                       MPI_Op op = MPI_SUM) {
+    reduce_range(arr_in, arr_out, c, root, all, op);
+  }
+
+  /** @} */
+
+} // namespace mpi
diff --git a/c++/mpi/chunk.hpp b/c++/mpi/chunk.hpp
index 3423e72a..85466cd1 100644
--- a/c++/mpi/chunk.hpp
+++ b/c++/mpi/chunk.hpp
@@ -22,6 +22,7 @@
 #pragma once

 #include "./communicator.hpp"
+#include "./macros.hpp"

 #include <itertools/itertools.hpp>

@@ -32,16 +33,24 @@ namespace mpi {

   /**
    * @ingroup utilities
-   * @brief Get the length of the ith subrange after splitting the integer range `[0, end)` evenly across n subranges.
+   * @brief Get the length of the ith subrange after splitting the integer range `[0, end)` as evenly as
+   * possible across `nranges` subranges.
+   *
+   * @details The optional parameter `min_size` can be used to first divide the range into equal parts of size
+   * `min_size` before distributing them as evenly as possible across the number of specified subranges.
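+   *
+   * For example, `chunk_length(10, 3, i)` yields the lengths 4, 3 and 3 for `i = 0, 1, 2`, while
+   * `chunk_length(10, 3, i, 2)` first forms 5 blocks of size 2 and yields the lengths 4, 4 and 2.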
+   *
+   * It is expected that `min_size > 0` and that `min_size` is a divisor of `end`.
    *
    * @param end End of the integer range `[0, end)`.
-   * @param n Number of subranges.
+   * @param nranges Number of subranges.
    * @param i Index of the subrange of interest.
+   * @param min_size Minimum size of the subranges.
    * @return Length of the ith subrange.
    */
-  [[nodiscard]] inline long chunk_length(long end, int n, int i) {
-    auto [node_begin, node_end] = itertools::chunk_range(0, end, n, i);
-    return node_end - node_begin;
+  [[nodiscard]] inline long chunk_length(long end, int nranges, int i, long min_size = 1) {
+    EXPECTS_WITH_MESSAGE(min_size > 0 && end % min_size == 0, "Error in mpi::chunk_length: min_size must be a divisor of end");
+    auto [node_begin, node_end] = itertools::chunk_range(0, end / min_size, nranges, i);
+    return (node_end - node_begin) * min_size;
   }

   /**
diff --git a/c++/mpi/communicator.hpp b/c++/mpi/communicator.hpp
index 23b531c3..6ce1a723 100644
--- a/c++/mpi/communicator.hpp
+++ b/c++/mpi/communicator.hpp
@@ -22,6 +22,7 @@
 #pragma once

 #include "./environment.hpp"
+#include "./utils.hpp"

 #include <mpi.h>

@@ -30,17 +31,21 @@ namespace mpi {

+  // Forward declaration.
+  class shared_communicator;
+
   /**
    * @ingroup mpi_essentials
    * @brief C++ wrapper around `MPI_Comm` providing various convenience functions.
    *
-   * @details It stores an `MPI_Comm` object as its only member which by default is set to `MPI_COMM_WORLD`.
-   * Note that copying the communicator simply copies the `MPI_Comm` object, without calling `MPI_Comm_dup`.
+   * @details It stores an `MPI_Comm` object as its only member which by default is set to `MPI_COMM_WORLD`. The
+   * underlying `MPI_Comm` object is not freed when a communicator goes out of scope. It is the user's responsibility
+   * to do so, if needed. Note that copying the communicator simply copies the `MPI_Comm` object, without
+   * calling `MPI_Comm_dup`.
+   *
+   * All functions that make direct calls to the MPI C library throw an exception in case the call fails.
    */
   class communicator {
-    // Wrapped `MPI_Comm` object.
-    MPI_Comm _com = MPI_COMM_WORLD;
-
     public:
     /// Construct a communicator with `MPI_COMM_WORLD`.
     communicator() = default;
@@ -48,23 +53,24 @@ namespace mpi {
     /**
      * @brief Construct a communicator with a given `MPI_Comm` object.
      * @details The `MPI_Comm` object is copied without calling `MPI_Comm_dup`.
+     * @param c `MPI_Comm` object to wrap.
      */
-    communicator(MPI_Comm c) : _com(c) {}
+    communicator(MPI_Comm c) : comm_(c) {}

     /// Get the wrapped `MPI_Comm` object.
-    [[nodiscard]] MPI_Comm get() const noexcept { return _com; }
+    [[nodiscard]] MPI_Comm get() const noexcept { return comm_; }
+
+    /// Check if the contained `MPI_Comm` is `MPI_COMM_NULL`.
+    [[nodiscard]] bool is_null() const noexcept { return comm_ == MPI_COMM_NULL; }

     /**
      * @brief Get the rank of the calling process in the communicator.
      * @return The result of `MPI_Comm_rank` if mpi::has_env is true, otherwise 0.
      */
     [[nodiscard]] int rank() const {
-      if (has_env) {
-        int num = 0;
-        MPI_Comm_rank(_com, &num);
-        return num;
-      } else
-        return 0;
+      int r = 0;
+      if (has_env) check_mpi_call(MPI_Comm_rank(comm_, &r), "MPI_Comm_rank");
+      return r;
     }

     /**
      * @brief Get the size of the communicator.
      * @return The result of `MPI_Comm_size` if mpi::has_env is true, otherwise 1.
      */
     [[nodiscard]] int size() const {
-      if (has_env) {
-        int num = 0;
-        MPI_Comm_size(_com, &num);
-        return num;
-      } else
-        return 1;
+      int s = 1;
+      if (has_env) check_mpi_call(MPI_Comm_size(comm_, &s), "MPI_Comm_size");
+      return s;
     }

     /**
      * @brief Split the communicator into disjoint subgroups.
      *
-     * @details Calls `MPI_Comm_split` with the given color and key arguments. See the MPI documentation for more details,
-     * e.g. the open-mpi docs.
+     * @details Calls `MPI_Comm_split` with the given color and key arguments. See the MPI documentation for more
+     * details, e.g. the open-mpi docs.
      *
+     * @warning This allocates a new communicator object. Make sure to call free() on the returned communicator when
+     * it is no longer needed.
+     *
+     * @param color Determines which processes are put into the same group.
+     * @param key Determines the rank of the process in the new communicator.
      * @return If mpi::has_env is true, return the split `MPI_Comm` object wrapped in a new mpi::communicator, otherwise
      * return a default constructed mpi::communicator.
      */
     [[nodiscard]] communicator split(int color, int key = 0) const {
-      if (has_env) {
-        communicator c;
-        MPI_Comm_split(_com, color, key, &c._com);
-        return c;
-      } else
-        return {};
+      communicator c{};
+      if (has_env) check_mpi_call(MPI_Comm_split(comm_, color, key, &c.comm_), "MPI_Comm_split");
+      return c;
     }

     /**
-     * @brief If mpi::has_env is true, `MPI_Abort` is called with the given error code, otherwise std::abort is called.
+     * @brief Partition the communicator into subcommunicators according to their type.
+     *
+     * @details In the MPI 3.0 standard the only supported split type is `MPI_COMM_TYPE_SHARED`. OpenMPI (and possibly
+     * other implementations) provides additional split types; these are, however, not portable.
+     *
+     * @warning This allocates a new communicator object. Make sure to call free() on the returned communicator when
+     * it is no longer needed.
+     *
+     * @param split_type Type of processes to be grouped together.
+     * @param key Determines the rank of the process in the new communicator.
+     * @return If mpi::has_env is true, return the split `MPI_Comm` object wrapped in a new mpi::communicator, otherwise
+     * return a default constructed mpi::communicator.
+     */
+    [[nodiscard]] shared_communicator split_shared(int split_type = MPI_COMM_TYPE_SHARED, int key = 0) const;
+
+    /**
+     * @brief Duplicate the communicator.
+     *
+     * @details Calls `MPI_Comm_dup` to duplicate the communicator. See the MPI documentation for more details, e.g.
+     * the open-mpi docs.
+     *
+     * @warning This allocates a new communicator object. Make sure to call free() on the returned communicator when
+     * it is no longer needed.
+     *
+     * @return If mpi::has_env is true, return the duplicated `MPI_Comm` object wrapped in a new mpi::communicator,
+     * otherwise return a default constructed mpi::communicator.
+     */
+    [[nodiscard]] communicator duplicate() const {
+      communicator c{};
+      if (has_env) check_mpi_call(MPI_Comm_dup(comm_, &c.comm_), "MPI_Comm_dup");
+      return c;
+    }
+
+    /**
+     * @brief Free the communicator.
+     *
+     * @details Calls `MPI_Comm_free` to mark the communicator for deallocation. See the MPI documentation for more
+     * details, e.g. the open-mpi docs.
+     *
+     * Does nothing if mpi::has_env is false.
+     */
+    void free() {
+      if (has_env && !is_null()) check_mpi_call(MPI_Comm_free(&comm_), "MPI_Comm_free");
+    }
+
+    /**
+     * @brief If mpi::has_env is true, `MPI_Abort` is called with the given error code, otherwise `std::abort` is called.
      * @param error_code The error code to pass to `MPI_Abort`.
      */
-    void abort(int error_code) {
-      if (has_env)
-        MPI_Abort(_com, error_code);
-      else
+    void abort(int error_code) const {
+      if (has_env) {
+        check_mpi_call(MPI_Abort(comm_, error_code), "MPI_Abort");
+      } else {
         std::abort();
+      }
     }

 #ifdef BOOST_MPI_HPP
     // Conversion to and from boost communicator, Keep for backward compatibility
-    inline operator boost::mpi::communicator() const { return boost::mpi::communicator(_com, boost::mpi::comm_duplicate); }
-    inline communicator(boost::mpi::communicator c) : _com(c) {}
+    inline operator boost::mpi::communicator() const { return boost::mpi::communicator(comm_, boost::mpi::comm_duplicate); }
+    inline communicator(boost::mpi::communicator c) : comm_(c) {}
 #endif // BOOST_MPI_HPP

     /**
      * @brief Barrier synchronization.
      *
-     * @details Does nothing if mpi::has_env is false. Otherwise, it either uses a blocking `MPI_Barrier`
-     * (if the given argument is 0) or a non-blocking `MPI_Ibarrier` call. The given parameter determines
-     * in milliseconds how often each process calls `MPI_Test` to check if all processes have reached the barrier.
+     * @details Does nothing if mpi::has_env is false. Otherwise, it either uses a blocking `MPI_Barrier` (if the given
+     * argument is 0) or a non-blocking `MPI_Ibarrier` call. The given parameter determines in milliseconds how often
+     * each process calls `MPI_Test` to check if all processes have reached the barrier.
+     *
      * This can considerably reduce the CPU load:
-     * - 1 msec ~ 1% cpu load
-     * - 10 msec ~ 0.5% cpu load
-     * - 100 msec ~ 0.01% cpu load
+     * - 1 msec ~ 1% cpu load
+     * - 10 msec ~ 0.5% cpu load
+     * - 100 msec ~ 0.01% cpu load
      *
      * For a very unbalanced load that takes a long time to finish, 1000 msec is a good choice.
      *
-     * @param poll_msec The polling interval in milliseconds. If set to 0, a simple `MPI_Barrier` call is used.
+     * @param poll_msec Polling interval in milliseconds. If set to 0, a simple `MPI_Barrier` call is used.
      */
-    void barrier(long poll_msec = 1) {
+    void barrier(long poll_msec = 1) const {
       if (has_env) {
         if (poll_msec == 0) {
-          MPI_Barrier(_com);
+          check_mpi_call(MPI_Barrier(comm_), "MPI_Barrier");
         } else {
           MPI_Request req{};
           int flag = 0;
-          MPI_Ibarrier(_com, &req);
+          check_mpi_call(MPI_Ibarrier(comm_, &req), "MPI_Ibarrier");
           while (!flag) {
-            MPI_Test(&req, &flag, MPI_STATUS_IGNORE);
+            check_mpi_call(MPI_Test(&req, &flag, MPI_STATUS_IGNORE), "MPI_Test");
             usleep(poll_msec * 1000);
           }
         }
       }
     }
+
+    private:
+    MPI_Comm comm_ = MPI_COMM_WORLD;
   };

+  /**
+   * @ingroup mpi_osc_shm
+   * @brief C++ wrapper around `MPI_Comm` that is the result of a mpi::communicator::split_shared operation.
+   *
+   * @details The plain MPI C API does not distinguish whether an `MPI_Comm` is local to a shared memory island.
+   * We therefore introduce an extra type whose only purpose is to make that distinction at the type level, to
+   * prevent misuse of the shared memory APIs.
+   */
+  class shared_communicator : public communicator {
+    public:
+    // Make the constructors of mpi::communicator accessible.
+    using communicator::communicator;
+
+    /// Construct a shared communicator with `MPI_COMM_NULL`.
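+    /// (This overrides the inherited default constructor, which would wrap `MPI_COMM_WORLD`.)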
+    shared_communicator() : communicator(MPI_COMM_NULL) {}
+  };
+
+  [[nodiscard]] inline shared_communicator communicator::split_shared(int split_type, int key) const {
+    shared_communicator c{};
+    if (has_env) check_mpi_call(MPI_Comm_split_type(comm_, split_type, key, MPI_INFO_NULL, &c.comm_), "MPI_Comm_split_type");
+    return c;
+  }
+
 } // namespace mpi
diff --git a/c++/mpi/datatypes.hpp b/c++/mpi/datatypes.hpp
index ca273e0e..49a6b964 100644
--- a/c++/mpi/datatypes.hpp
+++ b/c++/mpi/datatypes.hpp
@@ -21,6 +21,8 @@
 #pragma once

+#include "./utils.hpp"
+
 #include <mpi.h>

 #include <array>
@@ -30,6 +32,7 @@
 #include <tuple>
 #include <type_traits>
 #include <utility>
+#include <vector>

 namespace mpi {

@@ -74,6 +77,14 @@ namespace mpi {
   D(unsigned long long, MPI_UNSIGNED_LONG_LONG);
 #undef D

+  /**
+   * @brief Specialization of mpi::mpi_type for enum types.
+   * @tparam E C++ enum type.
+   */
+  template <typename E>
+    requires(std::is_enum_v<E>)
+  struct mpi_type<E> : mpi_type<std::underlying_type_t<E>> {};
+
   /**
    * @brief Specialization of mpi::mpi_type for `const` types.
    * @tparam T C++ type.
    */
   template <typename T> struct mpi_type<const T> : mpi_type<T> {};

   /**
-   * @brief Type trait to check if a type T has a corresponding MPI datatype, i.e. if mpi::mpi_type has been specialized.
-   * @tparam T Type to be checked.
+   * @brief Type trait to check if a type `T` has a corresponding MPI datatype, i.e. if mpi::mpi_type has been
+   * specialized.
+   * @tparam T Type to be checked.
    */
   template <typename T, typename = void> constexpr bool has_mpi_type = false;

   /**
    * @brief Specialization of mpi::has_mpi_type for types for which mpi::mpi_type has been specialized.
    */
   template <typename T> constexpr bool has_mpi_type<T, std::void_t<decltype(mpi_type<T>::get())>> = true;

+  namespace detail {
+
+    // Helper struct to check if member types are mpi-serializable, i.e. have an associated mpi_type
+    struct serialize_checker {
+      template <typename T>
+      void operator&(T &)
+        requires(has_mpi_type<T>)
+      {}
+    };
+
+  } // namespace detail
+
+  /**
+   * @brief A concept that checks if objects of a type can be serialized and deserialized.
+   * @tparam T Type to check.
+   */
+  template <typename T>
+  concept Serializable = requires(const T ac, T a, detail::serialize_checker ar) {
+    { ac.serialize(ar) } -> std::same_as<void>;
+    { a.deserialize(ar) } -> std::same_as<void>;
+  };
+
   /**
    * @brief Create a new `MPI_Datatype` from a tuple.
    *
    * @details The tuple element types must have corresponding MPI datatypes, i.e. they must have mpi::mpi_type
    * specializations. It uses `MPI_Type_create_struct` to create a new datatype consisting of the tuple element types.
    *
+   * It throws an exception in case a call to the MPI C library fails.
+   *
    * @tparam Ts Tuple element types.
    * @param tup Tuple object.
    * @return `MPI_Datatype` consisting of the types of the tuple elements.
@@ -113,17 +149,17 @@ namespace mpi {
     // displacements of the blocks in bytes w.r.t. the memory address of the first block
     std::array<MPI_Aint, N> disp;
     // initialize displacement array from the tuple element addresses
-    []<std::size_t... Is>(std::index_sequence<Is...>, auto &tup, MPI_Aint *disp) {
-      ((disp[Is] = (char *)&std::get<Is>(tup) - (char *)&std::get<0>(tup)), ...);
+    []<std::size_t... Is>(std::index_sequence<Is...>, auto &t, MPI_Aint *d) {
+      ((d[Is] = (char *)&std::get<Is>(t) - (char *)&std::get<0>(t)), ...);
       // account for non-trivial memory layouts of the tuple elements
-      auto min_el = *std::min_element(disp, disp + sizeof...(Ts));
-      ((disp[Is] -= min_el), ...);
+      auto min_el = *std::min_element(d, d + sizeof...(Ts));
+      ((d[Is] -= min_el), ...);
     }(std::index_sequence_for<Ts...>{}, tup, disp.data());

     // create and return MPI datatype
     MPI_Datatype cty{};
-    MPI_Type_create_struct(N, blocklen.data(), disp.data(), types.data(), &cty);
-    MPI_Type_commit(&cty);
+    check_mpi_call(MPI_Type_create_struct(N, blocklen.data(), disp.data(), types.data(), &cty), "MPI_Type_create_struct");
+    check_mpi_call(MPI_Type_commit(&cty), "MPI_Type_commit");
     return cty;
   }

@@ -131,8 +167,11 @@ namespace mpi {
   /**
    * @brief Specialization of mpi::mpi_type for std::tuple.
    * @tparam Ts Tuple element types.
    */
-  template <typename... Ts> struct mpi_type<std::tuple<Ts...>> {
-    [[nodiscard]] static MPI_Datatype get() noexcept { return get_mpi_type(std::tuple<Ts...>{}); }
+  template <typename... Ts> struct mpi_type<std::tuple<Ts...>> {
+    [[nodiscard]] static MPI_Datatype get() noexcept {
+      static MPI_Datatype type = get_mpi_type(std::tuple<Ts...>{});
+      return type;
+    }
   };

   /**
@@ -152,15 +191,81 @@ namespace mpi {
    * auto tie_data(foo f) {
    *   return std::tie(f.x, f.y);
    * }
+   * @endcode
    *
-   * // provide a specialization of mpi_type
-   * template <> struct mpi::mpi_type<foo> : mpi::mpi_type_from_tie<foo> {};
+   * @tparam U Type to be converted to an `MPI_Datatype`.
+   */
+  template <typename U>
+    requires(not Serializable<U>) and requires(U u) { tie_data(u); }
+  struct mpi_type<U> {
+    [[nodiscard]] static MPI_Datatype get() noexcept {
+      static MPI_Datatype type = get_mpi_type(tie_data(U{}));
+      return type;
+    }
+  };
+
+  namespace detail {
+
+    // Archive helper class to obtain MPI custom type info using references to class members.
+    struct mpi_archive {
+      std::vector<int> block_lengths{};
+      std::vector<MPI_Aint> displacements{};
+      std::vector<MPI_Datatype> types{};
+      MPI_Aint base_address{};
+
+      // Constructor sets the base address of the object.
+      explicit mpi_archive(const void *base) { MPI_Get_address(base, &base_address); }
+
+      // Overloaded operator& to process members to set the block lengths, displacements and MPI types.
+      template <typename T>
+        requires(has_mpi_type<T>)
+      mpi_archive &operator&(const T &member) {
+        types.push_back(mpi_type<T>::get());
+        MPI_Aint address{};
+        MPI_Get_address(&member, &address);
+        displacements.push_back(MPI_Aint_diff(address, base_address));
+        block_lengths.push_back(1);
+        return *this;
+      }
+    };
+
+  } // namespace detail
+
+  /**
+   * @brief Create an `MPI_Datatype` from a serializable type.
+   *
+   * @details It is assumed that the type has a member function `serialize`
+   * which feeds all its class members into an archive using the `operator&`.
+   *
+   * @code{.cpp}
+   * // type to use for MPI communication
+   * struct foo {
+   *   double x;
+   *   int y;
+   *   void serialize(auto& ar) const { ar & x & y; }
+   * };
+   * @endcode
+   *
+   * @tparam T Type to be converted to an `MPI_Datatype`.
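+   *
+   * A brief usage sketch with the `foo` struct from above (illustrative only):
+   *
+   * @code{.cpp}
+   * foo f{1.0, 2};
+   * MPI_Datatype dt = mpi::get_mpi_type(f); // the returned datatype is already committed
+   * MPI_Bcast(&f, 1, dt, 0, MPI_COMM_WORLD);
+   * @endcode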
   */
-  template <typename T> struct mpi_type_from_tie {
-    [[nodiscard]] static MPI_Datatype get() noexcept { return get_mpi_type(tie_data(T{})); }
+  template <Serializable T> [[nodiscard]] MPI_Datatype get_mpi_type(const T &obj) {
+    detail::mpi_archive ar(&obj);
+    obj.serialize(ar);
+    MPI_Datatype mpi_type{};
+    MPI_Type_create_struct(static_cast<int>(ar.block_lengths.size()), ar.block_lengths.data(), ar.displacements.data(), ar.types.data(), &mpi_type);
+    MPI_Type_commit(&mpi_type);
+    return mpi_type;
+  }
+
+  /**
+   * @brief Specialization of mpi::mpi_type for serializable types.
+   * @tparam S Serializable type.
+   */
+  template <Serializable S> struct mpi_type<S> {
+    [[nodiscard]] static MPI_Datatype get() noexcept {
+      static MPI_Datatype type = get_mpi_type(S{});
+      return type;
+    }
   };

   /** @} */
diff --git a/c++/mpi/environment.hpp b/c++/mpi/environment.hpp
index 148ae5d5..a8aa42dd 100644
--- a/c++/mpi/environment.hpp
+++ b/c++/mpi/environment.hpp
@@ -21,6 +21,8 @@
 #pragma once

+#include "./utils.hpp"
+
 #include <mpi.h>

 #include <cstdlib>

@@ -34,17 +36,18 @@ namespace mpi {

   /**
    * @brief Check if MPI has been initialized.
+   * @details It throws an exception in case a call to the MPI C library fails.
    * @return True if `MPI_Init` has been called, false otherwise.
    */
   [[nodiscard]] inline bool is_initialized() noexcept {
     int flag = 0;
-    MPI_Initialized(&flag);
+    check_mpi_call(MPI_Initialized(&flag), "MPI_Initialized");
     return flag;
   }

   /**
-   * @brief Boolean variable that is true, if one of the environment variables `OMPI_COMM_WORLD_RANK`,
-   * `PMI_RANK`, `CRAY_MPICH_VERSION` or `FORCE_MPI_INIT` is set, false otherwise.
+   * @brief Boolean variable that is true if one of the environment variables `OMPI_COMM_WORLD_RANK`, `PMI_RANK`,
+   * `CRAY_MPICH_VERSION` or `FORCE_MPI_INIT` is set, and false otherwise.
    *
    * @details The environment variables are set, when a program is executed with `mpirun` or `mpiexec`.
    */
@@ -59,31 +62,33 @@ namespace mpi {

   /**
    * @brief RAII class to initialize and finalize MPI.
    *
-   * @details Calls `MPI_Init` upon construction and `MPI_Finalize` upon destruction i.e. when the environment object goes out of scope.
-   * If mpi::has_env is false, this struct does nothing.
+   * @details Calls `MPI_Init` upon construction and `MPI_Finalize` upon destruction, i.e. when the environment object
+   * goes out of scope. If mpi::has_env is false, this struct does nothing.
+   *
+   * All functions that make direct calls to the MPI C library throw an exception in case the call fails.
    */
   struct environment {
     /**
      * @brief Construct a new mpi environment object by calling `MPI_Init`.
      *
-     * @details Checks first if the program is run with an MPI runtime environment and if it has not been
-     * initialized before to avoid errors.
+     * @details Checks first if the program is run with an MPI runtime environment and if it has not been initialized
+     * before to avoid errors.
      *
      * @param argc Number of command line arguments.
      * @param argv Command line arguments.
      */
     environment(int argc, char *argv[]) { // NOLINT (C-style array is wanted here)
-      if (has_env && !is_initialized()) MPI_Init(&argc, &argv);
+      if (has_env && !is_initialized()) check_mpi_call(MPI_Init(&argc, &argv), "MPI_Init");
     }

     /**
      * @brief Destroy the mpi environment object by calling `MPI_Finalize`.
      *
-     * @details Checks first if the program is run with an MPI runtime environment. Called automatically when the environment
-     * object goes out of scope.
+     * @details Checks first if the program is run with an MPI runtime environment. Called automatically when the
+     * environment object goes out of scope.
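+     *
+     * A typical usage sketch (illustrative only):
+     *
+     * @code{.cpp}
+     * int main(int argc, char *argv[]) {
+     *   mpi::environment env(argc, argv);
+     *   // ... program code ...
+     * } // MPI_Finalize is called here, when env goes out of scope
+     * @endcode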
      */
     ~environment() {
-      if (has_env) MPI_Finalize();
+      if (has_env) check_mpi_call(MPI_Finalize(), "MPI_Finalize");
     }
   };
diff --git a/c++/mpi/generic_communication.hpp b/c++/mpi/generic_communication.hpp
index c81e55d0..e0d613c7 100644
--- a/c++/mpi/generic_communication.hpp
+++ b/c++/mpi/generic_communication.hpp
@@ -16,72 +16,67 @@

 /**
  * @file
- * @brief Provides generic implementations for a subset of collective MPI communications (broadcast, reduce, gather, scatter).
- * @details The generic functions (mpi::broadcast, mpi::reduce, mpi::scatter, ...) call their more specialized counterparts
- * (e.g. mpi::mpi_broadcast, mpi::mpi_reduce, mpi::mpi_scatter, ...). They depend on ADL.
+ * @brief Provides generic implementations for a subset of collective MPI communications (broadcast, reduce, gather,
+ * scatter).
+ * @details The generic functions (mpi::broadcast, mpi::reduce, mpi::scatter, ...) call their more specialized
+ * counterparts (e.g. mpi::mpi_broadcast, mpi::mpi_reduce, mpi::mpi_scatter, ...). They depend on ADL.
 */

 #pragma once

+#include "./communicator.hpp"
 #include "./datatypes.hpp"
-#include "./lazy.hpp"
+#include "./macros.hpp"
+#include "./utils.hpp"

 #include <mpi.h>
+
+#include <algorithm>
+#include <concepts>
+#include <ranges>
 #include <type_traits>
-#include <utility>
 #include <vector>

 namespace mpi {

+  /**
+   * @ingroup utilities
+   * @brief A concept that checks if a range type is contiguous and sized and has an MPI compatible value type.
+   * @tparam R Range type.
+   */
+  template <typename R>
+  concept MPICompatibleRange = std::ranges::contiguous_range<R> && std::ranges::sized_range<R> && has_mpi_type<std::ranges::range_value_t<R>>;
+
   /**
    * @addtogroup coll_comm
    * @{
    */

-  namespace detail {
-
-    // Type trait to check if a type is a std::vector.
-    template <typename T> inline constexpr bool is_std_vector = false;
-
-    // Spezialization of is_std_vector for std::vector.
-    template <typename T> inline constexpr bool is_std_vector<std::vector<T>> = true;
-
-    // Convert an object of type V to an object of type T.
-    template <typename T, typename V> T convert(V v) {
-      if constexpr (is_std_vector<T>) {
-        T res;
-        res.reserve(v.size());
-        for (auto &x : v) res.emplace_back(convert<typename T::value_type>(std::move(x)));
-        return res;
-      } else
-        return T{std::move(v)};
-    }
-
-  } // namespace detail
-
   /**
    * @brief Generic MPI broadcast.
    *
-   * @details If mpi::has_env is true, this function calls the specialized `mpi_broadcast` function for the given object,
-   * otherwise it does nothing.
+   * @details It calls the specialized `mpi_broadcast` function.
+   *
+   * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the
+   * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library.
    *
    * @tparam T Type to be broadcasted.
-   * @param x Object to be broadcasted.
+   * @param x Object to be broadcasted (into).
    * @param c mpi::communicator.
    * @param root Rank of the root process.
    */
-  template <typename T> [[gnu::always_inline]] void broadcast(T &x, communicator c = {}, int root = 0) {
-    static_assert(not std::is_const_v<T>, "mpi::broadcast cannot be called on const objects");
-    if (has_env) mpi_broadcast(x, c, root);
+  template <typename T> [[gnu::always_inline]] void broadcast(T &&x, communicator c = {}, int root = 0) { // NOLINT (forwarding is not needed)
+    mpi_broadcast(x, c, root);
   }

   /**
    * @brief Generic MPI reduce.
    *
-   * @details If mpi::has_env is true or if the return type of the specialized `mpi_reduce` is lazy, this function calls
-   * the specialized `mpi_reduce` function for the given object. Otherwise, it simply converts the input object to the
-   * output type `mpi_reduce` would return.
+   * @details If there is a specialized `mpi_reduce` for the given type, we call it. Otherwise, we call mpi::reduce_into
+   * with the given input object and a default constructed output object of type `T`.
+   *
+   * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the
+   * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library.
    *
    * @tparam T Type to be reduced.
    * @param x Object to be reduced.
@@ -89,136 +84,253 @@ namespace mpi {
    * @param root Rank of the root process.
    * @param all Should all processes receive the result of the reduction.
    * @param op `MPI_Op` used in the reduction.
-   * @return The result of the specialized `mpi_reduce` call.
+   * @return Result of the specialized `mpi_reduce` call.
    */
   template <typename T>
-  [[gnu::always_inline]] inline decltype(auto) reduce(T &&x, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) {
-    // return type of mpi_reduce
-    using r_t = decltype(mpi_reduce(std::forward<T>(x), c, root, all, op));
-    if constexpr (is_mpi_lazy<r_t>) {
-      return mpi_reduce(std::forward<T>(x), c, root, all, op);
+  [[gnu::always_inline]] decltype(auto) reduce(T &&x, communicator c = {}, int root = 0, bool all = false, // NOLINT (forwarding is not needed)
+                                               MPI_Op op = MPI_SUM) {
+    if constexpr (requires { mpi_reduce(x, c, root, all, op); }) {
+      return mpi_reduce(x, c, root, all, op);
     } else {
-      if (has_env)
-        return mpi_reduce(std::forward<T>(x), c, root, all, op);
-      else
-        return detail::convert<r_t>(std::forward<T>(x));
+      std::remove_cvref_t<T> res;
+      reduce_into(x, res, c, root, all, op);
+      return res;
     }
   }

   /**
-   * @brief Generic in-place MPI reduce.
+   * @brief Generic in place MPI reduce.
+   *
+   * @details We call mpi::reduce_into with the given object as the input and output argument.
    *
-   * @details If mpi::has_env is true, this functions calls the specialized `mpi_reduce_in_place` function for the given object.
-   * Otherwise, it does nothing.
+   * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the
+   * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library.
    *
    * @tparam T Type to be reduced.
-   * @param x Object to be reduced.
+   * @param x Object to be reduced (into).
    * @param c mpi::communicator.
    * @param root Rank of the root process.
    * @param all Should all processes receive the result of the reduction.
    * @param op `MPI_Op` used in the reduction.
    */
   template <typename T>
-  [[gnu::always_inline]] inline void reduce_in_place(T &x, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) {
-    static_assert(not std::is_const_v<T>, "In-place mpi functions cannot be called on const objects");
-    if (has_env) mpi_reduce_in_place(x, c, root, all, op);
+  [[gnu::always_inline]] void reduce_in_place(T &&x, communicator c = {}, int root = 0, bool all = false, // NOLINT (forwarding is not needed)
+                                              MPI_Op op = MPI_SUM) {
+    mpi_reduce_into(x, x, c, root, all, op);
+  }
+
+  /**
+   * @brief Generic MPI reduce that reduces directly into an existing output object.
+   *
+   * @details It calls the specialized `mpi_reduce_into` function.
+   *
+   * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the
+   * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library.
+   *
+   * @tparam T1 Type to be reduced.
+   * @tparam T2 Type to be reduced into.
+   * @param x_in Object to be reduced.
+   * @param x_out Object to be reduced into.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the reduction.
+   * @param op `MPI_Op` used in the reduction.
+   */
+  template <typename T1, typename T2>
+  [[gnu::always_inline]] void reduce_into(T1 &&x_in, T2 &&x_out, communicator c = {}, int root = 0, // NOLINT (forwarding is not needed)
+                                          bool all = false, MPI_Op op = MPI_SUM) {
+    mpi_reduce_into(x_in, x_out, c, root, all, op);
+  }

   /**
    * @brief Generic MPI scatter.
    *
-   * @details If mpi::has_env is true or if the return type of the specialized `mpi_scatter` is lazy, this function
-   * calls the specialized `mpi_scatter` function for the given object. Otherwise, it simply converts the input
-   * object to the output type `mpi_scatter` would return.
+   * @details If there is a specialized `mpi_scatter` for the given type, we call it. Otherwise, we call
+   * mpi::scatter_into with the given input object and a default constructed output object of type `T`.
+   *
+   * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the
+   * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library.
    *
    * @tparam T Type to be scattered.
    * @param x Object to be scattered.
    * @param c mpi::communicator.
    * @param root Rank of the root process.
-   * @return The result of the specialized `mpi_scatter` call.
+   * @return Result of the specialized `mpi_scatter` call.
    */
-  template <typename T> [[gnu::always_inline]] inline decltype(auto) scatter(T &&x, mpi::communicator c = {}, int root = 0) {
-    // return type of mpi_scatter
-    using r_t = decltype(mpi_scatter(std::forward<T>(x), c, root));
-    if constexpr (is_mpi_lazy<r_t>) {
-      return mpi_scatter(std::forward<T>(x), c, root);
+  template <typename T>
+  [[gnu::always_inline]] decltype(auto) scatter(T &&x, mpi::communicator c = {}, int root = 0) { // NOLINT (forwarding is not needed)
+    if constexpr (requires { mpi_scatter(x, c, root); }) {
+      return mpi_scatter(x, c, root);
    } else {
-      if (has_env)
-        return mpi_scatter(std::forward<T>(x), c, root);
-      else
-        return detail::convert<r_t>(std::forward<T>(x));
+      std::remove_cvref_t<T> res;
+      scatter_into(x, res, c, root);
+      return res;
    }
   }

+  /**
+   * @brief Generic MPI scatter that scatters directly into an existing output object.
+   *
+   * @details It calls the specialized `mpi_scatter_into` function.
+   *
+   * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the
+   * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library.
+   *
+   * @tparam T1 Type to be scattered.
+   * @tparam T2 Type to be scattered into.
+   * @param x_in Object to be scattered.
+   * @param x_out Object to be scattered into.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   */
+  template <typename T1, typename T2>
+  [[gnu::always_inline]] void scatter_into(T1 &&x_in, T2 &&x_out, communicator c = {}, int root = 0) { // NOLINT (forwarding is not needed)
+    mpi_scatter_into(x_in, x_out, c, root);
+  }

   /**
    * @brief Generic MPI gather.
    *
-   * @details If mpi::has_env is true or if the return type of the specialized `mpi_gather` is lazy, this function
-   * calls the specialized `mpi_gather` function for the given object. Otherwise, it simply converts the input
-   * object to the output type `mpi_gather` would return.
+   * @details If there is a specialized `mpi_gather` for the given type, we call it. Otherwise, we call
+   * mpi::gather_into with the given input object and a default constructed output object of type `T`.
+   *
+   * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the
+   * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library.
    *
    * @tparam T Type to be gathered.
    * @param x Object to be gathered.
    * @param c mpi::communicator.
    * @param root Rank of the root process.
    * @param all Should all processes receive the result of the gather.
-   * @return The result of the specialized `mpi_gather` call.
+   * @return Result of the specialized `mpi_gather` call.
    */
-  template <typename T> [[gnu::always_inline]] inline decltype(auto) gather(T &&x, mpi::communicator c = {}, int root = 0, bool all = false) {
-    // return type of mpi_gather
-    using r_t = decltype(mpi_gather(std::forward<T>(x), c, root, all));
-    if constexpr (is_mpi_lazy<r_t>) {
-      return mpi_gather(std::forward<T>(x), c, root, all);
+  template <typename T>
+  [[gnu::always_inline]] decltype(auto) gather(T &&x, communicator c = {}, int root = 0, bool all = false) { // NOLINT (forwarding is not needed)
+    if constexpr (requires { mpi_gather(x, c, root, all); }) {
+      return mpi_gather(x, c, root, all);
    } else {
-      if (has_env)
-        return mpi_gather(std::forward<T>(x), c, root, all);
-      else
-        return detail::convert<r_t>(std::forward<T>(x));
+      std::remove_cvref_t<T> res;
+      gather_into(x, res, c, root, all);
+      return res;
    }
   }

+  /**
+   * @brief Generic MPI gather that gathers directly into an existing output object.
+   *
+   * @details It calls the specialized `mpi_gather_into` function.
+   *
+   * @note We do not check if an MPI runtime environment is being used, i.e. if mpi::has_env is true. It is the
+   * responsibility of the specializations to do this check, in case they make direct calls to the MPI C library.
+   *
+   * @tparam T1 Type to be gathered.
+   * @tparam T2 Type to be gathered into.
+   * @param x_in Object to be gathered.
+   * @param x_out Object to be gathered into.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the gather.
+   */
+  template <typename T1, typename T2>
+  [[gnu::always_inline]] void gather_into(T1 &&x_in, T2 &&x_out, communicator c = {}, int root = 0, // NOLINT (forwarding is not needed)
+                                          bool all = false) {
+    mpi_gather_into(x_in, x_out, c, root, all);
+  }

   /**
    * @brief Generic MPI all-reduce.
    * @details It simply calls mpi::reduce with `all = true`.
    */
-  template <typename T> [[gnu::always_inline]] inline decltype(auto) all_reduce(T &&x, communicator c = {}, MPI_Op op = MPI_SUM) {
-    return reduce(std::forward<T>(x), c, 0, true, op);
+  template <typename T>
+  [[gnu::always_inline]] decltype(auto) all_reduce(T &&x, communicator c = {}, MPI_Op op = MPI_SUM) { // NOLINT (forwarding is not needed)
+    return reduce(x, c, 0, true, op);
   }

   /**
-   * @brief Generic MPI all-reduce in-place.
+   * @brief Generic MPI all-reduce in place.
    * @details It simply calls mpi::reduce_in_place with `all = true`.
    */
-  template <typename T> [[gnu::always_inline]] inline void all_reduce_in_place(T &&x, communicator c = {}, MPI_Op op = MPI_SUM) {
-    reduce_in_place(std::forward<T>(x), c, 0, true, op);
+  template <typename T>
+  [[gnu::always_inline]] void all_reduce_in_place(T &&x, communicator c = {}, MPI_Op op = MPI_SUM) { // NOLINT (forwarding is not needed)
+    reduce_in_place(x, c, 0, true, op);
+  }
+
+  /**
+   * @brief Generic MPI all-reduce that reduces directly into an existing output object.
+   * @details It simply calls mpi::reduce_into with `all = true`.
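+   *
+   * For example (illustrative only):
+   *
+   * @code{.cpp}
+   * int x = c.rank(), sum = 0;
+   * mpi::all_reduce_into(x, sum, c); // sum == 0 + 1 + ... + (c.size() - 1) on every rank
+   * @endcode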
+   */
+  template <typename T1, typename T2>
+  [[gnu::always_inline]] void all_reduce_into(T1 &&x_in, T2 &&x_out, communicator c = {}, MPI_Op op = MPI_SUM) { // NOLINT (forwarding is not needed)
+    return reduce_into(x_in, x_out, c, 0, true, op);
   }

   /**
    * @brief Generic MPI all-gather.
    * @details It simply calls mpi::gather with `all = true`.
    */
-  template <typename T> [[gnu::always_inline]] inline decltype(auto) all_gather(T &&x, communicator c = {}) {
-    return gather(std::forward<T>(x), c, 0, true);
+  template <typename T> [[gnu::always_inline]] decltype(auto) all_gather(T &&x, communicator c = {}) { // NOLINT (forwarding is not needed)
+    return gather(x, c, 0, true);
+  }
+
+  /**
+   * @brief Generic MPI all-gather that gathers directly into an existing output object.
+   * @details It simply calls mpi::gather_into with `all = true`.
+   */
+  template <typename T1, typename T2>
+  [[gnu::always_inline]] void all_gather_into(T1 &&x_in, T2 &&x_out, communicator c = {}) { // NOLINT (forwarding is not needed)
+    return gather_into(x_in, x_out, c, 0, true);
   }

   /**
-   * @brief Implementation of an MPI broadcast for types that have a corresponding MPI datatype, i.e. for which
-   * a specialization of mpi::mpi_type has been defined.
+   * @brief Checks if a given object is equal across all ranks in the given communicator.
+   *
+   * @details It makes two calls to mpi::all_reduce, one with `MPI_MIN` and the other with `MPI_MAX`, and compares
+   * their results.
+   *
+   * @note `MPI_MIN` and `MPI_MAX` need to make sense for the given type `T`.
+   *
+   * @tparam T Type to be checked.
+   * @param x Object to be equality compared.
+   * @param c mpi::communicator.
+   * @return True if the given object is equal on all ranks, false otherwise.
+   */
+  template <typename T> bool all_equal(T const &x, communicator c = {}) {
+    if (!has_env || c.size() < 2) return true;
+    auto min_obj = all_reduce(x, c, MPI_MIN);
+    auto max_obj = all_reduce(x, c, MPI_MAX);
+    return min_obj == max_obj;
+  }
+
+  /**
+   * @brief Implementation of an MPI broadcast for types that have a corresponding MPI datatype.
+   *
+   * @details If mpi::has_env is false or if the communicator size is < 2, it does nothing. Otherwise, it calls
+   * `MPI_Bcast`.
+   *
+   * It throws an exception in case the call to the MPI C library fails.
+   *
    * @tparam T Type to be broadcasted.
-   * @param x Object to be broadcasted.
+   * @param x Object to be broadcasted (into).
    * @param c mpi::communicator.
    * @param root Rank of the root process.
    */
   template <typename T>
     requires(has_mpi_type<T>)
   void mpi_broadcast(T &x, communicator c = {}, int root = 0) {
-    MPI_Bcast(&x, 1, mpi_type<T>::get(), root, c.get());
+    // in case there is no active MPI environment or if the communicator size is < 2, do nothing
+    if (!has_env || c.size() < 2) return;
+
+    // make the MPI C library call
+    check_mpi_call(MPI_Bcast(&x, 1, mpi_type<T>::get(), root, c.get()), "MPI_Bcast");
   }

   /**
-   * @brief Implementation of an MPI reduce for types that have a corresponding MPI datatype, i.e. for which
-   * a specialization of mpi::mpi_type has been defined.
+   * @brief Implementation of an MPI reduce for types that have a corresponding MPI datatype.
+   *
+   * @details If mpi::has_env is false or if the communicator size is < 2, it returns a copy of the input object.
+   * Otherwise, it calls `MPI_Allreduce` or `MPI_Reduce` with a default constructed output object.
+   *
+   * It throws an exception in case the call to the MPI C library fails.
+   *
    * @tparam T Type to be reduced.
    * @param x Object to be reduced.
    * @param c mpi::communicator.
    * @param root Rank of the root process.
    * @param all Should all processes receive the result of the reduction.
    * @param op `MPI_Op` used in the reduction.
-   * @return The result of the reduction.
+   * @return Result of the reduction.
    */
   template <typename T>
     requires(has_mpi_type<T>)
   T mpi_reduce(T const &x, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) {
-    T b;
-    auto d = mpi_type<T>::get();
-    if (!all)
-      // old MPI implementations may require a non-const send buffer
-      MPI_Reduce(const_cast<T *>(&x), &b, 1, d, op, root, c.get()); // NOLINT
-    else
-      MPI_Allreduce(const_cast<T *>(&x), &b, 1, d, op, c.get()); // NOLINT
-    return b;
+    // in case there is no active MPI environment or if the communicator size is < 2, return the input object
+    if (!has_env || c.size() < 2) return x;
+
+    // make the MPI C library call with a default constructed output object
+    T res;
+    if (all) {
+      check_mpi_call(MPI_Allreduce(&x, &res, 1, mpi_type<T>::get(), op, c.get()), "MPI_Allreduce");
+    } else {
+      check_mpi_call(MPI_Reduce(&x, &res, 1, mpi_type<T>::get(), op, root, c.get()), "MPI_Reduce");
+    }
+    return res;
   }

   /**
-   * @brief Implementation of an in-place MPI reduce for types that have a corresponding MPI datatype, i.e. for which
-   * a specialization of mpi::mpi_type has been defined.
+   * @brief Implementation of an MPI reduce that reduces directly into an existing output object for types that have a
+   * corresponding MPI datatype.
+   *
+   * @details If the addresses of the input and output objects are equal, the reduction is done in place.
+   *
+   * If mpi::has_env is false or if the communicator size is < 2, it either does nothing (in place) or copies the input
+   * into the output object. Otherwise, it calls `MPI_Allreduce` or `MPI_Reduce` (with `MPI_IN_PLACE`).
+   *
+   * It throws an exception in case the call to the MPI C library fails. It is expected that either all or none of
+   * the receiving processes choose the in place option.
    *
    * @tparam T Type to be reduced.
-   * @param x Object to be reduced.
+   * @param x_in Object to be reduced.
+   * @param x_out Object to be reduced into.
    * @param c mpi::communicator.
    * @param root Rank of the root process.
    * @param all Should all processes receive the result of the reduction.
    * @param op `MPI_Op` used in the reduction.
    */
   template <typename T>
     requires(has_mpi_type<T>)
-  void mpi_reduce_in_place(T &x, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) {
-    if (!all)
-      MPI_Reduce((c.rank() == root ? MPI_IN_PLACE : &x), &x, 1, mpi_type<T>::get(), op, root, c.get());
-    else
-      MPI_Allreduce(MPI_IN_PLACE, &x, 1, mpi_type<T>::get(), op, c.get());
+  void mpi_reduce_into(T const &x_in, T &x_out, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) {
+    // check if the reduction is in place
+    auto in_ptr         = static_cast<void const *>(&x_in);
+    auto out_ptr        = static_cast<void *>(&x_out);
+    bool const in_place = (in_ptr == out_ptr);
+    if (all) {
+      EXPECTS_WITH_MESSAGE(all_equal(static_cast<int>(in_place), c),
+                           "Either zero or all receiving processes have to choose the in place option in mpi_reduce_into");
+    }
+
+    // in case there is no active MPI environment or if the communicator size is < 2, do nothing (in place) or copy
+    if (!has_env || c.size() < 2) {
+      if (!in_place) x_out = x_in;
+      return;
+    }
+
+    // make the MPI C library call
+    if (in_place && (c.rank() == root || all)) in_ptr = MPI_IN_PLACE;
+    if (all) {
+      check_mpi_call(MPI_Allreduce(in_ptr, out_ptr, 1, mpi_type<T>::get(), op, c.get()), "MPI_Allreduce");
+    } else {
+      check_mpi_call(MPI_Reduce(in_ptr, out_ptr, 1, mpi_type<T>::get(), op, root, c.get()), "MPI_Reduce");
+    }
   }

+  /**
+   * @brief Implementation of an MPI gather for types that have a corresponding MPI datatype.
+   *
+   * @details It constructs an output vector, resizes it on receiving ranks to the size of the communicator and calls
+   * mpi::mpi_gather_into. On non-receiving ranks the output vector is empty.
+   *
+   * @tparam T Type to be gathered.
+   * @param x Object to be gathered.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the gather.
+   * @return `std::vector` containing the gathered objects.
+   */
+  template <typename T>
+    requires(has_mpi_type<T>)
+  std::vector<T> mpi_gather(T const &x, communicator c = {}, int root = 0, bool all = false) {
+    std::vector<T> res(c.rank() == root || all ? c.size() : 0);
+    mpi_gather_into(x, res, c, root, all);
+    return res;
+  }
+
+  /**
+   * @brief Implementation of an MPI gather that gathers directly into an existing output range for types that have a
+   * corresponding MPI datatype.
+   *
+   * @details If mpi::has_env is false or if the communicator size is < 2, it copies the input object into the range.
+   * Otherwise, it calls `MPI_Allgather` or `MPI_Gather`.
+   *
+   * It throws an exception in case a call to the MPI C library fails. It expects the range size on receiving
+   * processes to be equal to the communicator size.
+   *
+   * @tparam T Type to be gathered.
+   * @tparam R MPICompatibleRange type to be gathered into.
+   * @param x Object to be gathered.
+   * @param rg Range to be gathered into.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the gather.
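+   *
+   * For example (illustrative only):
+   *
+   * @code{.cpp}
+   * int x = c.rank();
+   * std::vector<int> res(c.rank() == 0 ? c.size() : 0);
+   * mpi::mpi_gather_into(x, res, c); // on rank 0: res == {0, 1, ..., c.size() - 1}
+   * @endcode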
+   */
+  template <typename T, MPICompatibleRange R>
+    requires(has_mpi_type<T> && std::same_as<T, std::ranges::range_value_t<std::remove_cvref_t<R>>>)
+  void mpi_gather_into(T const &x, R &&rg, communicator c = {}, int root = 0, bool all = false) { // NOLINT (ranges need not be forwarded)
+    // check the size of the output range
+    if (c.rank() == root || all) {
+      EXPECTS_WITH_MESSAGE(c.size() == std::ranges::size(rg), "Output range size is not equal to the number of ranks in mpi_gather_into");
+    }
+
+    // in case there is no active MPI environment or if the communicator size is < 2, copy the input into the range
+    if (!has_env || c.size() < 2) {
+      std::ranges::copy(std::views::single(x), std::ranges::begin(rg));
+      return;
+    }
+
+    // make the MPI C library call
+    using value_t = std::ranges::range_value_t<std::remove_cvref_t<R>>;
+    if (all) {
+      check_mpi_call(MPI_Allgather(&x, 1, mpi_type<value_t>::get(), std::ranges::data(rg), 1, mpi_type<value_t>::get(), c.get()), "MPI_Allgather");
+    } else {
+      check_mpi_call(MPI_Gather(&x, 1, mpi_type<value_t>::get(), std::ranges::data(rg), 1, mpi_type<value_t>::get(), root, c.get()), "MPI_Gather");
+    }
  }

  /** @} */

diff --git a/c++/mpi/group.hpp b/c++/mpi/group.hpp
new file mode 100644
index 00000000..46832f47
--- /dev/null
+++ b/c++/mpi/group.hpp
@@ -0,0 +1,135 @@
+// Copyright (c) 2024 Simons Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Alexander Hampel, Olivier Parcollet, Nils Wentzell
+
+/**
+ * @file
+ * @brief Provides a C++ wrapper class for an `MPI_Group` object.
+ */
+
+#pragma once
+
+#include "./communicator.hpp"
+#include "./environment.hpp"
+#include "./utils.hpp"
+
+#include <mpi.h>
+
+#include <utility>
+#include <vector>
+
+namespace mpi {
+
+  /**
+   * @ingroup mpi_essentials
+   * @brief C++ wrapper around `MPI_Group` providing various convenience functions.
+   *
+   * @details It stores an `MPI_Group` object as its only member which by default is set to `MPI_GROUP_NULL`.
+   * The underlying `MPI_Group` object is automatically freed when a group object goes out of scope.
+   *
+   * This class follows move-only semantics and takes ownership of the wrapped `MPI_Group` object.
+   *
+   * All functions that make direct calls to the MPI C library throw an exception in case the call fails.
+   */
+  class group {
+    public:
+    /// Construct a group with `MPI_GROUP_NULL`.
+    group() = default;
+
+    /// Deleted copy constructor.
+    group(group const &) = delete;
+
+    /// Deleted copy assignment operator.
+    group &operator=(group const &) = delete;
+
+    /// Move constructor leaves the moved-from object with `MPI_GROUP_NULL`.
+    group(group &&other) noexcept : grp_{std::exchange(other.grp_, MPI_GROUP_NULL)} {}
+
+    /// Move assignment operator leaves the moved-from object with `MPI_GROUP_NULL`.
+    group &operator=(group &&rhs) noexcept {
+      if (this != std::addressof(rhs)) {
+        free();
+        grp_ = std::exchange(rhs.grp_, MPI_GROUP_NULL);
+      }
+      return *this;
+    }
+
+    /// Destructor calls free() to release the group.
+    ~group() { free(); }
+
+    /**
+     * @brief Take ownership of an existing `MPI_Group` object.
+     * @param grp `MPI_Group` to be handled.
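+     *
+     * For illustration, a minimal sketch of handing a raw handle to the wrapper (plain MPI C calls are used here only
+     * to obtain the raw group; an initialized MPI environment is assumed):
+     * @code{.cpp}
+     * MPI_Group raw = MPI_GROUP_NULL;
+     * MPI_Comm_group(MPI_COMM_WORLD, &raw);
+     * mpi::group g{raw}; // g now owns raw and frees it when it goes out of scope
+     * @endcode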
+     */
+    explicit group(MPI_Group grp) : grp_(grp) {}
+
+    /**
+     * @brief Create a group from a communicator by calling `MPI_Comm_group`.
+     * @param c mpi::communicator from which to create a group.
+     */
+    explicit group(communicator c) {
+      if (has_env) check_mpi_call(MPI_Comm_group(c.get(), &grp_), "MPI_Comm_group");
+    }
+
+    /// Get the wrapped `MPI_Group` object.
+    [[nodiscard]] MPI_Group get() const noexcept { return grp_; }
+
+    /// Check if the contained `MPI_Group` is `MPI_GROUP_NULL`.
+    [[nodiscard]] bool is_null() const noexcept { return grp_ == MPI_GROUP_NULL; }
+
+    /**
+     * @brief Get the rank of the calling process in the group.
+     * @return The result of `MPI_Group_rank` if mpi::has_env is true, otherwise 0.
+     */
+    [[nodiscard]] int rank() const {
+      int r = 0;
+      if (has_env) check_mpi_call(MPI_Group_rank(grp_, &r), "MPI_Group_rank");
+      return r;
+    }
+
+    /**
+     * @brief Get the size of the group.
+     * @return The result of `MPI_Group_size` if mpi::has_env is true, otherwise 1.
+     */
+    [[nodiscard]] int size() const {
+      int s = 1;
+      if (has_env) check_mpi_call(MPI_Group_size(grp_, &s), "MPI_Group_size");
+      return s;
+    }
+
+    /**
+     * @brief Create a new group by calling `MPI_Group_incl`.
+     *
+     * @details It produces a new group by reordering the existing group and taking only the listed members.
+     *
+     * @param ranks List of ranks to include in the new group.
+     * @return New group containing only the listed members.
+     */
+    [[nodiscard]] group include(std::vector<int> const &ranks) const {
+      MPI_Group newgroup = MPI_GROUP_NULL;
+      if (has_env) check_mpi_call(MPI_Group_incl(grp_, static_cast<int>(ranks.size()), ranks.data(), &newgroup), "MPI_Group_incl");
+      return group{newgroup};
+    }
+
+    /// Free the group by calling `MPI_Group_free` (if it is not `MPI_GROUP_NULL`).
+    void free() noexcept {
+      if (has_env && !is_null()) MPI_Group_free(&grp_);
+    }
+
+    private:
+    MPI_Group grp_ = MPI_GROUP_NULL;
+  };
+
+} // namespace mpi
diff --git a/c++/mpi/lazy.hpp b/c++/mpi/lazy.hpp
deleted file mode 100644
index e142ca56..00000000
--- a/c++/mpi/lazy.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (c) 2024 Simons Foundation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Authors: Thomas Hahn, Alexander Hampel, Olivier Parcollet, Nils Wentzell
-
-/**
- * @file
- * @brief Provides a struct and tags to represent lazy MPI communication.
- */
-
-#pragma once
-
-#include "./communicator.hpp"
-
-#include <mpi.h>
-
-namespace mpi {
-
-  namespace tag {
-
-    /**
-     * @ingroup mpi_lazy
-     * @brief Tag to specify a lazy MPI reduce call.
-     */
-    struct reduce {};
-
-    /**
-     * @ingroup mpi_lazy
-     * @brief Tag to specify a lazy MPI scatter call.
-     */
-    struct scatter {};
-
-    /**
-     * @ingroup mpi_lazy
-     * @brief Tag to specify a lazy MPI gather call.
-     */
-    struct gather {};
-
-  } // namespace tag
-
-  /**
-   * @addtogroup mpi_lazy
-   * @{
-   */
-
-  /**
-   * @brief Represents a lazy MPI communication.
-   *
-   * @tparam Tag An mpi::tag to specify the kind of MPI communication.
-   * @tparam T Type to be communicated.
- */ - template struct lazy { - /// Object to be communicated. - T rhs; - - /// mpi::communicator used in the lazy communication. - communicator c; - - /// Rank of the root process. - int root{}; - - /// Whether to use the `MPI_Allxxx` operation - bool all{}; - - /// `MPI_Op` used in the lazy communication (only relevant if mpi::tag::reduce is used). - MPI_Op op{}; - }; - - /** - * @brief Type trait to check if a type is mpi::lazy. - * @tparam T Type to be checked. - */ - template inline constexpr bool is_mpi_lazy = false; - - /** - * @brief Spezialization of mpi::is_mpi_lazy. - * - * @tparam Tag Type to specify the kind of MPI call. - * @tparam T Type to be checked. - */ - template inline constexpr bool is_mpi_lazy> = true; - - /** @} */ - -} // namespace mpi diff --git a/c++/mpi/macros.hpp b/c++/mpi/macros.hpp index 46e8acbc..cd566580 100644 --- a/c++/mpi/macros.hpp +++ b/c++/mpi/macros.hpp @@ -87,16 +87,17 @@ #ifdef NDEBUG -#define EXPECTS(X) -#define ASSERT(X) -#define ENSURES(X) -#define EXPECTS_WITH_MESSAGE(X, ...) -#define ASSERT_WITH_MESSAGE(X, ...) -#define ENSURES_WITH_MESSAGE(X, ...) +#define EXPECTS(X) {} +#define ASSERT(X) {} +#define ENSURES(X) {} +#define EXPECTS_WITH_MESSAGE(X, ...) {} +#define ASSERT_WITH_MESSAGE(X, ...) {} +#define ENSURES_WITH_MESSAGE(X, ...) {} #else #include +#include #define EXPECTS(X) \ if (!(X)) { \ diff --git a/c++/mpi/monitor.hpp b/c++/mpi/monitor.hpp index d61ccc1b..47931cd2 100644 --- a/c++/mpi/monitor.hpp +++ b/c++/mpi/monitor.hpp @@ -16,14 +16,14 @@ /** * @file - * @brief Provides a class for monitoring and communicating exceptions and other errors of - * individual processes. + * @brief Provides a class for monitoring and communicating events across multiple processes. */ #pragma once +#include "./communicator.hpp" #include "./macros.hpp" -#include "./mpi.hpp" +#include "./utils.hpp" #include @@ -33,41 +33,56 @@ namespace mpi { /** - * @ingroup err_handling - * @brief Constructed on top of an MPI communicator, this class helps to monitor and communicate - * exceptions and other errors of individual processes. + * @ingroup event_handling + * @brief Constructed on top of an MPI communicator, this class helps to monitor and communicate events across + * multiple processes. * - * @details The root process (process with rank 0) monitors all other processes. If a process encounters - * an error, it sends an emergeny stop request to the root process which forwards it to all the other - * processes. + * @details The root process (rank == 0) monitors all other processes. If a process encounters an event, it sends a + * message to the root process by calling monitor::report_local_event. The root process then broadcasts this + * information to all other processes. + * + * It can be used to check + * - if an event has occurred on any process (monitor::event_on_any_rank) or + * - if an event has occurred on all processes (monitor::event_on_all_ranks). + * + * It uses a duplicate communicator to not interfere with other MPI communications. The communicator is freed in the + * `finalize_communications` function (which is called in the destructor if not called before). + * + * All functions that make direct calls to the MPI C library throw an exception in case the call fails. */ class monitor { - // Future struct for the non-blocking send/receive done on the root process. + // Future struct for non-blocking MPI communication. struct future { - // MPI request for the non-blocking receive on the root process. + // MPI request of the non-blocking MPI call. 
    MPI_Request request{};

-     // 0 means that no error has occurred, 1 means that an error has occurred.
-     int node_stop = 0;
+     // 0 means that no event has occurred, 1 means that an event has occurred.
+     int event = 0;
    };

    // MPI communicator.
-   mpi::communicator com;
+   mpi::communicator comm;

-   // Future objects stored on the root process for every non-root process.
+   // Future objects stored on the root process for local events on non-root processes.
    std::vector<future> root_futures;

-   // MPI request for broadcasting the emergency stop to all non-root processes.
-   MPI_Request req_ibcast{};
+   // MPI request for the broadcasting done on the root process in case an event has occurred on any rank.
+   MPI_Request req_ibcast_any{};

-   // MPI request for sending the emergency stop request to the root process.
+   // MPI request for the broadcasting done on the root process in case an event has occurred on all ranks.
+   MPI_Request req_ibcast_all{};
+
+   // MPI request for the sending done on non-root processes.
    MPI_Request req_isent{};

-   // Set to 1, if the process has encountered a local error and requested an emergency stop.
-   int local_stop = 0;
+   // Set to 1, if a local event has occurred on this process.
+   int local_event = 0;
+
+   // Set to 1, if an event has occurred on any process.
+   int any_event = 0;

-   // Set to 1, if the process has received an emergency stop broadcasted by the root process.
-   int global_stop = 0;
+   // Set to 1, if an event has occurred on all processes.
+   int all_events = 0;

    // Set to true, if finalize_communications() has been called.
    bool finalized = false;

@@ -76,20 +91,26 @@ namespace mpi {
    /**
     * @brief Construct a monitor on top of a given mpi::communicator.
     *
-     * @details The root process performs a non-blocking receive for every non-root process and waits for
-     * a non-root process to send an emergency stop request. Non-root processes make a non-blocking broadcast
-     * call and wait for the root process to broadcast any emergency stop request it has received.
+     * @details The communicator is duplicated to not interfere with other MPI communications.
+     *
+     * The root process (rank == 0) performs a non-blocking receive for every non-root process and waits for a
+     * non-root process to send a message that an event has occurred.
+     *
+     * Non-root processes make two non-blocking broadcast calls and wait for the root process to broadcast a message in
+     * case an event has occurred on any or on all processes.
     *
     * @param c mpi::communicator.
     */
-   monitor(mpi::communicator c) : com(c) {
-     if (com.rank() == 0) {
+   monitor(mpi::communicator c) : comm(c.duplicate()) {
+     if (comm.rank() == 0) {
        root_futures.resize(c.size() - 1);
        for (int rank = 1; rank < c.size(); ++rank) {
-         MPI_Irecv(&(root_futures[rank - 1].node_stop), 1, MPI_INT, rank, 0, MPI_COMM_WORLD, &(root_futures[rank - 1].request));
+         check_mpi_call(MPI_Irecv(&(root_futures[rank - 1].event), 1, MPI_INT, rank, rank, comm.get(), &(root_futures[rank - 1].request)),
+                        "MPI_Irecv");
        }
      } else {
-       MPI_Ibcast(&global_stop, 1, MPI_INT, 0, MPI_COMM_WORLD, &req_ibcast);
+       check_mpi_call(MPI_Ibcast(&any_event, 1, MPI_INT, 0, comm.get(), &req_ibcast_any), "MPI_Ibcast");
+       check_mpi_call(MPI_Ibcast(&all_events, 1, MPI_INT, 0, comm.get(), &req_ibcast_all), "MPI_Ibcast");
      }
    }

@@ -103,106 +124,162 @@
    ~monitor() { finalize_communications(); }

    /**
-     * @brief Request an emergency stop.
+     * @brief Report a local event to the root process (rank == 0).
+     *
+     * @details This function can be called on any process in case a local event has occurred.
     *
-     * @details This function can be called on any process in case a local error has occurred. On the
-     * root process, it sets its `local_stop` and `global_stop` members to 1 and broadcasts `global_stop`
-     * to all non-root processes. On non-root processes, it sets `local_stop` to 1 and sends it to the
-     * root process.
+     * On the root process, it immediately broadcasts to all other processes that an event has occurred and further
+     * checks if all other processes have reported an event as well. If so, it additionally broadcasts to all processes
+     * that an event has occurred on all processes.
+     *
+     * On non-root processes, it sends a message to the root process that a local event has occurred.
     */
-   void request_emergency_stop() {
-     EXPECTS(!finalized);
+   void report_local_event() {
      // prevent sending multiple signals
-     if (local_stop) { return; }
-
-     // an error has occurred
-     local_stop = 1;
-     if (com.rank() == 0) {
-       // root broadcasts the global_stop variable
-       global_stop = 1;
-       MPI_Ibcast(&global_stop, 1, MPI_INT, 0, MPI_COMM_WORLD, &req_ibcast);
+     if (local_event or finalized) { return; }
+
+     // a local event has occurred
+     local_event = 1;
+     if (comm.rank() == 0) {
+       // on the root process, check all other nodes and perform the necessary broadcasts
+       root_check_nodes_and_bcast();
      } else {
-       // non-root sends the local_stop variable to root
-       MPI_Isend(&local_stop, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &req_isent);
+       // on non-root processes, let the root process know about the local event
+       check_mpi_call(MPI_Isend(&local_event, 1, MPI_INT, 0, comm.rank(), comm.get(), &req_isent), "MPI_Isend");
      }
    }

    /**
-     * @brief Check if an emergency stop has been requested.
+     * @brief Check if an event has occurred on any process.
+     *
+     * @details This function can be called on any process to check if an event has occurred somewhere.
     *
-     * @details This function can be called on any process to check if an emergency has occurred somewhere.
-     * It first checks, if its `local_stop` or `global_stop` members are set to 1 and returns `true` in case
-     * one of them is. Otherwise, on the root process, it calls `root_check_nodes_and_bcast()` to check if
-     * some other process has sent an emergency message and to possibly forward the received signal.
-     * On non-root processes, it checks if the root process has broadcasted an emergency stop, which it has
-     * received from some other process.
+     * It returns true if
+     * - a local event has occurred or
+     * - an event has occurred on some other process which has already been reported to the root process and
+     *   broadcasted to all other processes.
     *
-     * @return True, if an emergency stop has been requested. Otherwise, it returns false.
+     * On the root process (rank == 0), it checks the status of all non-root processes and performs the necessary
+     * broadcasts in case they have not been done yet.
+     *
+     * @return True, if an event has occurred on any process.
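+     *
+     * A minimal usage sketch (an initialized MPI environment is assumed; `do_work` is a hypothetical function used
+     * only for illustration):
+     * @code{.cpp}
+     * mpi::monitor m{mpi::communicator{}};
+     * try {
+     *   do_work();
+     * } catch (...) {
+     *   m.report_local_event();
+     * }
+     * if (m.event_on_any_rank()) {
+     *   // some process has reported an event, e.g. stop early
+     * }
+     * m.finalize_communications();
+     * @endcode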
     */
-    [[nodiscard]] bool emergency_occured() {
-      // if final_communications() has already been called, global_stop == 0 if no error has occurred, otherwise it is 1
-      if (finalized) return global_stop;
+    [[nodiscard]] bool event_on_any_rank() {
+      // if finalize_communications() has already been called, any_event == 0 if no event has occurred, otherwise it is 1
+      if (finalized) return any_event;
+
+      // if a local event has occurred, we return true
+      if (local_event) return true;
+
+      // on the root process, we first check the status of all non-root processes, perform the necessary broadcasts and
+      // return true if an event has occurred
+      if (comm.rank() == 0) {
+        root_check_nodes_and_bcast();
+        return any_event;
+      }
+
+      // on non-root processes, we check the status of the corresponding broadcast and return true if an event has
+      // occurred
+      MPI_Status status;
+      int has_received = 0;
+      check_mpi_call(MPI_Test(&req_ibcast_any, &has_received, &status), "MPI_Test");
+      return has_received and any_event;
+    }

-      // either a local error has occurred or some other process has requested an emergency stop
+    /**
+     * @brief Check if an event has occurred on all processes.
+     *
+     * @details This function can be called on any process to check if an event has occurred on all processes.
+     *
+     * It returns true if an event has occurred on all processes and this has already been reported to the root
+     * process and broadcasted to all other processes.
+     *
+     * On the root process (rank == 0), it checks the status of all non-root processes and performs the necessary
+     * broadcasts in case they have not been done yet.
+     *
+     * @return True, if an event has occurred on all processes.
+     */
+    [[nodiscard]] bool event_on_all_ranks() {
+      // if finalize_communications() has already been called, all_events == 0 if an event has not occurred on every
+      // process, otherwise it is 1
+      if (finalized) return all_events;
-      if (global_stop or local_stop) return true;

-      if (com.rank() == 0) {
-        // root checks if some other process has requested an emergency stop
+      // on the root process, we first check the status of all non-root processes, perform the necessary broadcasts and
+      // return true if an event has occurred on all of them
+      if (comm.rank() == 0) {
        root_check_nodes_and_bcast();
+        return all_events;
      }
-      return global_stop;
+
+      // on non-root processes, we check the status of the broadcast and return true if an event has occurred on all
+      // processes
+      MPI_Status status;
+      int has_received = 0;
+      check_mpi_call(MPI_Test(&req_ibcast_all, &has_received, &status), "MPI_Test");
+      return has_received and all_events;
    }

    /**
     * @brief Finalize all pending communications.
     *
-     * @details At the end of this function, all processes have completed their work or have had a local
-     * emergency stop. The member `global_stop` is guaranteed to be the same on all processes when this
-     * function returns.
+     * @details At the end of this function, all MPI communications have been completed and the values of the member
+     * variables will no longer change due to subsequent member function calls.
+     *
+     * Furthermore, it frees the used communicator.
*/ void finalize_communications() { + // prevent multiple calls if (finalized) return; - if (com.rank() == 0) { - // root just listens to the other processes and bcasts the global_stop until everyone is done - while (root_check_nodes_and_bcast()) { usleep(100); } // 100 us (micro seconds) - // all others node have finished - // if the root has never emitted the ibcast, we do it now - if (not global_stop) { MPI_Ibcast(&global_stop, 1, MPI_INT, 0, MPI_COMM_WORLD, &req_ibcast); } + + if (comm.rank() == 0) { + // on root process, wait for all non-root processes to finish their MPI_Isend calls + while (root_check_nodes_and_bcast()) { + usleep(100); // 100 us (micro seconds) + } + // and perform broadcasts in case they have not been done yet + if (not any_event) { check_mpi_call(MPI_Ibcast(&any_event, 1, MPI_INT, 0, comm.get(), &req_ibcast_any), "MPI_Ibcast"); } + if (not all_events) { check_mpi_call(MPI_Ibcast(&all_events, 1, MPI_INT, 0, comm.get(), &req_ibcast_all), "MPI_Ibcast"); } } else { - // on non-root node: either Isend was done when local_stop was set to 1 during request_emergency_stop, - // or it has to happen now, i.e, work is done, and fine. - if (not local_stop) { MPI_Isend(&local_stop, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &req_isent); } + // on non-root processes, perform MPI_Isend call in case it has not been done yet + if (not local_event) { check_mpi_call(MPI_Isend(&local_event, 1, MPI_INT, 0, comm.rank(), comm.get(), &req_isent), "MPI_Isend"); } } - // all nodes wait for the ibcast of the global_stop to be complete - MPI_Status status; - MPI_Wait(&req_ibcast, &status); + + // all nodes wait for the broadcasts to be completed + MPI_Status status_any, status_all; + check_mpi_call(MPI_Wait(&req_ibcast_any, &status_any), "MPI_Wait"); + check_mpi_call(MPI_Wait(&req_ibcast_all, &status_all), "MPI_Wait"); + + // free the communicator + comm.free(); finalized = true; } private: - /** - * @brief Check if any non-root process has sent a stop request. If so, broadcast to all other processes - * in case it has not been done yet. - * - * @return True, if at least one process has not finished the `MPI_Isend` of the `local_stop` variable to - * the root process. Otherwise, it returns false. - */ + // Root process checks the status of all non-root processes, performs necessary broadcasts and returns a boolean + // that is true if at least one non-root process has not performed its MPI_Isend call yet. bool root_check_nodes_and_bcast() { EXPECTS(!finalized); - EXPECTS(com.rank() == 0); - // loop over all non-root processes - bool some_nodes_are_still_running = false; - for (auto &[request, node_stop] : root_futures) { - // check for an emergency stop request + EXPECTS(comm.rank() == 0); + bool any = false; + bool all = true; + bool finished = true; + for (auto &[request, rank_event] : root_futures) { MPI_Status status; - int comm_received = 0; - MPI_Test(&request, &comm_received, &status); - // for the first time an emergency stop has been requested -> root calls request_emergency_stop() - // to broadcast to all other processes - if (comm_received and (not global_stop) and node_stop) request_emergency_stop(); // the root requires the stop now. It also stops itself... 
- some_nodes_are_still_running |= (not comm_received); + int rank_received = 0; + check_mpi_call(MPI_Test(&request, &rank_received, &status), "MPI_Test"); + any |= (rank_received and rank_event); + all &= (rank_received and rank_event); + finished &= rank_received; + } + if (not any_event and (any or local_event)) { + any_event = 1; + check_mpi_call(MPI_Ibcast(&any_event, 1, MPI_INT, 0, comm.get(), &req_ibcast_any), "MPI_Ibcast"); + } + if (not all_events and all and local_event) { + all_events = 1; + check_mpi_call(MPI_Ibcast(&all_events, 1, MPI_INT, 0, comm.get(), &req_ibcast_all), "MPI_Ibcast"); } - return some_nodes_are_still_running; + return not finished; } }; diff --git a/c++/mpi/mpi.hpp b/c++/mpi/mpi.hpp index 838b76a6..0c565abe 100644 --- a/c++/mpi/mpi.hpp +++ b/c++/mpi/mpi.hpp @@ -21,13 +21,21 @@ #pragma once +#include "./array.hpp" #include "./chunk.hpp" #include "./communicator.hpp" #include "./datatypes.hpp" #include "./environment.hpp" #include "./generic_communication.hpp" -#include "./lazy.hpp" +#include "./monitor.hpp" #include "./operators.hpp" +#include "./optional.hpp" +#include "./pair.hpp" +#include "./ranges.hpp" +#include "./string.hpp" +#include "./utils.hpp" +#include "./vector.hpp" +#include "./window.hpp" namespace mpi { diff --git a/c++/mpi/operators.hpp b/c++/mpi/operators.hpp index bfcef42d..3d20dfdf 100644 --- a/c++/mpi/operators.hpp +++ b/c++/mpi/operators.hpp @@ -21,6 +21,8 @@ #pragma once +#include "./utils.hpp" + #include namespace mpi { @@ -35,6 +37,8 @@ namespace mpi { * * @details The binary function must have the following signature `(T const&, T const&) -> T`. * + * It throws an exception in case a call to the MPI C library fails. + * * @tparam T Type on which the binary function operates. * @tparam F Binary function pointer to be mapped. * @return `MPI_Op` created from the binary function. @@ -46,7 +50,7 @@ namespace mpi { auto *inoutT = static_cast(inout); for (int i = 0; i < *len; ++i, ++inT, ++inoutT) { *inoutT = F(*inoutT, *inT); } }; - MPI_Op_create(map_function, true, &myOp); + check_mpi_call(MPI_Op_create(map_function, true, &myOp), "MPI_Op_create"); return myOp; } diff --git a/c++/mpi/optional.hpp b/c++/mpi/optional.hpp new file mode 100644 index 00000000..7cdd1014 --- /dev/null +++ b/c++/mpi/optional.hpp @@ -0,0 +1,134 @@ +// Copyright (c) 2019-2024 Simons Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Nils Wentzell + +/** + * @file + * @brief Provides an MPI broadcast and reduce for `std::optional`. + */ + +#pragma once + +#include "./communicator.hpp" +#include "./generic_communication.hpp" + +#include + +#include +#include +#include +#include + +namespace mpi { + + /** + * @addtogroup coll_comm + * @{ + */ + + /** + * @brief Implementation of an MPI broadcast for a `std::optional`. + * + * @details It first broadcasts a flag indicating whether the optional has a value. If the root's optional has a + * value, the value is broadcast to all other processes. 
If the root's optional is empty, all other processes reset + * their optional to empty. + * + * @tparam T Value type of the optional. + * @param opt `std::optional` to broadcast. + * @param c mpi::communicator. + * @param root Rank of the root process. + */ + template void mpi_broadcast(std::optional &opt, communicator c = {}, int root = 0) { + bool has_val = opt.has_value(); + broadcast(has_val, c, root); + + if (has_val) { + if (!opt.has_value()) opt.emplace(); + broadcast(*opt, c, root); + } else { + opt.reset(); + } + } + + /** + * @brief Implementation of an MPI reduce for a `std::optional`. + * + * @details All ranks must have consistent has_value state (all have values or all are empty). If this condition is + * violated, a `std::runtime_error` is thrown. + * + * If all optionals have values, the values are reduced and returned in an optional. If all optionals are empty, an + * empty optional is returned. + * + * @tparam T Value type of the optional. + * @param opt `std::optional` to reduce. + * @param c mpi::communicator. + * @param root Rank of the root process. + * @param all Should all processes receive the result of the reduction. + * @param op `MPI_Op` used in the reduction. + * @return `std::optional` containing the result of the reduction. + */ + template + std::optional mpi_reduce(std::optional const &opt, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { + // Verify consistency: sum of has_value should be 0 or c.size() + int has_val = opt.has_value() ? 1 : 0; + int total = mpi::all_reduce(has_val, c, MPI_SUM); + if (total != 0 && total != c.size()) { + throw std::runtime_error("mpi::reduce for std::optional requires all ranks to have consistent has_value state"); + } + + if (opt.has_value()) return reduce(*opt, c, root, all, op); + else return {}; + } + + /** + * @brief Implementation of an MPI reduce for a `std::optional` that reduces directly into a given output optional. + * + * @details All ranks must have consistent has_value state (all have values or all are empty). If this condition is + * violated, a `std::runtime_error` is thrown. + * + * If all input optionals have values, the values are reduced into the output optional on all ranks, but only root + * (or all ranks if `all` is true) receives the meaningful result. If all input optionals are empty, the output + * optional is reset to empty on all ranks. + * + * @tparam T1 Value type of the optional to be reduced. + * @tparam T2 Value type of the optional to be reduced into. + * @param opt_in `std::optional` to reduce. + * @param opt_out `std::optional` to reduce into. + * @param c mpi::communicator. + * @param root Rank of the root process. + * @param all Should all processes receive the result of the reduction. + * @param op `MPI_Op` used in the reduction. + */ + template + void mpi_reduce_into(std::optional const &opt_in, std::optional &opt_out, communicator c = {}, int root = 0, bool all = false, + MPI_Op op = MPI_SUM) { + // Verify consistency + int has_val = opt_in.has_value() ? 
1 : 0; + int total = mpi::all_reduce(has_val, c, MPI_SUM); + if (total != 0 && total != c.size()) { + throw std::runtime_error("mpi::reduce_into for std::optional requires all ranks to have consistent has_value state"); + } + + if (opt_in.has_value()) { + if (!opt_out.has_value()) opt_out.emplace(); + reduce_into(*opt_in, *opt_out, c, root, all, op); + } else { + opt_out.reset(); + } + } + + /** @} */ + +} // namespace mpi diff --git a/c++/mpi/pair.hpp b/c++/mpi/pair.hpp index e8900ea8..115d6ed1 100644 --- a/c++/mpi/pair.hpp +++ b/c++/mpi/pair.hpp @@ -16,7 +16,7 @@ /** * @file - * @brief Provides an MPI broadcast and reduce for std::pair. + * @brief Provides an MPI broadcast and reduce for `std::pair`. */ #pragma once @@ -35,13 +35,13 @@ namespace mpi { */ /** - * @brief Implementation of an MPI broadcast for a std::pair. + * @brief Implementation of an MPI broadcast for a `std::pair`. * - * @details Simply calls the generic mpi::broadcast for the first and second element of the pair. + * @details It calls the generic mpi::broadcast for the first and second element of the pair. * * @tparam T1 Type of the first element of the pair. * @tparam T2 Type of the second element of the pair. - * @param p std::pair to broadcast. + * @param p `std::pair` to broadcast. * @param c mpi::communicator. * @param root Rank of the root process. */ @@ -51,18 +51,18 @@ namespace mpi { } /** - * @brief Implementation of an MPI reduce for a std::pair. + * @brief Implementation of an MPI reduce for a `std::pair`. * - * @details Simply calls the generic mpi::reduce for the first and second element of the pair. + * @details It calls the generic mpi::reduce for the first and second element of the pair separately. * * @tparam T1 Type of the first element of the pair. * @tparam T2 Type of the second element of the pair. - * @param p std::pair to be reduced. + * @param p `std::pair` to be reduced. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. - * @return std::pair containing the result of each individual reduction. + * @return `std::pair` containing the results of the two reductions. */ template auto mpi_reduce(std::pair const &p, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { diff --git a/c++/mpi/ranges.hpp b/c++/mpi/ranges.hpp new file mode 100644 index 00000000..63ff0d11 --- /dev/null +++ b/c++/mpi/ranges.hpp @@ -0,0 +1,314 @@ +// Copyright (c) 2024 Simons Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Alexander Hampel, Olivier Parcollet, Nils Wentzell + +/** + * @file + * @brief Provides an MPI broadcast, reduce, scatter and gather for generic ranges. 
+ */
+
+#pragma once
+
+#include "./chunk.hpp"
+#include "./communicator.hpp"
+#include "./datatypes.hpp"
+#include "./environment.hpp"
+#include "./generic_communication.hpp"
+#include "./macros.hpp"
+#include "./utils.hpp"
+
+#include <itertools/itertools.hpp>
+#include <mpi.h>
+
+#include <algorithm>
+#include <concepts>
+#include <limits>
+#include <numeric>
+#include <ranges>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+namespace mpi {
+
+  /**
+   * @addtogroup coll_comm
+   * @{
+   */
+
+  /**
+   * @brief Implementation of an MPI broadcast for `std::ranges::sized_range` objects.
+   *
+   * @details The behaviour of this function is as follows:
+   * - If the number of elements to be broadcasted is zero, it does nothing.
+   * - If the range is contiguous with an MPI compatible value type, it calls `MPI_Bcast` and broadcasts the elements
+   *   from the input range on the root process to all other processes.
+   * - Otherwise, it calls mpi::broadcast for each element separately.
+   *
+   * It throws an exception in case a call to the MPI C library fails and it expects that the input range size is equal
+   * on all processes.
+   *
+   * @tparam R `std::ranges::sized_range` type.
+   * @param rg Range to be broadcasted (into).
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   */
+  template <std::ranges::sized_range R>
+  void broadcast_range(R &&rg, communicator c = {}, int root = 0) { // NOLINT (ranges need not be forwarded)
+    // check the size of the range
+    auto size = static_cast<long>(std::ranges::size(rg));
+    EXPECTS_WITH_MESSAGE(all_equal(size, c), "Range sizes are not equal on all processes in mpi::broadcast_range");
+
+    // do nothing if no elements are broadcasted
+    if (size <= 0) return;
+
+    // call the MPI C library if the ranges are contiguous with MPI compatible value types, otherwise do element-wise
+    // broadcasts
+    if constexpr (MPICompatibleRange<R>) {
+      // in case there is no active MPI environment or if the communicator size is < 2, do nothing
+      if (!has_env || c.size() < 2) return;
+
+      // make the MPI C library call (allow the number of elements to be larger than INT_MAX)
+      constexpr long max_int = std::numeric_limits<int>::max();
+      for (long offset = 0; size > 0; offset += max_int, size -= max_int) {
+        auto const count = static_cast<int>(std::min(size, max_int));
+        check_mpi_call(MPI_Bcast(std::ranges::data(rg) + offset, count, mpi_type<std::ranges::range_value_t<R>>::get(), root, c.get()), "MPI_Bcast");
+      }
+    } else {
+      // otherwise call the generic broadcast for each element separately
+      for (auto &x : rg) broadcast(x, c, root);
+    }
+  }
+
+  /**
+   * @brief Implementation of an MPI reduce for `std::ranges::sized_range` objects.
+   *
+   * @details The behaviour of this function is as follows:
+   * - If the number of elements to be reduced is zero, it does nothing.
+   * - If the range is contiguous with an MPI compatible value type, it calls `MPI_Reduce` or `MPI_Allreduce` to reduce
+   *   the elements in the input ranges into the output ranges on receiving ranks.
+   * - If the input and output ranges point to the same data, the reduction is done in place.
+   * - Otherwise, it calls mpi::reduce_into for each input-output element pair separately.
+   *
+   * It throws an exception in case a call to the MPI C library fails and it expects
+   * - that the input range size on all processes and the output range size on receiving processes are equal and
+   * - that either all or none of the receiving processes choose the in place option.
+   *
+   * @tparam R1 `std::ranges::sized_range` type.
+   * @tparam R2 `std::ranges::sized_range` type.
+   * @param in_rg Range to be reduced.
+   * @param out_rg Range to be reduced into.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the reduction.
+   * @param op `MPI_Op` used in the reduction.
+   */
+  template <std::ranges::sized_range R1, std::ranges::sized_range R2>
+  void reduce_range(R1 &&in_rg, R2 &&out_rg, communicator c = {}, int root = 0, bool all = false, // NOLINT (ranges need not be forwarded)
+                    MPI_Op op = MPI_SUM) {
+    // check the size of the input range
+    auto size = static_cast<long>(std::ranges::size(in_rg));
+    EXPECTS_WITH_MESSAGE(all_equal(size, c), "Input range sizes are not equal on all processes in mpi::reduce_range");
+
+    // do nothing if no elements are reduced
+    if (size <= 0) return;
+
+    // check the size of the output range
+    bool const receives = (c.rank() == root || all);
+    if (receives) EXPECTS_WITH_MESSAGE(size == std::ranges::size(out_rg), "Input and output range sizes are not equal in mpi::reduce_range");
+
+    // call the MPI C library if the ranges are contiguous with MPI compatible value types
+    if constexpr (MPICompatibleRange<R1> && MPICompatibleRange<R2>) {
+      static_assert(std::same_as<std::remove_cvref_t<std::ranges::range_value_t<R1>>, std::remove_cvref_t<std::ranges::range_value_t<R2>>>,
+                    "Value types of input and output ranges not compatible in mpi::reduce_range");
+
+      // check if the reduction is in place
+      bool const in_place = (static_cast<void const *>(std::ranges::data(in_rg)) == static_cast<void const *>(std::ranges::data(out_rg)));
+      if (all) {
+        EXPECTS_WITH_MESSAGE(all_equal(static_cast<int>(in_place), c),
+                             "Either zero or all receiving processes have to choose the in place option in mpi::reduce_range");
+      }
+
+      // in case there is no active MPI environment or if the communicator size is < 2, copy to the output range
+      if (!has_env || c.size() < 2) {
+        std::ranges::copy(std::forward<R1>(in_rg), std::ranges::data(out_rg));
+        return;
+      }
+
+      // make the MPI C library call (allow the number of elements to be larger than INT_MAX)
+      constexpr long max_int = std::numeric_limits<int>::max();
+      for (long offset = 0; size > 0; offset += max_int, size -= max_int) {
+        auto in_data  = static_cast<void const *>(std::ranges::data(in_rg) + offset);
+        auto out_data = std::ranges::data(out_rg) + offset;
+        if (receives and in_place) in_data = MPI_IN_PLACE;
+        auto const count = static_cast<int>(std::min(size, max_int));
+        if (all) {
+          check_mpi_call(MPI_Allreduce(in_data, out_data, count, mpi_type<std::ranges::range_value_t<R1>>::get(), op, c.get()), "MPI_Allreduce");
+        } else {
+          check_mpi_call(MPI_Reduce(in_data, out_data, count, mpi_type<std::ranges::range_value_t<R1>>::get(), op, root, c.get()), "MPI_Reduce");
+        }
+      }
+    } else {
+      // fallback to element-wise reduction if the range is not contiguous with an MPI compatible value type
+      if (size <= std::ranges::size(out_rg)) {
+        // on ranks where the output range size is large enough, reduce into the output elements
+        for (auto &&[x_in, x_out] : itertools::zip(in_rg, out_rg)) reduce_into(x_in, x_out, c, root, all, op);
+      } else {
+        // on all other ranks, reduce into a dummy output object (needs to be default constructible)
+        using out_value_t = std::ranges::range_value_t<R2>;
+        if constexpr (std::is_default_constructible_v<out_value_t>) {
+          out_value_t out_dummy{};
+          for (auto &&x_in : in_rg) reduce_into(x_in, out_dummy, c, root, all, op);
+        } else {
+          // if it is not default constructible, is there something we can do?
+          throw std::runtime_error("Cannot default construct dummy object in mpi::reduce_range");
+        }
+      }
+    }
+  }
+
+  /**
+   * @brief Implementation of an MPI scatter for mpi::MPICompatibleRange objects.
+   *
+   * @details The behaviour of this function is as follows:
+   * - If the number of elements to be scattered is zero, it does nothing.
+   * - Otherwise, it calls `MPI_Scatterv` to scatter the input range from the root process to the output ranges on all
+   *   other processes.
+   *
+   * By default, the input range is scattered as evenly as possible from the root process to all other processes in the
+   * communicator. To change that, the user can specify a chunk size which is used to divide the number of elements to
+   * be scattered into chunks of the specified size. Then, instead of single elements, the chunks are distributed evenly
+   * across the processes in the communicator.
+   *
+   * It throws an exception if a call to the MPI C library fails and it expects
+   * - that the number of elements to be scattered is equal on all processes,
+   * - that the size of the input range on the root process is equal to the number of elements to be scattered and
+   * - that the output range size is equal to the number of elements to be received on all processes.
+   *
+   * @note In place scattering is not supported.
+   *
+   * @tparam R1 mpi::MPICompatibleRange type.
+   * @tparam R2 mpi::MPICompatibleRange type.
+   * @param in_rg Range to be scattered.
+   * @param out_rg Range to be scattered into.
+   * @param scatter_size Number of elements to be scattered.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   * @param chunk_size Size of the chunks to scatter.
+   */
+  template <MPICompatibleRange R1, MPICompatibleRange R2>
+    requires(std::same_as<std::remove_cvref_t<std::ranges::range_value_t<R1>>, std::remove_cvref_t<std::ranges::range_value_t<R2>>>)
+  void scatter_range(R1 &&in_rg, R2 &&out_rg, long scatter_size, communicator c = {}, int root = 0, // NOLINT (ranges need not be forwarded)
+                     long chunk_size = 1) {
+    // check the number of elements to be scattered
+    EXPECTS_WITH_MESSAGE(all_equal(scatter_size, c), "Number of elements to be scattered is not equal on all processes in mpi::scatter_range");
+
+    // do nothing if no elements are scattered
+    if (scatter_size == 0) return;
+
+    // check the size of the input range on root
+    if (c.rank() == root) {
+      EXPECTS_WITH_MESSAGE(scatter_size == std::ranges::size(in_rg),
+                           "Input range size on root is not equal to the number of elements to be scattered in mpi::scatter_range");
+    }
+
+    // check the size of the output range
+    auto const recvcount = static_cast<int>(chunk_length(scatter_size, c.size(), c.rank(), chunk_size));
+    EXPECTS_WITH_MESSAGE(recvcount == std::ranges::size(out_rg),
+                         "Output range size is not equal to the number of elements to be received in mpi::scatter_range");
+
+    // in case there is no active MPI environment or if the communicator size is < 2, copy to the output range
+    if (!has_env || c.size() < 2) {
+      std::ranges::copy(std::forward<R1>(in_rg), std::ranges::data(out_rg));
+      return;
+    }
+
+    // prepare arguments for the MPI call
+    auto sendcounts = std::vector<int>(c.size());
+    auto displs     = std::vector<int>(c.size() + 1, 0);
+    for (int i = 0; i < c.size(); ++i) {
+      sendcounts[i] = static_cast<int>(chunk_length(scatter_size, c.size(), i, chunk_size));
+      displs[i + 1] = sendcounts[i] + displs[i];
+    }
+
+    // make the MPI C library call
+    check_mpi_call(MPI_Scatterv(std::ranges::data(in_rg), sendcounts.data(), displs.data(), mpi_type<std::ranges::range_value_t<R1>>::get(),
+                                std::ranges::data(out_rg), recvcount, mpi_type<std::ranges::range_value_t<R2>>::get(), root, c.get()),
+                   "MPI_Scatterv");
+  }
+
+  /**
+   * @brief Implementation of an MPI gather for mpi::MPICompatibleRange objects.
+   *
+   * @details The behaviour of this function is as follows:
+   * - If the number of elements to be gathered is zero, it does nothing.
+   * - Otherwise, it calls `MPI_Gatherv` or `MPI_Allgatherv` to gather the elements from the input ranges on all
+   *   processes into the output ranges on receiving processes.
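+   *
+   * For illustration, a minimal usage sketch (an initialized MPI environment is assumed):
+   * @code{.cpp}
+   * mpi::communicator world;
+   * std::vector<int> in(2, world.rank()); // two copies of the local rank
+   * std::vector<int> out(world.rank() == 0 ? 2 * world.size() : 0);
+   * mpi::gather_range(in, out, world); // root ends up with {0, 0, 1, 1, ...}
+   * @endcode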
+   *
+   * This is the inverse operation of mpi::scatter_range. The numbers of elements to be gathered do not have to be
+   * equal on all processes.
+   *
+   * It throws an exception in case a call to the MPI C library fails and it expects that the output range sizes on
+   * receiving processes are equal to the number of elements to be gathered.
+   *
+   * @note In place gathering is not supported.
+   *
+   * @tparam R1 mpi::MPICompatibleRange type.
+   * @tparam R2 mpi::MPICompatibleRange type.
+   * @param in_rg Range to be gathered.
+   * @param out_rg Range to be gathered into.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the gather operation.
+   */
+  template <MPICompatibleRange R1, MPICompatibleRange R2>
+    requires(std::same_as<std::remove_cvref_t<std::ranges::range_value_t<R1>>, std::remove_cvref_t<std::ranges::range_value_t<R2>>>)
+  void gather_range(R1 &&in_rg, R2 &&out_rg, communicator c = {}, int root = 0, bool all = false) { // NOLINT (ranges need not be forwarded)
+    // get the receive counts (sendcount from each process) and the displacements
+    auto sendcount  = static_cast<int>(std::ranges::size(in_rg));
+    auto recvcounts = all_gather(sendcount, c);
+    auto displs     = std::vector<int>(c.size() + 1, 0);
+    std::partial_sum(recvcounts.begin(), recvcounts.end(), displs.begin() + 1);
+
+    // do nothing if there are no elements to gather
+    if (displs.back() == 0) return;
+
+    // check the size of the output range on receiving ranks
+    if (c.rank() == root || all) {
+      EXPECTS_WITH_MESSAGE(displs.back() == std::ranges::size(out_rg),
+                           "Output range size is not equal to the number of elements to be received in mpi::gather_range");
+    }
+
+    // in case there is no active MPI environment or if the communicator size is < 2, copy to the output range
+    if (!has_env || c.size() < 2) {
+      std::ranges::copy(std::forward<R1>(in_rg), std::ranges::data(out_rg));
+      return;
+    }
+
+    // make the MPI C library call
+    if (all) {
+      check_mpi_call(MPI_Allgatherv(std::ranges::data(in_rg), sendcount, mpi_type<std::ranges::range_value_t<R1>>::get(), std::ranges::data(out_rg),
+                                    recvcounts.data(), displs.data(), mpi_type<std::ranges::range_value_t<R2>>::get(), c.get()),
+                     "MPI_Allgatherv");
+    } else {
+      check_mpi_call(MPI_Gatherv(std::ranges::data(in_rg), sendcount, mpi_type<std::ranges::range_value_t<R1>>::get(), std::ranges::data(out_rg),
+                                 recvcounts.data(), displs.data(), mpi_type<std::ranges::range_value_t<R2>>::get(), root, c.get()),
+                     "MPI_Gatherv");
+    }
+  }
+
+  /** @} */
+
+} // namespace mpi
diff --git a/c++/mpi/string.hpp b/c++/mpi/string.hpp
index 904fc07c..d5ce188f 100644
--- a/c++/mpi/string.hpp
+++ b/c++/mpi/string.hpp
@@ -16,34 +16,60 @@
 /**
  * @file
- * @brief Provides an MPI broadcast for std::string.
+ * @brief Provides an MPI broadcast and gather for `std::string`.
  */

 #pragma once

-#include "./mpi.hpp"
-
-#include <mpi.h>
+#include "./communicator.hpp"
+#include "./generic_communication.hpp"
+#include "./ranges.hpp"

 #include <string>

 namespace mpi {

  /**
-  * @ingroup coll_comm
-  * @brief Implementation of an MPI broadcast for a std::string.
+  * @addtogroup coll_comm
+  * @{
+  */
+
+  /**
+  * @brief Implementation of an MPI broadcast for a `std::string`.
  *
-  * @details Simply calls `MPI_Bcast` for the underlying C-string.
+  * @details It first broadcasts the size of the string from the root process to all other processes, then resizes the
+  * string on all non-root processes and calls mpi::broadcast_range with the (resized) input string.
  *
-  * @param s std::string to broadcast.
+  * @param s `std::string` to broadcast (into).
  * @param c mpi::communicator.
  * @param root Rank of the root process.
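+  *
+  * A minimal usage sketch (an initialized MPI environment is assumed, and it is assumed that the generic
+  * mpi::broadcast dispatches to this overload):
+  * @code{.cpp}
+  * mpi::communicator world;
+  * std::string s = (world.rank() == 0 ? "hello" : "");
+  * mpi::broadcast(s, world, 0); // afterwards s == "hello" on every process
+  * @endcode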
  */
 inline void mpi_broadcast(std::string &s, communicator c, int root) {
-    size_t len = s.size();
-    broadcast(len, c, root);
-    if (c.rank() != root) s.resize(len);
-    if (len != 0) MPI_Bcast((void *)s.c_str(), static_cast<int>(s.size()), mpi_type<char>::get(), root, c.get());
+    auto count = s.size();
+    broadcast(count, c, root);
+    if (c.rank() != root) s.resize(count);
+    broadcast_range(s, c, root);
  }

+  /**
+   * @brief Implementation of an MPI gather for a `std::string` that gathers directly into an existing output string.
+   *
+   * @details It first all-reduces the sizes of the input strings from all processes. On receiving ranks, the output
+   * string is resized to the reduced size in case it does not have the correct size. On non-receiving ranks, the
+   * output string is always unmodified. Then mpi::gather_range is called with the input and (resized) output strings.
+   *
+   * @param s_in `std::string` to gather.
+   * @param s_out `std::string` to gather into.
+   * @param c mpi::communicator.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result.
+   */
+  inline void mpi_gather_into(std::string const &s_in, std::string &s_out, communicator c = {}, int root = 0, bool all = false) {
+    auto const gather_size = mpi::all_reduce(s_in.size(), c);
+    if ((c.rank() == root || all) && s_out.size() != gather_size) s_out.resize(gather_size);
+    gather_range(s_in, s_out, c, root, all);
+  }
+
+  /** @} */
+
 } // namespace mpi
diff --git a/c++/mpi/utils.hpp b/c++/mpi/utils.hpp
new file mode 100644
index 00000000..597514e5
--- /dev/null
+++ b/c++/mpi/utils.hpp
@@ -0,0 +1,57 @@
+// Copyright (c) 2024 Simons Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Alexander Hampel, Olivier Parcollet, Nils Wentzell

+/**
+ * @file
+ * @brief Provides general utilities related to MPI.
+ */
+
+#pragma once
+
+#include <mpi.h>
+
+#include <stdexcept>
+#include <string>
+
+namespace mpi {
+
+  /**
+   * @addtogroup utilities
+   * @{
+   */
+
+  /**
+   * @brief Check the success of an MPI call.
+   * @details It checks if the given error code returned by an MPI routine is equal to `MPI_SUCCESS`. If it isn't, it
+   * throws an exception.
+   *
+   * It is intended to simply wrap any calls to the MPI C library:
+   * @code{.cpp}
+   * int value = 5;
+   * int result = 0;
+   * check_mpi_call(MPI_Allreduce(&value, &result, 1, mpi::mpi_type<int>::get(), MPI_MAX, comm.get()), "MPI_Allreduce");
+   * @endcode
+   *
+   * @param errcode Error code returned by an MPI routine.
+   * @param mpi_routine Name of the MPI routine used in the error message.
+ */ + inline void check_mpi_call(int errcode, const std::string &mpi_routine) { + if (errcode != MPI_SUCCESS) throw std::runtime_error("MPI error " + std::to_string(errcode) + " in MPI routine " + mpi_routine); + } + + /** @} */ + +} // namespace mpi diff --git a/c++/mpi/vector.hpp b/c++/mpi/vector.hpp index b00a17ef..91de1a29 100644 --- a/c++/mpi/vector.hpp +++ b/c++/mpi/vector.hpp @@ -16,21 +16,20 @@ /** * @file - * @brief Provides an MPI broadcast, reduce, scatter and gather for std::vector. + * @brief Provides an MPI broadcast, reduce, scatter and gather for `std::vector`. */ #pragma once -#include "./mpi.hpp" +#include "./communicator.hpp" +#include "./generic_communication.hpp" +#include "./ranges.hpp" +#include "./utils.hpp" #include -#include -#include -#include -#include -#include #include +#include #include namespace mpi { @@ -41,213 +40,108 @@ namespace mpi { */ /** - * @brief Implementation of an MPI broadcast for a std::vector. + * @brief Implementation of an MPI broadcast for a `std::vector`. * - * @details If mpi::has_mpi_type is true then the vector is broadcasted using a simple `MPI_Bcast`. Otherwise, - * the generic mpi::broadcast is called for each element of the vector. + * @details It first broadcasts the size of the vector from the root process to all other processes, then resizes the + * vector on all non-root processes and calls mpi::broadcast_range with the (resized) input vector. * * @tparam T Value type of the vector. - * @param v std::vector to broadcast. + * @param v `std::vector` to broadcast. * @param c mpi::communicator. * @param root Rank of the root process. */ template void mpi_broadcast(std::vector &v, communicator c = {}, int root = 0) { - auto s = v.size(); - broadcast(s, c, root); - if (c.rank() != root) v.resize(s); - if constexpr (has_mpi_type) { - if (s != 0) MPI_Bcast(v.data(), v.size(), mpi_type::get(), root, c.get()); - } else { - for (auto &x : v) broadcast(x, c, root); - } + auto count = v.size(); + broadcast(count, c, root); + if (c.rank() != root) v.resize(count); + broadcast_range(v, c, root); } /** - * @brief Implementation of an in-place MPI reduce for a std::vector. + * @brief Implementation of an MPI reduce for a `std::vector`. * - * @details If mpi::has_mpi_type is true then the vector is reduced using a simple `MPI_Reduce` or `MPI_Allreduce`. - * Otherwise, the specialized `mpi_reduce_in_place` is called for each element of the vector. + * @details It first constructs the output vector with its value type equal to the return type of + * `reduce(std::declval())`. On receiving ranks, the output vector is then resized to the size of the input vector. + * On non-receiving ranks, the output vector is always empty. + * + * It calls mpi::reduce_range with the input and constructed output vector. * * @tparam T Value type of the vector. - * @param v std::vector to reduce. + * @param v `std::vector` to reduce. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. + * @return `std::vector` containing the result of the reduction. */ - template void mpi_reduce_in_place(std::vector &v, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - if (v.size() == 0) return; - if constexpr (has_mpi_type) { - if (!all) - MPI_Reduce((c.rank() == root ? 
MPI_IN_PLACE : v.data()), v.data(), v.size(), mpi_type::get(), op, root, c.get()); - else - MPI_Allreduce(MPI_IN_PLACE, v.data(), v.size(), mpi_type::get(), op, c.get()); - } else { - for (auto &x : v) mpi_reduce_in_place(v, c, root, all); - } + template auto mpi_reduce(std::vector const &v, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { + using value_type = std::remove_cvref_t()))>; + std::vector res(c.rank() == root || all ? v.size() : 0); + reduce_range(v, res, c, root, all, op); + return res; } - namespace detail { - - // Helper struct to get the regular type of a type. - template struct _regular { - using type = T; - }; - - // Spezialization of _regular for types with a `regular_type` type alias. - template struct _regular> { - using type = typename T::regular_type; - }; - - } // namespace detail - /** - * @ingroup utilities - * @brief Type trait to get the regular type of a type. - * @tparam T Type to check. - */ - template using regular_t = typename detail::_regular>::type; - - /** - * @brief Implementation of an MPI reduce for a std::vector. + * @brief Implementation of an MPI reduce for a `std::vector` that reduces directly into a given output vector. * - * @details If mpi::has_mpi_type is true then the vector is reduced using a simple `MPI_Reduce` or `MPI_Allreduce` - * (in this case, mpi::regular_t has to be the same as T). Otherwise, the generic mpi::reduce is called for each - * element of the vector. + * @details It first resizes the output vector to the size of the input vector on receiving ranks and then calls + * mpi::reduce_range with the input and (resized) output vector. * - * @tparam T Value type of the vector. - * @param v std::vector to reduce. + * @tparam T1 Value type of the vector to be reduced. + * @tparam T2 Value type of the vector to be reduced into. + * @param v_in `std::vector` to reduce. + * @param v_out `std::vector` to reduce into. * @param c mpi::communicator. * @param root Rank of the root process. * @param all Should all processes receive the result of the reduction. * @param op `MPI_Op` used in the reduction. - * @return std::vector containing the result of each individual reduction. */ - template - std::vector> mpi_reduce(std::vector const &v, communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { - auto s = v.size(); - - // check if all vectors are of the same size, otherwise abort - if (all) { - auto max_size = mpi_reduce(s, c, root, all, MPI_MAX); - if (s != max_size) { - std::cerr << "Cannot all_reduce vectors of different sizes\n"; - std::abort(); - } - } - - // return an empty vector if size is 0 - if (s == 0) return {}; - - // perform the reduction for every element of the vector - if constexpr (has_mpi_type) { - static_assert(std::is_same_v, T>, "Internal error"); - std::vector res(s); - if (!all) - MPI_Reduce((void *)v.data(), res.data(), s, mpi_type::get(), op, root, c.get()); - else - MPI_Allreduce((void *)v.data(), res.data(), s, mpi_type::get(), op, c.get()); - return res; - } else { - std::vector> r; - r.reserve(s); - for (size_t i = 0; i < s; ++i) r.push_back(reduce(v[i], c, root, all, op)); - return r; - } + template + void mpi_reduce_into(std::vector const &v_in, std::vector &v_out, communicator c = {}, int root = 0, bool all = false, + MPI_Op op = MPI_SUM) { + if ((c.rank() == root || all) && v_out.size() != v_in.size()) v_out.resize(v_in.size()); + reduce_range(v_in, v_out, c, root, all, op); } /** - * @brief Implementation of an MPI scatter for a std::vector. 
+  * @brief Implementation of an MPI scatter for a `std::vector` that scatters directly into an existing output vector.
   *
-  * @details If mpi::has_mpi_type is true then the vector is scattered as evenly as possible across the processes
-  * in the communicator using a simple `MPI_Scatterv`.
+  * @details It first broadcasts the size of the input vector from the root process to all other processes and
+  * resizes the output vector if it does not have the correct size. The size of the output vector is determined with
+  * mpi::chunk_length. Then mpi::scatter_range is called with the input and (resized) output vector.
   *
   * @tparam T Value type of the vector.
-  * @param v std::vector to scatter.
+  * @param v_in `std::vector` to scatter.
+  * @param v_out `std::vector` to scatter into.
   * @param c mpi::communicator.
   * @param root Rank of the root process.
-  * @return std::vector containing the result of the scatter operation.
   */
- template <typename T> std::vector<T> mpi_scatter(std::vector<T> const &v, communicator c = {}, int root = 0) {
-    auto s = v.size();
-
-    // return an empty vector if size is 0
-    if (s == 0) return {};
-
-    // arguments for the MPI call
-    auto sendcounts = std::vector<int>(c.size());          // number of elements sent to each process
-    auto displs     = std::vector<int>(c.size() + 1, 0);   // displacements given in number of elements not in bytes
-    int recvcount   = chunk_length(s, c.size(), c.rank()); // number of elements received by the calling process
-    for (int r = 0; r < c.size(); ++r) {
-      sendcounts[r] = chunk_length(s, c.size(), r);
-      displs[r + 1] = sendcounts[r] + displs[r];
-    }
-
-    // do the scattering
-    std::vector<T> res(recvcount);
-    if constexpr (has_mpi_type<T>) {
-      MPI_Scatterv((void *)v.data(), &sendcounts[0], &displs[0], mpi_type<T>::get(), (void *)res.data(), recvcount, mpi_type<T>::get(), root,
-                   c.get());
-    } else {
-      std::copy(cbegin(v) + displs[c.rank()], cbegin(v) + displs[c.rank() + 1], begin(res));
-    }
-
-    return res;
+ template <typename T> void mpi_scatter_into(std::vector<T> const &v_in, std::vector<T> &v_out, communicator c = {}, int root = 0) {
+    auto scatter_size = static_cast<long>(v_in.size());
+    broadcast(scatter_size, c, root);
+    auto const recvcount = chunk_length(scatter_size, c.size(), c.rank());
+    if (v_out.size() != recvcount) v_out.resize(recvcount);
+    scatter_range(v_in, v_out, scatter_size, c, root);
  }

  /**
-  * @brief Implementation of an MPI gather for a std::vector.
+  * @brief Implementation of an MPI gather for a `std::vector` that gathers directly into an existing output vector.
   *
-  * @details If mpi::has_mpi_type is true then the vector is gathered using a simple `MPI_Gatherv` or `MPI_Allgatherv`.
-  * Otherwise, each process broadcasts its elements to all other processes which implies that `all == true` is required
-  * in this case.
+  * @details It first all-reduces the sizes of the input vectors from all processes. On receiving ranks, the output
+  * vector is resized to the reduced size in case it does not have the correct size. On non-receiving ranks, the output
+  * vector is always unmodified. Then mpi::gather_range is called with the input and (resized) output vector.
   *
   * @tparam T Value type of the vector.
-  * @param v std::vector to gather.
+  * @param v_in `std::vector` to gather.
+  * @param v_out `std::vector` to gather into.
   * @param c mpi::communicator.
   * @param root Rank of the root process.
-  * @param all Should all processes receive the result of the reduction.
-  * @return std::vector containing the result of the gather operation.
+  * @param all Should all processes receive the result.
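+  *
+  * A minimal usage sketch of the generic mpi::gather_into (the vector contents are only illustrative):
+  * @code{.cpp}
+  * std::vector<int> in(2, world.rank()), out;
+  * mpi::gather_into(in, out, world); // on root, out = {0, 0, 1, 1, ...}
+  * @endcode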
   */
- template <typename T> std::vector<T> mpi_gather(std::vector<T> const &v, communicator c = {}, int root = 0, bool all = false) {
-    long s = mpi_reduce(v.size(), c, root, all);
-
-    // return an empty vector if size is 0
-    if (s == 0) return {};
-
-    // arguments for the MPI call
-    auto mpi_ty     = mpi_type<int>::get();
-    auto recvcounts = std::vector<int>(c.size());        // number of elements received from each process
-    auto displs     = std::vector<int>(c.size() + 1, 0); // displacements given in number of elements not in bytes
-    int sendcount   = v.size();                          // number of elements sent by the calling process
-    if (!all)
-      MPI_Gather(&sendcount, 1, mpi_ty, &recvcounts[0], 1, mpi_ty, root, c.get());
-    else
-      MPI_Allgather(&sendcount, 1, mpi_ty, &recvcounts[0], 1, mpi_ty, c.get());
-
-    for (int r = 0; r < c.size(); ++r) displs[r + 1] = recvcounts[r] + displs[r];
-
-    // do the gathering
-    std::vector<T> res((all || (c.rank() == root) ? s : 0));
-    if constexpr (has_mpi_type<T>) {
-      if (!all)
-        MPI_Gatherv((void *)v.data(), sendcount, mpi_type<T>::get(), (void *)res.data(), &recvcounts[0], &displs[0], mpi_type<T>::get(), root,
-                    c.get());
-      else
-        MPI_Allgatherv((void *)v.data(), sendcount, mpi_type<T>::get(), (void *)res.data(), &recvcounts[0], &displs[0], mpi_type<T>::get(), c.get());
-    } else {
-      if (!all)
-        throw std::runtime_error{"mpi_gather for custom types only implemented with 'all = true'\n"};
-      else {
-        for (int r = 0; r < c.size(); ++r) {
-          for (auto i = displs[r]; i < displs[r + 1]; ++i) {
-            if (c.rank() == r) res[i] = v[i - displs[r]];
-            mpi::broadcast(res[i], c, r);
-          }
-        }
-      }
-    }
-    return res;
+ template <typename T> void mpi_gather_into(std::vector<T> const &v_in, std::vector<T> &v_out, communicator c = {}, int root = 0, bool all = false) {
+    auto const gather_size = mpi::all_reduce(v_in.size(), c);
+    if ((c.rank() == root || all) && v_out.size() != gather_size) v_out.resize(gather_size);
+    gather_range(v_in, v_out, c, root, all);
  }

  /** @} */
diff --git a/c++/mpi/window.hpp b/c++/mpi/window.hpp
new file mode 100644
index 00000000..9d6325d4
--- /dev/null
+++ b/c++/mpi/window.hpp
@@ -0,0 +1,454 @@
+// Copyright (c) 2024 Simons Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Alexander Hampel, Olivier Parcollet, Nils Wentzell
+
+/**
+ * @file
+ * @brief Provides a C++ wrapper class for an `MPI_Win` object.
+ */
+
+#pragma once
+
+#include "./communicator.hpp"
+#include "./datatypes.hpp"
+#include "./group.hpp"
+#include "./macros.hpp"
+#include "./utils.hpp"
+
+#include <mpi.h>
+
+#include <algorithm>
+#include <memory>
+#include <tuple>
+#include <utility>
+
+namespace mpi {
+
+  /**
+   * @addtogroup mpi_osc_shm
+   * @{
+   */
+
+  /**
+   * @brief A C++ wrapper around `MPI_Win` providing convenient memory window management.
+   *
+   * @details This class abstracts the complexities of MPI window management, allowing processes in an MPI communicator
+   * to create and share memory regions efficiently. It supports both local buffer-based windows and dynamically
+   * allocated memory windows.
+   *
+   * If a base pointer is not specified, the constructor will allocate memory internally.
+   *
+   * This class follows move-only semantics and takes ownership of the wrapped `MPI_Win` object.
+   *
+   * @tparam BaseType The type of elements stored in the memory window.
+   */
+  template <typename BaseType> class window {
+    public:
+    /// Type of the base pointer.
+    using base_type = BaseType;
+
+    /// Construct a window with `MPI_WIN_NULL`.
+    window() = default;
+
+    /// Deleted copy constructor.
+    window(window const &) = delete;
+
+    /// Deleted copy assignment operator.
+    window &operator=(window const &) = delete;
+
+    /// Move constructor takes ownership of the moved-from MPI window and leaves it with `MPI_WIN_NULL`.
+    window(window &&other) noexcept
+       : win_{std::exchange(other.win_, MPI_WIN_NULL)},
+         comm_{std::exchange(other.comm_, communicator{MPI_COMM_NULL})},
+         owned_{std::exchange(other.owned_, false)},
+         data_{std::exchange(other.data_, nullptr)},
+         size_{std::exchange(other.size_, 0)} {}
+
+    /// Move assignment operator takes ownership of the moved-from MPI window and leaves it with `MPI_WIN_NULL`.
+    window &operator=(window &&rhs) noexcept {
+      if (this != std::addressof(rhs)) {
+        free();
+        win_   = std::exchange(rhs.win_, MPI_WIN_NULL);
+        comm_  = std::exchange(rhs.comm_, communicator{MPI_COMM_NULL});
+        owned_ = std::exchange(rhs.owned_, false);
+        data_  = std::exchange(rhs.data_, nullptr);
+        size_  = std::exchange(rhs.size_, 0);
+      }
+      return *this;
+    }
+
+    /**
+     * @brief Construct an MPI window over an existing local memory buffer.
+     *
+     * @details This constructor allows creating a window using a pre-allocated memory buffer by calling
+     * `MPI_Win_create`. The window provides access to the specified memory region across MPI processes within the
+     * given communicator. The buffer is not freed upon destruction.
+     *
+     * @param c mpi::communicator that defines the group of processes sharing the window.
+     * @param base_ptr Pointer to the base address of the memory buffer.
+     * @param sz Number of elements in the buffer.
+     * @param info Additional MPI information. Default is `MPI_INFO_NULL`.
+     */
+    explicit window(communicator const &c, BaseType *base_ptr, MPI_Aint sz, MPI_Info info = MPI_INFO_NULL)
+       : comm_(c.get()), data_(base_ptr), size_(sz) {
+      ASSERT(size_ >= 0)
+      ASSERT(!(data_ == nullptr && size_ > 0))
+      if (has_env) check_mpi_call(MPI_Win_create(data_, size_ * sizeof(BaseType), sizeof(BaseType), info, c.get(), &win_), "MPI_Win_create");
+    }
+
+    /**
+     * @brief Construct an MPI window with dynamically allocated memory.
+     *
+     * @details This constructor allocates a new memory buffer locally and creates an MPI window over it by calling
+     * `MPI_Win_allocate`. The allocated memory is automatically freed when the window is destroyed. This is useful when
+     * the memory region is meant to be shared across processes without needing an external buffer.
+     *
+     * @param c mpi::communicator that defines the group of processes sharing the window.
+     * @param sz Number of elements to allocate for the calling process.
+     * @param info Additional MPI information. Default is `MPI_INFO_NULL`.
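+     *
+     * A minimal usage sketch (assuming an initialized MPI environment):
+     * @code{.cpp}
+     * mpi::communicator world;
+     * mpi::window<int> win(world, 100); // each process allocates room for 100 ints
+     * @endcode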
+ */ + explicit window(communicator const &c, MPI_Aint sz, MPI_Info info = MPI_INFO_NULL) : comm_(c.get()), size_(sz) { + ASSERT(size_ >= 0) + if (has_env) { + check_mpi_call(MPI_Win_allocate(size_ * sizeof(BaseType), sizeof(BaseType), info, c.get(), &data_, &win_), "MPI_Win_allocate"); + } else { + data_ = new BaseType[size_]; // NOLINT (new is fine here) + } + owned_ = true; + } + + /// Convert the window to the wrapped `MPI_Win` object. + explicit operator MPI_Win() const { return win_; }; + + /// Convert a pointer to the window to a pointer to the wrapped `MPI_Win` object. + explicit operator MPI_Win *() { return &win_; }; + + /// Destructor calls free() to release the window. + virtual ~window() { free(); } + + /** + * @brief Release allocated resources owned by the window. + * + * @details Before freeing the owned memory or the `MPI_Win` handle, a window must have completed all its + * involvement in RMA communications. For that reason we call `MPI_Win_fence` before `MPI_Win_free`. + * + * The window also must be unlocked if it has been previously locked. However, this cannot be detected and is + * therefore the responsibility of the user. + * + * If the window owns an allocated memory buffer, it will be automatically freed. Otherwise, only the MPI window + * handle is released. + */ + void free() noexcept { + if (has_env) { + if (win_ != MPI_WIN_NULL) { + MPI_Win_fence(0, win_); + MPI_Win_free(&win_); + } + } else if (owned_) { + delete[] data_; + } + owned_ = false; + data_ = nullptr; + size_ = 0; + } + + /** + * @brief Synchronize all RMA operations within an access epoch by calling `MPI_Win_fence`. + * + * @details This function acts as a barrier for remote memory access (RMA) operations, ensuring all previous + * operations on the window are completed before continuing. The call is collective on the group of the window. + * + * @param assert Program assertion. + */ + void fence(int assert = 0) const { + if (has_env) check_mpi_call(MPI_Win_fence(assert, win_), "MPI_Win_fence"); + } + + /** + * @brief Ensure completion of all outstanding RMA operations. + * + * @details If the given target rank is \f$ < 0 \f$, it calls `MPI_Win_flush_all`. Otherwise, it calls + * `MPI_Win_flush`. + * + * @param rank Target rank. + */ + void flush(int rank = -1) const { + if (has_env) { + if (rank < 0) { + check_mpi_call(MPI_Win_flush_all(win_), "MPI_Win_flush_all"); + } else { + check_mpi_call(MPI_Win_flush(rank, win_), "MPI_Win_flush"); + } + } + } + + /** + * @brief Synchronize the public and private copies of the window. + * + * @details It ensures that any updates to the local memory are visible in the public window and vice versa by + * calling `MPI_Win_sync`. + */ + void sync() const { + if (has_env) check_mpi_call(MPI_Win_sync(win_), "MPI_Win_sync"); + } + + /** + * @brief Start an RMA access epoch. + * + * @details It locks access to the memory window on a specific rank or all ranks, preventing concurrent + * modifications. + * + * If the given target rank is \f$ < 0 \f$, it calls `MPI_Win_lock_all`. Otherwise, it calls `MPI_Win_lock`. + * + * @param rank Target rank. + * @param lock_type Type of the lock (e.g. `MPI_LOCK_SHARED` or `MPI_LOCK_EXCLUSIVE`). + * @param assert An assertion flag providing optimization hints to MPI. 
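+     *
+     * A passive-target sketch (the RMA call in between is only a placeholder):
+     * @code{.cpp}
+     * win.lock(1, MPI_LOCK_SHARED); // lock the window on rank 1
+     * // ... RMA calls such as win.get(buf.data(), n, 1) ...
+     * win.unlock(1);
+     * @endcode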
+     */
+    void lock(int rank = -1, int lock_type = MPI_LOCK_SHARED, int assert = 0) const {
+      if (has_env) {
+        if (rank < 0) {
+          check_mpi_call(MPI_Win_lock_all(assert, win_), "MPI_Win_lock_all");
+        } else {
+          check_mpi_call(MPI_Win_lock(lock_type, rank, assert, win_), "MPI_Win_lock");
+        }
+      }
+    }
+
+    /**
+     * @brief Complete an RMA access epoch started by lock().
+     *
+     * @details It unlocks access to the memory window on a specific rank or all ranks, allowing other processes to
+     * access or modify the window.
+     *
+     * If the given target rank is \f$ < 0 \f$, it calls `MPI_Win_unlock_all`. Otherwise, it calls `MPI_Win_unlock`.
+     *
+     * @param rank Target rank.
+     */
+    void unlock(int rank = -1) const {
+      if (has_env) {
+        if (rank < 0) {
+          check_mpi_call(MPI_Win_unlock_all(win_), "MPI_Win_unlock_all");
+        } else {
+          check_mpi_call(MPI_Win_unlock(rank, win_), "MPI_Win_unlock");
+        }
+      }
+    }
+
+    /**
+     * @brief Start an RMA access epoch by calling `MPI_Win_start` (see also complete()).
+     *
+     * @param grp mpi::group of target processes.
+     * @param assert An assertion flag providing optimization hints to MPI.
+     */
+    void start(group const &grp, int assert = 0) const {
+      if (has_env) check_mpi_call(MPI_Win_start(grp.get(), assert, win_), "MPI_Win_start");
+    }
+
+    /// Completes an RMA access epoch by calling `MPI_Win_complete` (see also start()).
+    void complete() const {
+      if (has_env) check_mpi_call(MPI_Win_complete(win_), "MPI_Win_complete");
+    }
+
+    /**
+     * @brief Start an RMA exposure epoch by calling `MPI_Win_post` (see also wait()).
+     *
+     * @param grp mpi::group of origin processes.
+     * @param assert An assertion flag providing optimization hints to MPI.
+     */
+    void post(group const &grp, int assert = 0) const {
+      if (has_env) check_mpi_call(MPI_Win_post(grp.get(), assert, win_), "MPI_Win_post");
+    }
+
+    /// Completes an RMA exposure epoch by calling `MPI_Win_wait` (see also post()).
+    void wait() const {
+      if (has_env) check_mpi_call(MPI_Win_wait(win_), "MPI_Win_wait");
+    }
+
+    /**
+     * @brief Read data from a remote memory window.
+     *
+     * @details This function retrieves data from the memory window on the given process by calling `MPI_Get` and stores
+     * it in a local buffer.
+     *
+     * @tparam TargetType Value type of the target memory.
+     * @tparam OriginType Value type of the origin memory.
+     * @param origin_addr Pointer to the memory buffer where the data will be stored.
+     * @param origin_count Number of elements to retrieve.
+     * @param target_rank Rank of the target process from which data is fetched.
+     * @param target_disp Displacement from the start of the target memory window.
+     * @param target_count Number of elements to read from the target. If negative or not specified, defaults to
+     * `origin_count`.
+     */
+    template <typename TargetType, typename OriginType>
+      requires(has_mpi_type<TargetType> && has_mpi_type<OriginType>)
+    void get(OriginType *origin_addr, int origin_count, int target_rank, MPI_Aint target_disp = 0, int target_count = -1) const {
+      ASSERT(origin_count >= 0 && target_disp >= 0);
+      target_count = target_count < 0 ? origin_count : target_count;
+      if (has_env) {
+        auto origin_datatype = mpi_type<OriginType>::get();
+        auto target_datatype = mpi_type<TargetType>::get();
+        check_mpi_call(MPI_Get(origin_addr, origin_count, origin_datatype, target_rank, target_disp, target_count, target_datatype, win_), "MPI_Get");
+      } else {
+        ASSERT(target_rank == 0); // there are no other ranks
+        std::copy(data_, data_ + target_count, origin_addr);
+      }
+    }
+
+    /**
+     * @brief Write data to a remote memory window.
+     *
+     * @details This function transfers data from a local buffer to the memory window on the given process by calling
+     * `MPI_Put`.
+     *
+     * @tparam TargetType Value type at the target memory.
+     * @tparam OriginType Value type at the origin memory.
+     * @param origin_addr Pointer to the local memory buffer containing the data to be sent.
+     * @param origin_count Number of elements to transfer.
+     * @param target_rank Rank of the target process to which data is written.
+     * @param target_disp Displacement from the start of the target memory window.
+     * @param target_count Number of elements to write to the target. If negative or not specified, defaults to
+     * `origin_count`.
+     */
+    template <typename TargetType, typename OriginType>
+      requires(has_mpi_type<TargetType> && has_mpi_type<OriginType>)
+    void put(OriginType *origin_addr, int origin_count, int target_rank, MPI_Aint target_disp = 0, int target_count = -1) const {
+      ASSERT(origin_count >= 0 && target_disp >= 0);
+      target_count = target_count < 0 ? origin_count : target_count;
+      if (has_env) {
+        auto origin_datatype = mpi_type<OriginType>::get();
+        auto target_datatype = mpi_type<TargetType>::get();
+        check_mpi_call(MPI_Put(origin_addr, origin_count, origin_datatype, target_rank, target_disp, target_count, target_datatype, win_), "MPI_Put");
+      } else {
+        ASSERT(target_rank == 0); // there are no other ranks
+        std::copy(origin_addr, origin_addr + origin_count, data_);
+      }
+    }
+
+    /// Get a pointer to the beginning of the window memory.
+    [[nodiscard]] BaseType *base() const { return data_; }
+
+    /// Get the size of the window in number of elements.
+    [[nodiscard]] MPI_Aint size() const { return size_; }
+
+    /// Get the displacement unit in bytes.
+    [[nodiscard]] int disp_unit() const { return sizeof(BaseType); }
+
+    /// Get the mpi::communicator associated with the window.
+    [[nodiscard]] communicator get_communicator() const { return comm_; }
+
+    protected:
+    MPI_Win win_{MPI_WIN_NULL};
+    communicator comm_{MPI_COMM_NULL};
+    bool owned_{false};
+    BaseType *data_{nullptr};
+    MPI_Aint size_{0};
+  };
+
+  /**
+   * @brief A C++ wrapper around `MPI_Win` representing a shared memory window.
+   *
+   * @details This class provides an interface for creating and managing an MPI shared memory window.
+   *
+   * @tparam BaseType The type of elements stored in the shared memory window.
+   */
+  template <typename BaseType> class shared_window : public window<BaseType> {
+    public:
+    /// Construct a shared memory window with `MPI_WIN_NULL`.
+    shared_window() = default;
+
+    /**
+     * @brief Construct a shared memory window by dynamically allocating memory.
+     *
+     * @details This constructor allocates a shared memory window within the given communicator by calling
+     * `MPI_Win_allocate_shared`. The allocated memory is automatically freed when the window is destroyed.
+     *
+     * @param c mpi::shared_communicator object.
+     * @param sz Number of elements to allocate for the calling process.
+     * @param info Additional MPI information.
+     */
+    explicit shared_window(shared_communicator const &c, MPI_Aint sz, MPI_Info info = MPI_INFO_NULL) {
+      ASSERT(sz >= 0)
+      comm_ = c.get();
+      size_ = sz;
+      if (has_env) {
+        check_mpi_call(MPI_Win_allocate_shared(size_ * sizeof(BaseType), sizeof(BaseType), info, c.get(), &data_, &win_), "MPI_Win_allocate_shared");
+      } else {
+        data_ = new BaseType[size_]; // NOLINT (new is fine here)
+      }
+      owned_ = true;
+    }
+
+    /**
+     * @brief Query attributes of a shared memory window.
+     *
+     * @details Retrieves the byte-size, displacement unit, and a pointer to the beginning of the shared memory region
+     * for a specific rank.
+     *
+     * When `MPI_PROC_NULL` is passed for the rank, MPI returns information about the memory segment with the lowest
+     * rank that has a non-zero size.
+     *
+     * @param rank Rank within the shared communicator.
+     * @return A tuple containing the byte-size, the displacement unit in bytes and the base pointer.
+     */
+    [[nodiscard]] std::tuple<MPI_Aint, int, void *> query(int rank = MPI_PROC_NULL) const {
+      if (has_env) {
+        MPI_Aint sz   = 0;
+        int du        = 0;
+        void *baseptr = nullptr;
+        check_mpi_call(MPI_Win_shared_query(win_, rank, &sz, &du, &baseptr), "MPI_Win_shared_query");
+        return {sz, du, baseptr};
+      } else {
+        return {size_, sizeof(BaseType), data_};
+      }
+    }
+
+    /**
+     * @brief Get a pointer to the beginning of the shared memory region of a specific rank.
+     *
+     * @param rank Rank within the shared communicator.
+     * @return Pointer to the shared window of the given rank.
+     */
+    [[nodiscard]] BaseType *base(int rank = MPI_PROC_NULL) const { return static_cast<BaseType *>(std::get<2>(query(rank))); }
+
+    /**
+     * @brief Get the size of the shared memory region of a specific rank.
+     *
+     * @param rank Rank within the shared communicator.
+     * @return Number of elements in the shared window of the given rank.
+     */
+    [[nodiscard]] MPI_Aint size(int rank = MPI_PROC_NULL) const { return std::get<0>(query(rank)) / sizeof(BaseType); }
+
+    /**
+     * @brief Get the displacement unit of the shared memory region of a specific rank.
+     *
+     * @param rank Rank within the shared communicator.
+     * @return Displacement unit in bytes.
+     */
+    [[nodiscard]] int disp_unit(int rank = MPI_PROC_NULL) const { return std::get<1>(query(rank)); }
+
+    /// Get the mpi::shared_communicator associated with the window.
+    [[nodiscard]] shared_communicator get_communicator() const { return comm_.get(); }
+
+    private:
+    using window<BaseType>::win_;
+    using window<BaseType>::comm_;
+    using window<BaseType>::owned_;
+    using window<BaseType>::data_;
+    using window<BaseType>::size_;
+  };
+
+  /** @} */
+
+} // namespace mpi
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index d08105f3..01ee10df 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -249,7 +249,7 @@ PYTHON_DOCSTRING       = YES
 # documentation from any documented member that it re-implements.
 # The default value is: YES.

-INHERIT_DOCS           = YES
+INHERIT_DOCS           = NO

 # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
 # page for each member. If set to NO, the documentation of a member will be part
diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml
index 240145da..16179c30 100644
--- a/doc/DoxygenLayout.xml
+++ b/doc/DoxygenLayout.xml
@@ -22,75 +22,46 @@
[XML content lost in extraction; this hunk condenses the navigation layout entries]
@@ -316,4 +287,4 @@
[XML content lost in extraction; this hunk only adds a missing newline at the end of the file]
diff --git a/doc/_static/CCQ-dark.png b/doc/_static/CCQ-dark.png
new file mode 100644
index 00000000..fbaef522
Binary files /dev/null and b/doc/_static/CCQ-dark.png differ
diff --git a/doc/_static/CCQ.png b/doc/_static/CCQ.png
new file mode 100644
index 00000000..e13f9083
Binary files /dev/null and b/doc/_static/CCQ.png differ
diff --git a/doc/documentation.md b/doc/documentation.md
index 2722befd..9f464e88 100644
--- a/doc/documentation.md
+++ b/doc/documentation.md
@@ -4,8 +4,8 @@

 **mpi** implements various high-level C++ wrappers around their low-level C counterparts. It is not intended
 as a full replacement for the C implementation.
-Instead it tries to help the user with the most common tasks like initializing and finalizing
-an @ref mpi::environment "MPI environment" or sending data via @ref coll_comm "collective communications".
+Instead it tries to help the user with the most common tasks like initializing and finalizing an @ref mpi::environment
+"MPI environment" or sending data via @ref coll_comm "collective communications".

 The following provides a detailed reference documentation grouped into logical units.

@@ -16,34 +16,32 @@ If you are looking for a specific function, class, etc., try using the search ba

 @ref mpi_essentials provide the user with two classes necessary for any MPI program:

 * The mpi::environment class is used to initialize and finalize the MPI execution environment.
-  It calls `MPI_Init` in its constructor and `MPI_Finalize` in its destructor.
-  There should be at most one instance in every program and it is usually created at the very beginning of the `main` function.
+  It calls `MPI_Init` in its constructor and `MPI_Finalize` in its destructor.
+  There should be at most one instance in every program and it is usually created at the very beginning of the `main`
+  function.

 * The mpi::communicator class is a simple wrapper around an `MPI_Comm` object.
-  Besides storing the `MPI_Comm` object, it also provides some convient functions for getting the size of the communicator,
-  the rank of the current process or for splitting an existing communicator.
+  Besides storing the `MPI_Comm` object, it also provides some convenient functions for getting the size of the
+  communicator, the rank of the current process or for splitting an existing communicator.

-## MPI datatypes and operations
+* The mpi::group class is a simple wrapper around an `MPI_Group` object.
+  Besides storing the `MPI_Group` object, it also provides some convenient functions for getting the size of the
+  group, the rank of the current process or for splitting the group based on include rules.

-@ref mpi_types_ops map various C++ datatypes to MPI datatypes and help the user with registering their own datatypes to be
-used in MPI communications.
-Furthermore, it offers tools to simplify the creation of custom MPI operations usually required in `MPI_Reduce` or `MPI_Accumulate` functions.
+It further contains the convenient function mpi::is_initialized and the static boolean mpi::has_env.

-## Collective MPI communication
+## MPI datatypes and operations

-The following generic collective communications are defined in @ref coll_comm "Collective MPI communication":
+@ref mpi_types_ops map various C++ datatypes to MPI datatypes and help the user with registering their own datatypes to
+be used in MPI communications.
+Furthermore, it offers tools to simplify the creation of custom MPI operations usually required in `MPI_Reduce` or
+`MPI_Accumulate` functions.

-* @ref mpi::all_gather "all_gather"
-* @ref mpi::all_reduce "all_reduce"
-* @ref mpi::all_reduce_in_place "all_reduce_in_place"
-* @ref mpi::broadcast "broadcast"
-* @ref mpi::gather "gather"
-* @ref mpi::reduce "reduce"
-* @ref mpi::reduce_in_place "reduce_in_place"
-* @ref mpi::scatter "scatter"
+## Collective MPI communication

-They offer a much simpler interface than their MPI C library analogs. For example, the following broadcasts a `std::vector`
-from the process with rank 0 to all others:
+**mpi** provides several generic @ref coll_comm "collective MPI communications".
+They offer a much simpler interface than their MPI C library analogs.
+For example, the following broadcasts a `std::vector` from the process with rank 0 to all others:

 ```cpp
 mpi::broadcast(vec);
@@ -55,31 +53,42 @@ Compare this with the call to the C library:

 MPI_Bcast(vec.data(), static_cast<int>(vec.size()), MPI_DOUBLE, 0, MPI_COMM_WORLD);
 ```

-Under the hood, the generic mpi::broadcast implementation calls the specialized @ref "mpi::mpi_broadcast(std::vector< T >&, mpi::communicator, int)".
-The other generic functions are implement in the same way.
-See the "Functions" section in @ref coll_comm to check which datatypes are supported out of the box.
+Under the hood, the generic mpi::broadcast implementation calls the specialized
+@ref "mpi::mpi_broadcast(std::vector< T >&, mpi::communicator, int)".
+Other generic functions in **mpi** work similarly.
+See the "Functions" section in @ref coll_comm to check which datatypes and MPI operations are supported out of the box.

 In case your datatype is not supported, you are free to provide your own specialization.

+## MPI one-sided communication and shared memory
+
+@ref mpi_osc_shm can be used to get data from or put data directly to the memory
+of another process. This can be done without the involvement of processes that
+are unaffected by the data transfer, i.e. no collective call is required, only
+the origin and target process of the data transfer must cooperate.
+
+Another use-case of @ref mpi_osc_shm is the shared memory aspect by which
+MPI applications can reduce their memory requirements through the deduplication
+of replicated data between MPI ranks that are executed on the same SMP node.
+
 ## Lazy MPI communication

 @ref mpi_lazy can be used to provide collective MPI communication for lazy expression types.
 Most users probably won't need to use this functionality directly.
-We refer the interested reader to [TRIQS/nda](https://github.com/TRIQS/nda/blob/unstable/c%2B%2B/nda/mpi/reduce.hpp) for more details.
+We refer the interested reader to [TRIQS/nda](https://triqs.github.io/nda/latest/group__av__mpi.html) for more details.

-## Error handling
+## Event handling

-@ref err_handling provides the mpi::monitor class which can be used to communicate and handle errors across multiple processes.
+@ref event_handling provides the mpi::monitor class which can be used to communicate and handle events across multiple
+processes.

 @ref ex2 shows a simple use case.

 ## Utilities

-@ref utilities is a collection of various other tools which do not fit into any other category above.
-
-The following utilities are defined in **mpi**:
+@ref utilities is a collection of various other tools in **mpi** which do not fit into any other category above.

-* @ref mpi::regular_t "regular_t"
-* @ref mpi::chunk "chunk"
-* @ref mpi::chunk_length "chunk_length"
+For users, the most useful of them is probably mpi::check_mpi_call, a wrapper function that checks the error code
+returned by MPI C library routines and throws an exception in case the code is `!= MPI_SUCCESS`.
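+
+As a minimal sketch (assuming an initialized environment and a communicator `world`), it can be used to guard raw
+calls into the MPI C library:
+
+```cpp
+mpi::check_mpi_call(MPI_Barrier(world.get()), "MPI_Barrier");
+```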
diff --git a/doc/ex2.md b/doc/ex2.md index 024c8c17..7d2d7287 100644 --- a/doc/ex2.md +++ b/doc/ex2.md @@ -17,35 +17,35 @@ int main(int argc, char *argv[]) { // initialize monitor mpi::monitor monitor(world); - // in case a stop has been requested, print some info and return true + // in case an event has occurred, print some info and return true auto stop = [&monitor, world](int i) { bool res = false; - if (monitor.emergency_occured()) { - std::cerr << "Processor " << world.rank() << ": After " << i << " steps an emergency stop has been received.\n"; + if (monitor.event_on_any_rank()) { + std::cerr << "Processor " << world.rank() << ": After " << i << " steps an event has been communicated.\n"; res = true; } return res; }; - // loop as long as no stop has been requested - int rank_to_req = 3; + // loop as long as no event has occurred + int event_rank = 3; for (int i = 0; i < 1000000; ++i) { - // request a stop on processor 3 - if (world.rank() == rank_to_req) { - std::cerr << "Processor " << rank_to_req << ": Emergency stop requested.\n"; - monitor.request_emergency_stop(); + // report a local event on the event_rank + if (world.rank() == event_rank) { + std::cerr << "Processor " << event_rank << ": Local event reported.\n"; + monitor.report_local_event(); } // should we stop the loop? if (stop(i)) break; } - // check if all processes finished without an error + // check if all processes finished the loop if (world.rank() == 0) { - if (monitor.emergency_occured()) { - std::cout << "Oh no! An error occurred somewhere.\n"; + if (monitor.event_on_any_rank()) { + std::cout << "Oh no! An event occurred somewhere and the loop has not been finished on all processes.\n"; } else { - std::cout << "No worries, all processes finished without an error.\n"; + std::cout << "No worries, all processes have finished the loop.\n"; } } } @@ -54,24 +54,24 @@ int main(int argc, char *argv[]) { Output (running with `-n 12`): ``` -Processor 3: Emergency stop requested. -Processor 3: After 0 steps an emergency stop has been received. -Processor 2: After 5950 steps an emergency stop has been received. -Processor 4: After 10475 steps an emergency stop has been received. -Processor 5: After 7379 steps an emergency stop has been received. -Processor 6: After 8366 steps an emergency stop has been received. -Processor 7: After 1302 steps an emergency stop has been received. -Processor 8: After 1155 steps an emergency stop has been received. -Processor 9: After 14445 steps an emergency stop has been received. -Processor 11: After 9287 steps an emergency stop has been received. -Processor 0: After 0 steps an emergency stop has been received. -Processor 1: After 7443 steps an emergency stop has been received. -Processor 10: After 1321 steps an emergency stop has been received. -Oh no! An error occurred somewhere. +Processor 3: Local event reported. +Processor 3: After 0 steps an event has been communicated. +Processor 4: After 8428 steps an event has been communicated. +Processor 0: After 0 steps an event has been communicated. +Processor 8: After 10723 steps an event has been communicated. +Processor 5: After 10426 steps an event has been communicated. +Processor 6: After 12172 steps an event has been communicated. +Processor 7: After 9014 steps an event has been communicated. +Processor 1: After 400 steps an event has been communicated. +Processor 2: After 1646 steps an event has been communicated. +Processor 11: After 12637 steps an event has been communicated. 
+Processor 10: After 9120 steps an event has been communicated.
+Processor 9: After 1 steps an event has been communicated.
+Oh no! An event occurred somewhere and the loop has not been finished on all processes.
 ```

 Output (running with `-n 3`):

 ```
-No worries, all processes finished without an error.
+No worries, all processes have finished the loop.
 ```
\ No newline at end of file
diff --git a/doc/ex3.md b/doc/ex3.md
index 27bddd26..4e4810ea 100644
--- a/doc/ex3.md
+++ b/doc/ex3.md
@@ -2,7 +2,8 @@

 [TOC]

-In this example, we show how to use mpi::mpi_type_from_tie, mpi::map_C_function and mpi::map_add to register a new MPI datatype and to define MPI operations for it.
+In this example, we show how to register a new MPI datatype and how to use mpi::map_C_function and mpi::map_add to
+define MPI operations for it.

 ```cpp
 #include <mpi/mpi.hpp>
@@ -19,14 +20,11 @@ inline my_complex operator+(const my_complex& z1, const my_complex& z2) {
   return { z1.real + z2.real, z1.imag + z2.imag };
 }

-// define a tie_data function for mpi_type_from_tie
+// define a tie_data function for my_complex to make it MPI compatible
 inline auto tie_data(const my_complex& z) {
   return std::tie(z.real, z.imag);
 }

-// register my_complex as an MPI type
-template <> struct mpi::mpi_type<my_complex> : mpi::mpi_type_from_tie<my_complex> {};
-
 int main(int argc, char *argv[]) {
   // initialize MPI environment
   mpi::environment env(argc, argv);
diff --git a/doc/ex4.md b/doc/ex4.md
new file mode 100644
index 00000000..7d619d5b
--- /dev/null
+++ b/doc/ex4.md
@@ -0,0 +1,63 @@
+@page ex4 Example 4: Provide custom specializations
+
+[TOC]
+
+In this example, we show how to write a specialized `mpi_reduce_into` for a custom type.
+
+```cpp
+#include <mpi/mpi.hpp>
+#include <iostream>
+#include <vector>
+
+// Custom type.
+class foo {
+  public:
+  // Constructor.
+  foo(int x = 5) : x_(x) {}
+
+  // Get the value stored in the class.
+  int x() const { return x_; }
+
+  // Specialization of mpi_reduce_into for the custom type.
+  friend void mpi_reduce_into(foo const &f_in, foo &f_out, mpi::communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) {
+    mpi::reduce_into(f_in.x_, f_out.x_, c, root, all, op);
+  }
+
+  private:
+  int x_;
+};
+
+int main(int argc, char *argv[]) {
+  // initialize MPI environment
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+
+  // create a vector of foo objects
+  std::vector<foo> vec{foo{1}, foo{2}, foo{3}, foo{4}, foo{5}};
+
+  // reduce the vector of foo objects
+  auto result = mpi::reduce(vec, world);
+
+  // print the result on rank 0
+  if (world.rank() == 0) {
+    std::cout << "Reduced vector: ";
+    for (auto const &f : result) std::cout << f.x() << " ";
+    std::cout << "\n";
+  }
+}
+```
+
+Output (running with `-n 4`):
+
+```
+Reduced vector: 4 8 12 16 20
+```
+
+Note that by providing a simple `mpi_reduce_into` for our custom `foo` type, we are able to reduce a `std::vector` of
+`foo` objects without any additional work.
+
+Under the hood, each `foo` object is reduced separately using the above specialization.
+For large amounts of data or in performance critical code sections, this might not be desired.
+In such a case, it is usually better to make the type MPI compatible such that the reduction can be done with a single
+call to the MPI C library.
+See @ref ex3 for more details.
diff --git a/doc/examples.md b/doc/examples.md
index 94c15688..2b85e2a2 100644
--- a/doc/examples.md
+++ b/doc/examples.md
@@ -5,13 +5,16 @@

 - @ref ex1 "Example 1: Hello world!"
 - @ref ex2 "Example 2: Use monitor to communicate errors"
 - @ref ex3 "Example 3: Custom type and operator"
+- @ref ex4 "Example 4: Provide custom specializations"

 @section compiling Compiling the examples

-All examples have been compiled on a MacBook Pro with an Apple M2 Max chip and [open-mpi](https://www.open-mpi.org/) 4.1.5.
-We further used clang 16.0.6 together with cmake 3.27.2.
+All examples have been compiled on a MacBook Pro with an Apple M2 Max chip and [open-mpi](https://www.open-mpi.org/)
+5.0.1.
+We further used clang 19.1.7 together with cmake 3.31.5.

-Assuming that the actual example code is in a file `main.cpp`, the following generic `CMakeLists.txt` should work for all examples:
+Assuming that the actual example code is in a file `main.cpp`, the following generic `CMakeLists.txt` should work for
+all examples:

 ```cmake
 cmake_minimum_required(VERSION 3.20)
@@ -28,7 +31,7 @@ include (FetchContent)
 FetchContent_Declare(
   mpi
   GIT_REPOSITORY https://github.com/TRIQS/mpi.git
-  GIT_TAG 1.2.x
+  GIT_TAG 1.3.x
 )
 FetchContent_MakeAvailable(mpi)
diff --git a/doc/groups.dox b/doc/groups.dox
index 568c6471..c2ae3d88 100644
--- a/doc/groups.dox
+++ b/doc/groups.dox
@@ -32,32 +32,87 @@

 /**
  * @defgroup mpi_types_ops MPI datatypes and operations
- * @brief Specify supported MPI datatypes and provide tools to simplify the creation of user-defined MPI types and operations.
+ * @brief Specify supported MPI datatypes and provide tools to simplify the creation of user-defined MPI types and
+ * operations.
  *
- * @details See @ref ex3 for a detailed example.
+ * @details The following functionality is provided:
+ *
+ * - mpi::mpi_type and its specializations let **mpi** know that a certain type `T` can be used in MPI communications.
+ *   The user is allowed to implement their own MPI compatible types and provide a specialization of mpi::mpi_type.
+ * - mpi::get_mpi_type maps a given C++ type to its corresponding MPI datatype and mpi::has_mpi_type checks if a given
+ *   type has a corresponding MPI datatype.
+ * - mpi::map_add and mpi::map_C_function can help the user to implement custom MPI operations.
+ *
+ * See @ref ex3 for a detailed example.
  */

 /**
  * @defgroup coll_comm Collective MPI communication
- * @brief Generic and specialized implementations for a subset of collective MPI communications (broadcast, reduce, gather, scatter).
+ * @brief Generic and specialized implementations for a subset of collective MPI communications (broadcast, reduce,
+ * gather, scatter).
+ *
+ * @details **mpi** provides several generic collective communications routines as well as specializations for certain
+ * common types. The generic functions usually simply forward the call to one of the specializations (`mpi_broadcast`,
+ * `mpi_gather`, `mpi_gather_into`, `mpi_reduce`, `mpi_reduce_into`, `mpi_scatter` or `mpi_scatter_into`) using ADL but
+ * can also perform some additional checks. It is therefore recommended to always use the generic versions when
+ * possible.
+ *
+ * Here is a short overview of the available generic functions:
+ * - mpi::broadcast: Calls the specialization `mpi_broadcast`.
+ * - mpi::gather: Calls the specialization `mpi_gather` if it is implemented. Otherwise, it calls mpi::gather_into with
+ *   a default constructed output object.
+ * - mpi::gather_into: Calls the specialization `mpi_gather_into`.
+ * - mpi::reduce: Calls the specialization `mpi_reduce` if it is implemented. Otherwise, it calls mpi::reduce_into with
+ *   a default constructed output object.
+ * - mpi::reduce_in_place: Calls the specialization `mpi_reduce_into` with the same input and output object.
+ * - mpi::reduce_into: Calls the specialization `mpi_reduce_into`.
+ * - mpi::scatter: Calls the specialization `mpi_scatter` if it is implemented. Otherwise, it calls mpi::scatter_into
+ *   with a default constructed output object.
+ * - mpi::scatter_into: Calls the specialization `mpi_scatter_into`.
+ *
+ * In case all processes should receive the result of the MPI operation, one can use the convenience functions
+ * mpi::all_gather, mpi::all_gather_into, mpi::all_reduce, mpi::all_reduce_in_place or mpi::all_reduce_into. They
+ * forward the given arguments to their "non-all" counterparts with the `all` argument set to true.
+ *
+ * **mpi** provides various specializations for several types. For example,
+ * - for MPI compatible types, i.e. for types that have a corresponding mpi::mpi_type, it provides an
+ *   @ref "mpi::mpi_broadcast(T &x, mpi::communicator, int)" "mpi_broadcast",
+ *   @ref "mpi::mpi_reduce(T const &, mpi::communicator, int, bool, MPI_Op)" "mpi_reduce",
+ *   @ref "mpi::mpi_reduce_into(T const &, T &, mpi::communicator, int, bool, MPI_Op)" "mpi_reduce_into",
+ *   @ref "mpi::mpi_gather(T const &, mpi::communicator, int, bool)" "mpi_gather" and an
+ *   @ref "mpi::mpi_gather_into(T const &, R &&, mpi::communicator, int, bool)" "mpi_gather_into".
+ * - for strings, it provides an @ref "mpi::mpi_broadcast(std::string &, mpi::communicator, int)" "mpi_broadcast"
+ *   and an @ref "mpi::mpi_gather_into(std::string const &, std::string &, mpi::communicator, int, bool)"
+ *   "mpi_gather_into".
+ *
+ * Users are encouraged to implement their own specializations for their custom types or in case a specialization is
+ * missing (see e.g. @ref ex4).
+ *
+ * Furthermore, there are several functions to simplify communicating (contiguous) ranges: mpi::broadcast_range,
+ * mpi::gather_range, mpi::reduce_range and mpi::scatter_range. Some of these range functions are more generic than
+ * others. Please check the documentation of the specific function for more details.
+ */
+
+/**
+ * @defgroup mpi_osc_shm MPI one-sided communication and shared memory
+ * @brief Abstraction over the `MPI_Win` object that facilitates remote memory access and node-local shared memory.
  *
- * @details The generic functions (mpi::broadcast, mpi::reduce, mpi::scatter, ...) call their more specialized counterparts
- * (e.g. mpi::mpi_broadcast, mpi::mpi_reduce, mpi::mpi_scatter, ...).
+ * @details The primary use case is shared memory, which lets MPI ranks running on the same node deduplicate
+ * replicated data.
  */

 /**
  * @defgroup mpi_lazy Lazy MPI communication
  * @brief Allow specific types to use lazy MPI communication.
  *
- * @details See for example the [lazy reduce](https://github.com/TRIQS/nda/blob/unstable/c%2B%2B/nda/mpi/reduce.hpp)
- * in the [nda library](https://github.com/TRIQS/nda) for a reduce operation on a multi-dimensional array.
+ * @details Please look at the MPI interface in the [nda library](https://triqs.github.io/nda/latest/group__av__mpi.html)
+ * for more details.
  */

 /**
- * @defgroup err_handling Error handling
- * @brief Communicate and handle errors across multiple processes.
+ * @defgroup event_handling Event handling
+ * @brief Communicate and handle events across multiple processes.
 *
- * @details A typical use case for the mpi::monitor class could be:
+ * @details A typical use case for the mpi::monitor class could be to monitor and communicate exceptions:
 *
 * @code{.cpp}
 * // initialize monitor
@@ -65,19 +120,19 @@
 * ...
 *
 * // loop as long as everything is fine
- * while (!monitor.emergency_occured()) {
+ * while (!monitor.event_on_any_rank()) {
 *   try {
 *     // do some work
 *     ...
 *   } catch (my_exception const &e) {
-      // send an emergency stop request
-      monitor.request_emergency_stop();
+      // report an exception
+      monitor.report_local_event();
 *   }
 * }
 *
- * // finalize communications and check if the computation finished due to an error
+ * // finalize communications and check if the computation finished due to an exception
 * monitor.finalize_communications();
- * if (!monitor.emergency_occured()) {
+ * if (!monitor.event_on_any_rank()) {
 *   // do some clean up and maybe stop the program
 *   ...
 * }
diff --git a/doc/installation.md b/doc/installation.md
index ff2dedce..2ac3c37e 100644
--- a/doc/installation.md
+++ b/doc/installation.md
@@ -5,8 +5,8 @@

 **mpi** is a header only library and installation is not necessary.
 However, it still supports the usual installation procedure using CMake.

-If you want to skip the installation step, you can go directly to @ref integration to see how you can integrate
-**mpi** into your own C++ project.
+If you want to skip the installation step, you can go directly to @ref integration to see how you can integrate **mpi**
+into your own C++ project.

 > **Note:** To guarantee reproducibility in scientific calculations, we strongly recommend the use of a stable
 > [release version](https://github.com/TRIQS/mpi/releases).
@@ -63,7 +63,7 @@ $ cd mpi.src && git tag
 Checkout the version of the code that you want:

 ```console
-$ git checkout 1.2.0
+$ git checkout 1.3.0
 ```

 and follow steps 2 to 4 above to compile the code.
diff --git a/doc/integration.md b/doc/integration.md
index 720a3d61..3e844bbb 100644
--- a/doc/integration.md
+++ b/doc/integration.md
@@ -3,8 +3,9 @@

 [TOC]

 **mpi** is a header only library.
-To use it in your own `C++` code, you simply have to include the relevant header files and
-tell your compiler/build system where it can find the necessary files.
+To use it in your own `C++` code, you simply have to include the relevant header files and tell your compiler/build
+system where it can find the necessary files.
+
 For example:

 ```cpp
@@ -19,9 +20,9 @@ In the following, we describe some common ways to achieve this (with special foc

 @subsection fetch FetchContent

-If you use [CMake](https://cmake.org/) to build your source code, it is recommended to fetch the source code directly from the
-[Github repository](https://github.com/TRIQS/mpi) using CMake's [FetchContent](https://cmake.org/cmake/help/latest/module/FetchContent.html)
-module:
+If you use [CMake](https://cmake.org/) to build your source code, it is recommended to fetch the source code directly
+from the [Github repository](https://github.com/TRIQS/mpi) using CMake's
+[FetchContent](https://cmake.org/cmake/help/latest/module/FetchContent.html) module:

 ```cmake
 cmake_minimum_required(VERSION 3.20)
@@ -32,7 +33,7 @@ include(FetchContent)
 FetchContent_Declare(
   mpi
   GIT_REPOSITORY https://github.com/TRIQS/mpi.git
-  GIT_TAG 1.2.x
+  GIT_TAG 1.3.x
 )
 FetchContent_MakeAvailable(mpi)

@@ -42,12 +43,13 @@ target_link_libraries(my_executable mpi::mpi_c)
 ```

 Note that the above will also build [googletest](https://github.com/google/googletest) and the unit tests for **mpi**.
-To disable this, you can put `set(Build_Tests OFF CACHE BOOL "" FORCE)` before fetching the content or by specifying `-DBuild_Tests=OFF` on the command line.
+To disable this, you can put `set(Build_Tests OFF CACHE BOOL "" FORCE)` before fetching the content or specify
+`-DBuild_Tests=OFF` on the command line.

 @subsection find_package find_package

-If you have already installed **mpi** on your system by following the instructions from the @ref installation page, you can also make
-use of CMake's [find_package](https://cmake.org/cmake/help/latest/command/find_package.html) command.
+If you have already installed **mpi** on your system by following the instructions from the @ref installation page, you
+can also make use of CMake's [find_package](https://cmake.org/cmake/help/latest/command/find_package.html) command.
 This has the advantage that you don't need to download anything, i.e. no internet connection is required.

 Let's assume that **mpi** has been installed to `path_to_install_dir`.
@@ -65,8 +67,9 @@ add_executable(my_executable main.cpp)
 target_link_libraries(my_executable mpi::mpi_c)
 ```

-In case, CMake cannot find the package, you might have to tell it where to look for the `mpi-config.cmake` file by setting the variable
-`mpi_DIR` to `path_to_install_dir/lib/cmake/mpi` or by sourcing the provided `mpivars.sh` before running CMake:
+In case CMake cannot find the package, you might have to tell it where to look for the `mpi-config.cmake` file by
+setting the variable `mpi_DIR` to `path_to_install_dir/lib/cmake/mpi` or by sourcing the provided `mpivars.sh` before
+running CMake:

 ```console
 $ source path_to_install_dir/share/mpi/mpivars.sh
@@ -74,7 +77,8 @@ $ source path_to_install_dir/share/mpi/mpivars.sh

 @subsection add_sub add_subdirectory

-You can also integrate **mpi** into our CMake project by placing the entire source tree in a subdirectory and call `add_subdirectory()`:
+You can also integrate **mpi** into your CMake project by placing the entire source tree in a subdirectory and calling
+`add_subdirectory()`:

 ```cmake
 cmake_minimum_required(VERSION 3.20)
diff --git a/doc/issues.md b/doc/issues.md
index b2bb6b76..b14cd0ee 100644
--- a/doc/issues.md
+++ b/doc/issues.md
@@ -5,17 +5,14 @@
 Please report all problems and bugs directly at the [GitHub issues page](https://github.com/TRIQS/mpi/issues).
 In order to make it easier for us to solve the issue please follow these guidelines:

-1. In all cases specify which version of the application you are using. You can
-   find the version number in the file `CMakeLists.txt` at the root of the
-   application sources.
+1. In all cases specify which version of the application you are using. You can find the version number in the file
+   `CMakeLists.txt` at the root of the application sources.

-2. If you have a problem during the installation, give us information about
-   your operating system and the compiler you are using. Include the outputs of
-   the `cmake` and `make` commands as well as the `CMakeCache.txt` file
-   which is in the build directory. Please include these outputs in a
-   [gist](http://gist.github.com/>) file referenced in the issue.
+2. If you have a problem during the installation, give us information about your operating system and the compiler you
+   are using. Include the outputs of the `cmake` and `make` commands as well as the `CMakeCache.txt` file which is in
+   the build directory. Please include these outputs in a [gist](http://gist.github.com/) file referenced in the issue.

-3.
 If you are experiencing a problem during the execution of the application, provide
-   a script which allows to quickly reproduce the problem.
+3. If you are experiencing a problem during the execution of the application, provide a script that allows us to
+   quickly reproduce the problem.

 Thanks!
diff --git a/doc/overview.md.in b/doc/overview.md.in
index 876a9631..e1b8e773 100644
--- a/doc/overview.md.in
+++ b/doc/overview.md.in
@@ -28,10 +28,11 @@ int main(int argc, char *argv[]) {
 }
 ```

-**mpi** is a minimal C++ wrapper around the MPI C library and provides only a small subset of the functionality defined in the MPI standard.
+**mpi** is a minimal C++ wrapper around the MPI C library and provides only a small subset of the functionality defined
+in the MPI standard.

-The main purpose of the library is to simplify the most common tasks like initializing/finalizing the MPI execution environment or performing
-non-blocking collective communications.
+The main purpose of the library is to simplify the most common tasks like initializing/finalizing the MPI execution
+environment or performing non-blocking collective communications.

 For more advanced tasks, the user can always resort to the underlying MPI C-implementation.
diff --git a/share/cmake/extract_flags.cmake b/share/cmake/extract_flags.cmake
index dd4edffc..7313c17f 100644
--- a/share/cmake/extract_flags.cmake
+++ b/share/cmake/extract_flags.cmake
@@ -92,6 +92,11 @@ macro(extract_flags)
     endif()
   endforeach()

+  get_property_recursive(libdirs TARGET ${target} PROPERTY INTERFACE_LINK_DIRECTORIES)
+  foreach(dir ${libdirs})
+    set(${target}_LDFLAGS "${${target}_LDFLAGS} -L${dir}")
+  endforeach()
+
   # ==== We have to replace generator expressions explicitly ====

   if(ARG_BUILD_INTERFACE)
@@ -114,8 +119,12 @@ macro(extract_flags)
   endif()

   # Remove all remaining generator expressions
-  string(REGEX REPLACE " [^ ]*\\$<[^ ]*:[^>]*>" "" ${target}_LDFLAGS "${${target}_LDFLAGS}")
-  string(REGEX REPLACE " [^ ]*\\$<[^ ]*:[^>]*>" "" ${target}_CXXFLAGS "${${target}_CXXFLAGS}")
+  string(REGEX REPLACE " [^ ]*\\$<[^ ]*:[^ ]*>" "" ${target}_LDFLAGS "${${target}_LDFLAGS}")
+  string(REGEX REPLACE " [^ ]*\\$<[^ ]*:[^ ]*>" "" ${target}_CXXFLAGS "${${target}_CXXFLAGS}")
+
+  # Filter out ::@ expressions
+  string(REGEX REPLACE "::@[^ ]* " "" ${target}_LDFLAGS "${${target}_LDFLAGS}")
+  string(REGEX REPLACE "::@[^ ]* " "" ${target}_CXXFLAGS "${${target}_CXXFLAGS}")

   # Filter out system directories from LDFLAGS and CXXFLAGS
   string(REGEX REPLACE " -L/usr/lib " " " ${target}_LDFLAGS "${${target}_LDFLAGS}")
diff --git a/share/mpi.modulefile.in b/share/mpi.modulefile.in
index 03577b09..98db4ab0 100644
--- a/share/mpi.modulefile.in
+++ b/share/mpi.modulefile.in
@@ -28,7 +28,5 @@ setenv @PROJECT_NAME@_VERSION $version
 setenv @PROJECT_NAME@_GIT_HASH $git_hash

 prepend-path PATH $root/bin
-prepend-path CPLUS_INCLUDE_PATH $root/include
-prepend-path LIBRARY_PATH $root/lib
 prepend-path LD_LIBRARY_PATH $root/lib
 prepend-path CMAKE_PREFIX_PATH $root
diff --git a/share/mpivars.sh.in b/share/mpivars.sh.in
index b0ba52af..0ae70d7e 100644
--- a/share/mpivars.sh.in
+++ b/share/mpivars.sh.in
@@ -2,8 +2,6 @@

 export @PROJECT_NAME@_ROOT=@CMAKE_INSTALL_PREFIX@

-export CPLUS_INCLUDE_PATH=@CMAKE_INSTALL_PREFIX@/include:$CPLUS_INCLUDE_PATH
 export PATH=@CMAKE_INSTALL_PREFIX@/bin:$PATH
-export LIBRARY_PATH=@CMAKE_INSTALL_FULL_LIBDIR@:$LIBRARY_PATH
 export LD_LIBRARY_PATH=@CMAKE_INSTALL_FULL_LIBDIR@:$LD_LIBRARY_PATH
 export CMAKE_PREFIX_PATH=@CMAKE_INSTALL_PREFIX@:$CMAKE_PREFIX_PATH
diff --git
a/test/c++/CMakeLists.txt b/test/c++/CMakeLists.txt
index be60f506..624a682e 100644
--- a/test/c++/CMakeLists.txt
+++ b/test/c++/CMakeLists.txt
@@ -9,8 +9,8 @@ file(GLOB_RECURSE all_tests RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
 # List of all no mpi tests
 file(GLOB_RECURSE nompi_tests RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)

-# remove custom and monitor mpi tests as they explicitly use MPI
-list(REMOVE_ITEM nompi_tests mpi_custom.cpp mpi_monitor.cpp)
+# remove custom, monitor, and window mpi tests as they explicitly use MPI
+list(REMOVE_ITEM nompi_tests mpi_custom.cpp mpi_monitor.cpp mpi_window.cpp)

 # ========= OpenMP Dependency ==========
diff --git a/test/c++/custom_types.hpp b/test/c++/custom_types.hpp
new file mode 100644
index 00000000..f803ee7d
--- /dev/null
+++ b/test/c++/custom_types.hpp
@@ -0,0 +1,69 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+
+// Custom type which is MPI compatible.
+struct mpi_t {
+  long a{0};
+  bool operator==(const mpi_t &) const = default;
+  mpi_t operator+(mpi_t x) const {
+    x.a += a;
+    return x;
+  }
+};
+
+// Tie the data (to make it MPI compatible).
+inline auto tie_data(mpi_t const &x) { return std::tie(x.a); }
+
+// Custom type which is not MPI compatible but has specialized mpi_xxx implementations.
+struct non_mpi_t {
+  int a{1};
+  bool operator==(const non_mpi_t &) const = default;
+};
+
+// Specialize mpi_broadcast for non_mpi_t.
+void mpi_broadcast(non_mpi_t &x, mpi::communicator c = {}, int root = 0) { broadcast(x.a, c, root); }
+
+// Specialize mpi_reduce_into for non_mpi_t.
+void mpi_reduce_into(non_mpi_t const &in, non_mpi_t &out, mpi::communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) {
+  mpi::reduce_into(in.a, out.a, c, root, all, op);
+}
+
+// Specialize mpi_gather for non_mpi_t.
+std::vector<non_mpi_t> mpi_gather(non_mpi_t const &x, mpi::communicator c = {}, int root = 0, bool all = false) {
+  std::vector<int> a_vec = gather(x.a, c, root, all);
+  std::vector<non_mpi_t> res{};
+  if (c.rank() == root || all) {
+    res.resize(c.size());
+    std::ranges::transform(a_vec, res.begin(), [](int a) { return non_mpi_t{a}; });
+  }
+  return res;
+}
+
+// Specialize mpi_gather_into for non_mpi_t.
+void mpi_gather_into(non_mpi_t const &x, auto &&rg, mpi::communicator c = {}, int root = 0, bool all = false) {
+  auto vec = mpi_gather(x, c, root, all);
+  if (c.rank() == root || all) std::ranges::copy(vec, std::ranges::begin(rg));
+}
diff --git a/test/c++/mpi_broadcast.cpp b/test/c++/mpi_broadcast.cpp
new file mode 100644
index 00000000..c7b91243
--- /dev/null
+++ b/test/c++/mpi_broadcast.cpp
@@ -0,0 +1,52 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include + +// Test broadcasting a single value/object. +template void test_broadcast(T root_value) { + mpi::communicator world; + for (int root = 0; root < world.size(); ++root) { + T bcast_value{}; + if (world.rank() == root) bcast_value = root_value; + mpi::broadcast(bcast_value, world, root); + EXPECT_EQ(bcast_value, root_value); + } +} + +TEST(MPI, BroadcastInteger) { test_broadcast(42); } + +TEST(MPI, BroadcastComplex) { test_broadcast(std::complex{1.0, 2.0}); } + +TEST(MPI, BroadcastCustomMPIType) { test_broadcast(mpi_t{42}); } + +TEST(MPI, BroadcastCustomNonMPIType) { test_broadcast(non_mpi_t{42}); } + +TEST(MPI, BroadcastString) { test_broadcast(std::string{"Hello World"}); } + +TEST(MPI, BroadcastPairOfStringAndComplex) { test_broadcast(std::make_pair(std::string{"Hello"}, std::complex{1.0, 2.0})); } + +TEST(MPI, BroadcastPairOfCustomMPITypeAndCustomNonMPIType) { test_broadcast(std::make_pair(mpi_t{42}, non_mpi_t{-5})); } + +MPI_TEST_MAIN; diff --git a/test/c++/mpi_broadcast_array.cpp b/test/c++/mpi_broadcast_array.cpp new file mode 100644 index 00000000..e5a50dd6 --- /dev/null +++ b/test/c++/mpi_broadcast_array.cpp @@ -0,0 +1,82 @@ +// Copyright (c) 2022-2024 Simons Foundation +// Copyright (c) 2022 Hugo U.R. Strand +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Hugo U.R. Strand + +#include "./custom_types.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +// Check if two ranges are equal. +void expect_range_eq(auto &&rg1, auto &&rg2) { + EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2)); + auto it2 = std::ranges::begin(rg2); + for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); } +} + +// Test broadcasting arrays. 
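The file above covers single-value broadcasts, including the two customization points from custom_types.hpp (tie_data reflection and free mpi_* overloads). For orientation, a minimal standalone sketch of the same API outside the gtest harness; the `point` type and `main` are illustrative only, and the `mpi::environment` RAII helper is assumed to be available from the library's public header:

```cpp
#include <mpi/mpi.hpp>

#include <iostream>
#include <tuple>

// illustrative custom type: exposing the members via tie_data makes it MPI compatible
struct point {
  double x{0}, y{0};
};
inline auto tie_data(point const &p) { return std::tie(p.x, p.y); }

int main(int argc, char *argv[]) {
  mpi::environment env(argc, argv); // initialize/finalize the MPI runtime (RAII)
  mpi::communicator world;
  point p;
  if (world.rank() == 0) p = {1.0, 2.0};
  mpi::broadcast(p, world, 0); // afterwards every rank holds {1.0, 2.0}
  std::cout << "rank " << world.rank() << ": (" << p.x << ", " << p.y << ")\n";
}
```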
diff --git a/test/c++/mpi_broadcast_array.cpp b/test/c++/mpi_broadcast_array.cpp
new file mode 100644
index 00000000..e5a50dd6
--- /dev/null
+++ b/test/c++/mpi_broadcast_array.cpp
@@ -0,0 +1,82 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <array>
+#include <complex>
+#include <cstddef>
+#include <string>
+#include <utility>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test broadcasting arrays.
+template <typename T, std::size_t N> void test_broadcast_array(std::array<T, N> const &root_values) {
+  mpi::communicator world;
+  auto arr = root_values;
+
+  // broadcast an array from different roots
+  for (int root = 0; root < world.size(); ++root) {
+    arr = {};
+    if (world.rank() == root) arr = root_values;
+    mpi::broadcast(arr, world, root);
+    expect_range_eq(arr, root_values);
+  }
+
+  // broadcast an empty array
+  std::array<T, 0> empty_arr{};
+  mpi::broadcast(empty_arr, world);
+  expect_range_eq(arr, root_values);
+}
+
+TEST(MPI, BroadcastIntegerArray) { test_broadcast_array(std::array{1, 2, 3, 4, 5}); }
+
+TEST(MPI, BroadcastComplexArray) {
+  using namespace std::complex_literals;
+  test_broadcast_array(std::array<std::complex<double>, 5>{1.0 - 1.0i, 2.0 - 2.0i, 3.0 - 3.0i, 4.0 - 4.0i, 5.0 - 5.0i});
+}
+
+TEST(MPI, BroadcastCustomMPITypeArray) { test_broadcast_array(std::array{mpi_t{1}, mpi_t{2}, mpi_t{3}, mpi_t{4}, mpi_t{5}}); }
+
+TEST(MPI, BroadcastCustomNonMPITypeArray) {
+  test_broadcast_array(std::array{non_mpi_t{1}, non_mpi_t{2}, non_mpi_t{3}, non_mpi_t{4}, non_mpi_t{5}});
+}
+
+TEST(MPI, BroadcastStringArray) { test_broadcast_array(std::array<std::string, 5>{"Hello", "World", "MPI", "Broadcast", "Array"}); }
+
+TEST(MPI, BroadcastPairArray) {
+  test_broadcast_array(std::array<std::pair<int, std::string>, 5>{{{1, "Hello"}, {2, "World"}, {3, "MPI"}, {4, "Broadcast"}, {5, "Array"}}});
+}
+
+TEST(MPI, BroadcastArrayOfDoubleArrays) {
+  std::array<std::array<double, 2>, 5> root_values{};
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 2; ++j) root_values[i][j] = i * 2 + j;
+  }
+  test_broadcast_array(root_values);
+}
+
+MPI_TEST_MAIN;
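Note the contrast with std::vector later in this diff: for std::array the element count is part of the type, so every rank agrees on the message size at compile time and no size exchange is needed. A hedged fragment (assuming a valid `world` communicator):

```cpp
// fixed-extent broadcast: the count is known from the type on every rank
std::array<int, 3> a{};
if (world.rank() == 0) a = {1, 2, 3};
mpi::broadcast(a, world, 0);
```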
diff --git a/test/c++/mpi_broadcast_range.cpp b/test/c++/mpi_broadcast_range.cpp
new file mode 100644
index 00000000..bde14643
--- /dev/null
+++ b/test/c++/mpi_broadcast_range.cpp
@@ -0,0 +1,89 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <array>
+#include <complex>
+#include <list>
+#include <ranges>
+#include <span>
+#include <string>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test broadcasting a range of objects.
+template <typename T, std::size_t N> void test_broadcast_range(std::array<T, N> root_values) {
+  mpi::communicator world;
+  std::array<T, N> def_arr{};
+  def_arr.fill(root_values[0]);
+
+  // broadcast a contiguous range from different roots
+  auto arr = root_values;
+  for (int root = 0; root < world.size(); ++root) {
+    if (world.rank() == root) {
+      arr = root_values;
+      mpi::broadcast_range(std::span{arr.begin() + 2, 3}, world, root);
+      expect_range_eq(arr, root_values);
+    } else {
+      arr = def_arr;
+      mpi::broadcast_range(std::span{arr.begin(), 3}, world, root);
+      expect_range_eq(std::span{arr.begin(), 3}, std::span{root_values.begin() + 2, 3});
+      expect_range_eq(std::span{arr.begin() + 3, 2}, std::span{def_arr.begin() + 3, 2});
+    }
+  }
+
+  // broadcast a view on a non-contiguous list
+  std::list list(def_arr.begin(), def_arr.end());
+  if (world.rank() == 0) list.assign(root_values.begin(), root_values.end());
+  mpi::broadcast_range(std::ranges::drop_view(list, 2), world);
+  if (world.rank() == 0) {
+    expect_range_eq(list, root_values);
+  } else {
+    expect_range_eq(std::ranges::drop_view(list, 2), std::ranges::drop_view(root_values, 2));
+    expect_range_eq(std::ranges::take_view(list, 2), std::ranges::take_view(def_arr, 2));
+  }
+}
+
+TEST(MPI, BroadcastIntegerRange) { test_broadcast_range(std::array{1, 2, 3, 4, 5}); }
+
+TEST(MPI, BroadcastComplexRange) {
+  using namespace std::complex_literals;
+  test_broadcast_range(std::array<std::complex<double>, 5>{1.0 - 1.0i, 2.0 - 2.0i, 3.0 - 3.0i, 4.0 - 4.0i, 5.0 - 5.0i});
+}
+
+TEST(MPI, BroadcastCustomMPITypeRange) { test_broadcast_range(std::array{mpi_t{1}, mpi_t{2}, mpi_t{3}, mpi_t{4}, mpi_t{5}}); }
+
+TEST(MPI, BroadcastCustomNonMPITypeRange) {
+  test_broadcast_range(std::array{non_mpi_t{1}, non_mpi_t{2}, non_mpi_t{3}, non_mpi_t{4}, non_mpi_t{5}});
+}
+
+TEST(MPI, BroadcastStringRange) { test_broadcast_range(std::array<std::string, 5>{"Hello", "World", "MPI", "Broadcast", "Array"}); }
+
+TEST(MPI, BroadcastPairRange) {
+  test_broadcast_range(std::array<std::pair<int, std::string>, 5>{{{1, "Hello"}, {2, "World"}, {3, "MPI"}, {4, "Broadcast"}, {5, "Array"}}});
+}
+
+MPI_TEST_MAIN;
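broadcast_range, exercised above, transfers exactly the elements of the given view, which allows broadcasting a slice of a buffer while the rest stays rank-local. A minimal fragment (assumptions: a valid `world` communicator, contiguous storage behind the span):

```cpp
std::vector<int> v(5, 0);
if (world.rank() == 0) v = {1, 2, 3, 4, 5};
// send only elements [1, 4); v[0] and v[4] keep their local values
mpi::broadcast_range(std::span{v.data() + 1, 3}, world, 0);
```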
diff --git a/test/c++/mpi_broadcast_vector.cpp b/test/c++/mpi_broadcast_vector.cpp
new file mode 100644
index 00000000..92ba8eef
--- /dev/null
+++ b/test/c++/mpi_broadcast_vector.cpp
@@ -0,0 +1,88 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <complex>
+#include <ranges>
+#include <string>
+#include <utility>
+#include <vector>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test broadcasting vectors.
+template <typename T> void test_broadcast_vector(std::vector<T> const &root_values) {
+  mpi::communicator world;
+  auto vec = root_values;
+
+  // broadcast a vector from different roots
+  for (int root = 0; root < world.size(); ++root) {
+    vec.clear();
+    if (world.rank() == root) vec = root_values;
+    mpi::broadcast(vec, world, root);
+    expect_range_eq(vec, root_values);
+  }
+
+  // broadcast an empty vector
+  if (world.rank() == 0) {
+    vec.clear();
+    mpi::broadcast(vec, world);
+    EXPECT_TRUE(vec.empty());
+  } else {
+    vec = root_values;
+    mpi::broadcast(vec, world);
+    EXPECT_TRUE(vec.empty());
+  }
+}
+
+TEST(MPI, BroadcastIntegerVector) { test_broadcast_vector(std::vector{1, 2, 3, 4, 5}); }
+
+TEST(MPI, BroadcastComplexVector) {
+  using namespace std::complex_literals;
+  test_broadcast_vector(std::vector<std::complex<double>>{1.0 - 1.0i, 2.0 - 2.0i, 3.0 - 3.0i, 4.0 - 4.0i, 5.0 - 5.0i});
+}
+
+TEST(MPI, BroadcastCustomMPITypeVector) { test_broadcast_vector(std::vector{mpi_t{1}, mpi_t{2}, mpi_t{3}, mpi_t{4}, mpi_t{5}}); }
+
+TEST(MPI, BroadcastCustomNonMPITypeVector) {
+  test_broadcast_vector(std::vector{non_mpi_t{1}, non_mpi_t{2}, non_mpi_t{3}, non_mpi_t{4}, non_mpi_t{5}});
+}
+
+TEST(MPI, BroadcastStringVector) { test_broadcast_vector(std::vector<std::string>{"Hello", "World", "MPI", "Broadcast", "Array"}); }
+
+TEST(MPI, BroadcastPairVector) {
+  test_broadcast_vector(std::vector<std::pair<int, std::string>>{{{1, "Hello"}, {2, "World"}, {3, "MPI"}, {4, "Broadcast"}, {5, "Array"}}});
+}
+
+TEST(MPI, BroadcastVectorOfDoubleVectors) {
+  std::vector<std::vector<double>> root_values(5, std::vector<double>(2));
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 2; ++j) root_values[i][j] = i * 2 + j;
+  }
+  test_broadcast_vector(root_values);
+}
+
+MPI_TEST_MAIN;

diff --git a/test/c++/mpi_chunk.cpp b/test/c++/mpi_chunk.cpp
index 88d780f6..598a244e 100644
--- a/test/c++/mpi_chunk.cpp
+++ b/test/c++/mpi_chunk.cpp
@@ -67,4 +67,54 @@ TEST(MPI, OMPHybrid) {
   EXPECT_EQ(n * (n - 1) / 2, sum);
 }
 
+TEST(MPI, OMPHybridParallelForSingleLoop) {
+  // first divide a range among MPI processes and then among OMP threads
+  mpi::communicator world;
+  int const n = 10;
+  long sum = 0;
+#pragma omp parallel for reduction(+ : sum)
+  for (auto i : mpi::chunk(range(n))) sum += i;
+
+  // reduce and check the sum, i.e. that every element of the range has been visited
+  sum = mpi::all_reduce(sum, world);
+  EXPECT_EQ(n * (n - 1) / 2, sum);
+}
+
+TEST(MPI, OMPHybridParallelForDoubleLoop) {
+  mpi::communicator world;
+  int const n1 = 11;
+  int const n2 = 13;
+
+  // divide the outer loop among MPI processes, collapse the inner and outer loop and divide it among OMP threads
+  long sum = 0;
+#pragma omp parallel for collapse(2) reduction(+ : sum)
+  for (auto i : mpi::chunk(range(n1))) {
+    for (auto j : range(n2)) sum += i * n2 + j;
+  }
+
+  // reduce and check the sum, i.e. that every element of the range has been visited
+  sum = mpi::all_reduce(sum, world);
+  EXPECT_EQ(n1 * n2 * (n1 * n2 - 1) / 2, sum);
+}
+
+TEST(MPI, OMPHybridParallelForTripleLoop) {
+  mpi::communicator world;
+  int const n1 = 11;
+  int const n2 = 13;
+  int const n3 = 17;
+
+  // divide the outer loop among MPI processes, collapse the inner and outer loops and divide it among OMP threads
+  long sum = 0;
+#pragma omp parallel for collapse(3) reduction(+ : sum)
+  for (auto i : mpi::chunk(range(n1))) {
+    for (auto j : range(n2)) {
+      for (auto k : range(n3)) sum += i * n2 * n3 + j * n3 + k;
+    }
+  }
+
+  // reduce and check the sum, i.e. that every element of the range has been visited
+  sum = mpi::all_reduce(sum, world);
+  EXPECT_EQ(n1 * n2 * n3 * (n1 * n2 * n3 - 1) / 2, sum);
+}
+
 MPI_TEST_MAIN;

diff --git a/test/c++/mpi_comm_split.cpp b/test/c++/mpi_communicator.cpp
similarity index 55%
rename from test/c++/mpi_comm_split.cpp
rename to test/c++/mpi_communicator.cpp
index 58d0e842..010e3d49 100644
--- a/test/c++/mpi_comm_split.cpp
+++ b/test/c++/mpi_communicator.cpp
@@ -19,7 +19,26 @@
 
 #include
 
-TEST(MPI, CommunicatorSplit) {
+TEST(MPI, CommunicatorDuplicateWorld) {
+  mpi::communicator world;
+
+  // skip the rest of the test if there is no active MPI runtime
+  if (!mpi::has_env) return;
+
+  // duplicate and check the communicator
+  auto dup = world.duplicate();
+  EXPECT_EQ(world.rank(), dup.rank());
+  EXPECT_EQ(world.size(), dup.size());
+  EXPECT_EQ(MPI_COMM_WORLD, world.get());
+  EXPECT_NE(world.get(), dup.get());
+
+  // free the communicator
+  EXPECT_FALSE(dup.is_null());
+  dup.free();
+  EXPECT_TRUE(dup.is_null());
+}
+
+TEST(MPI, CommunicatorSplitAndDuplicate) {
   mpi::communicator world;
 
   int rank = world.rank();
@@ -39,6 +58,29 @@ TEST(MPI, CommunicatorSplit) {
   auto exp_ranks = std::array{0, 0, 0, 1};
   EXPECT_EQ(exp_sizes[rank], comm.size());
   EXPECT_EQ(exp_ranks[rank], comm.rank());
+
+  // duplicate the split communicator and check
+  auto dup = comm.duplicate();
+  EXPECT_EQ(comm.rank(), dup.rank());
+  EXPECT_EQ(comm.size(), dup.size());
+
+  // free the communicators
+  EXPECT_FALSE(dup.is_null());
+  EXPECT_FALSE(comm.is_null());
+  dup.free();
+  comm.free();
+  EXPECT_TRUE(dup.is_null());
+  EXPECT_TRUE(comm.is_null());
+}
+
+TEST(MPI_Window, CommunicatorSplitShared) {
+  mpi::communicator world;
+  [[maybe_unused]] auto shm = world.split_shared();
+}
+
+TEST(MPI, SharedCommunicatorDefaultConstructor) {
+  mpi::shared_communicator comm{};
+  EXPECT_TRUE(comm.is_null());
 }
 
 MPI_TEST_MAIN;

diff --git a/test/c++/mpi_custom.cpp b/test/c++/mpi_custom.cpp
index 619c8470..62b4badd 100644
--- a/test/c++/mpi_custom.cpp
+++ b/test/c++/mpi_custom.cpp
@@ -39,29 +39,9 @@ struct custom_cplx {
 // tie the data (used to construct the custom MPI type)
 inline auto tie_data(custom_cplx z) { return std::tie(z.real, z.imag); }
 
-// specialize mpi_type for custom_cplx
-template <> struct mpi::mpi_type<custom_cplx> : mpi::mpi_type_from_tie<custom_cplx> {};
-
 // stand-alone add function (the same as the operator+ above)
 custom_cplx add(custom_cplx const &x, custom_cplx const &y) { return x + y; }
 
-// needs to be in the mpi namespace for ADL to work
-namespace mpi {
-
-  // specialize mpi_reduce for std::array
-  template <typename T, size_t N>
-  std::array<T, N> mpi_reduce(std::array<T, N> const &arr, mpi::communicator c = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) {
-    std::array<T, N> res{};
-    if (all) {
-      MPI_Allreduce(arr.data(), res.data(), N, mpi::mpi_type<T>::get(), op, c.get());
-    } else {
-      MPI_Reduce(arr.data(), res.data(), N, mpi::mpi_type<T>::get(), op, root, c.get());
-    }
-    return res;
-  }
-
-}
-
 TEST(MPI, CustomTypeMapAdd) {
   mpi::communicator world;
   int rank = world.rank();
@@ -148,9 +128,88 @@ TEST(MPI, TupleMPIDatatypes) {
 
   using type5 = std::tuple<int, double, char, custom_cplx, bool>;
   type5 tup5;
-  if (rank == root) { tup5 = std::make_tuple(100, 3.1314, 'r', custom_cplx{1.0, 2.0}, false); }
+  if (rank == root) { tup5 = std::make_tuple(100, 3.1314, 'r', custom_cplx{.real = 1.0, .imag = 2.0}, false); }
   mpi::broadcast(tup5, world, root);
   EXPECT_EQ(tup5, std::make_tuple(100, 3.1314, 'r', custom_cplx{1.0, 2.0}, false));
 }
 
+// a simple struct representing a complex number that is serializable
+struct serializable_cplx {
+  double real{}, imag{};
+
+  // add two serializable_cplx objects
+  serializable_cplx operator+(serializable_cplx z) const {
+    z.real += real;
+    z.imag += imag;
+    return z;
+  }
+
+  // default equal-to operator
+  bool operator==(const serializable_cplx &) const = default;
+
+  // serialize the object
+  void serialize(auto &ar) const { ar & real & imag; }
+  void deserialize(auto &ar) { ar & real & imag; }
+};
+
+// a simple struct that contains a serializable type and is serializable itself
+struct serializable_container {
+  serializable_cplx z1;
+  custom_cplx z2;
+
+  // add two serializable_container objects
+  serializable_container operator+(serializable_container z) const {
+    z.z1 = z.z1 + z1;
+    z.z2 = z.z2 + z2;
+    return z;
+  }
+
+  // default equal-to operator
+  bool operator==(const serializable_container &) const = default;
+
+  // serialize the object
+  void serialize(auto &ar) const { ar & z1 & z2; }
+  void deserialize(auto &ar) { ar & z1 & z2; }
+};
+
+// check Serializable concept
+static_assert(mpi::Serializable<serializable_cplx>);
+static_assert(mpi::Serializable<serializable_container>);
+
+TEST(MPI, SerializableMPIDatatypes) {
+  mpi::communicator world;
+  int rank = world.rank();
+  int root = 0;
+
+  // check broadcast
+  auto z_exp = serializable_cplx{.real = 1.0, .imag = 2.0};
+  auto z = (rank == root ? z_exp : serializable_cplx{});
+  mpi::broadcast(z, world, root);
+  EXPECT_EQ(z, z_exp);
+
+  // check all_reduce
+  auto z_red = mpi::all_reduce(z, world, mpi::map_add<serializable_cplx>());
+  EXPECT_DOUBLE_EQ(z_exp.real * world.size(), z_red.real);
+  EXPECT_DOUBLE_EQ(z.imag * world.size(), z_red.imag);
+}
+
+TEST(MPI, SerializableOfSerializableMPIDatatypes) {
+  mpi::communicator world;
+  int rank = world.rank();
+  int root = 0;
+
+  // check broadcast
+  auto c_exp = serializable_container{.z1 = {.real = 1.0, .imag = 2.0}, .z2 = {.real = 3.0, .imag = 4.0}};
+  auto c = (rank == root ? c_exp : serializable_container{});
+  mpi::broadcast(c, world, root);
+  EXPECT_EQ(c, c_exp);
+
+  // check all_reduce
+  auto c_red = mpi::all_reduce(c, world, mpi::map_add<serializable_container>());
+  EXPECT_DOUBLE_EQ(c_exp.z1.real * world.size(), c_red.z1.real);
+  EXPECT_DOUBLE_EQ(c_exp.z1.imag * world.size(), c_red.z1.imag);
+  EXPECT_DOUBLE_EQ(c_exp.z2.real * world.size(), c_red.z2.real);
+  EXPECT_DOUBLE_EQ(c_exp.z2.imag * world.size(), c_red.z2.imag);
+}
+
 MPI_TEST_MAIN;
diff --git a/test/c++/mpi_gather.cpp b/test/c++/mpi_gather.cpp
new file mode 100644
index 00000000..4d599285
--- /dev/null
+++ b/test/c++/mpi_gather.cpp
@@ -0,0 +1,127 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <complex>
+#include <ranges>
+#include <string>
+#include <vector>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test gathering single values/objects.
+template <typename T> void test_gather(std::vector<T> result) {
+  mpi::communicator world;
+
+  // gather from different roots
+  for (int root = 0; root < world.size(); ++root) {
+    // gather single objects into a vector
+    auto vec = mpi::gather(result[world.rank()], world, root);
+    if (world.rank() == root)
+      expect_range_eq(vec, result);
+    else
+      EXPECT_TRUE(vec.empty());
+
+    // gather single objects into an existing vector
+    if (world.rank() == root) {
+      vec.assign(world.size(), T{0});
+      mpi::gather_into(result[world.rank()], vec, world, root);
+      expect_range_eq(vec, result);
+    } else {
+      vec.clear();
+      mpi::gather_into(result[world.rank()], vec, world, root);
+      EXPECT_TRUE(vec.empty());
+    }
+  }
+
+  // allgather single objects into a vector
+  auto vec = mpi::all_gather(result[world.rank()], world);
+  expect_range_eq(vec, result);
+
+  // allgather single objects into an existing vector
+  vec.assign(world.size(), T{0});
+  mpi::all_gather_into(result[world.rank()], vec, world);
+  expect_range_eq(vec, result);
+}
+
+TEST(MPI, GatherInteger) {
+  mpi::communicator world;
+  std::vector<int> result(world.size());
+  for (int i = 0; i < world.size(); ++i) result[i] = i + 1;
+  test_gather(result);
+}
+
+TEST(MPI, GatherComplex) {
+  mpi::communicator world;
+  std::vector<std::complex<double>> result(world.size());
+  for (int i = 0; i < world.size(); ++i) result[i] = std::complex{i + 1.0, -(i + 1.0)};
+  test_gather(result);
+}
+
+TEST(MPI, GatherCustomMPIType) {
+  mpi::communicator world;
+  std::vector<mpi_t> result(world.size());
+  for (int i = 0; i < world.size(); ++i) result[i] = mpi_t{i + 1};
+  test_gather(result);
+}
+
+TEST(MPI, GatherCustomNonMPIType) {
+  mpi::communicator world;
+  std::vector<non_mpi_t> result(world.size());
+  for (int i = 0; i < world.size(); ++i) result[i] = non_mpi_t{i + 1};
+  test_gather(result);
+}
+
+// Test gathering a string.
+TEST(MPI, GatherString) {
+  mpi::communicator world;
+  std::string str{}, result{};
+  for (int i = 0; i < world.size(); ++i) {
+    for (int j = 0; j < i + 1; ++j) result += "a";
+    result += std::to_string(i);
+  }
+  for (int i = 0; i < world.rank() + 1; ++i) str += "a";
+  str += std::to_string(world.rank());
+
+  // gather strings
+  for (int root = 0; root < world.size(); ++root) {
+    auto str_gathered = mpi::gather(str, world, root);
+    if (world.rank() == root)
+      EXPECT_EQ(str_gathered, result);
+    else
+      EXPECT_TRUE(str_gathered.empty());
+  }
+
+  // allgather strings
+  auto str_gathered = mpi::all_gather(str);
+  EXPECT_EQ(str_gathered, result);
+
+  // allgather empty strings
+  auto empty_str = mpi::all_gather(std::string{});
+  EXPECT_TRUE(empty_str.empty());
+}
+
+MPI_TEST_MAIN;

diff --git a/test/c++/mpi_gather_range.cpp b/test/c++/mpi_gather_range.cpp
new file mode 100644
index 00000000..2768db7b
--- /dev/null
+++ b/test/c++/mpi_gather_range.cpp
@@ -0,0 +1,88 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <complex>
+#include <ranges>
+#include <span>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test gathering a range of objects.
+template <typename T> void test_gather_range(std::vector<T> const &values, std::vector<T> const &result) {
+  mpi::communicator world;
+
+  // gather on different roots
+  for (int root = 0; root < world.size(); ++root) {
+    // gather spans into a view of a vector
+    std::vector<T> vec(result.size() * 2, T{0});
+    mpi::gather_range(std::span{values}, std::ranges::drop_view(vec, result.size()), world, root);
+    if (world.rank() == root) {
+      expect_range_eq(std::ranges::drop_view(vec, result.size()), result);
+      expect_range_eq(std::ranges::take_view(vec, result.size()), std::vector<T>(result.size(), T{0}));
+    } else {
+      expect_range_eq(vec, std::vector<T>(result.size() * 2, T{0}));
+    }
+  }
+
+  // allgather vectors into an oversized vector
+  std::vector<T> vec(result.size() * 2, T{0});
+  mpi::gather_range(values, std::span{vec.begin(), result.size()}, world, 0, true);
+  expect_range_eq(std::ranges::take_view(vec, result.size()), result);
+  expect_range_eq(std::ranges::drop_view(vec, result.size()), std::vector<T>(result.size(), T{0}));
+}
+
+TEST(MPI, GatherIntegerRange) {
+  mpi::communicator world;
+  std::vector<int> values, result;
+  for (int i = 0; i < world.size(); ++i) {
+    for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i);
+  }
+  for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank());
+  test_gather_range(values, result);
+}
+
+TEST(MPI, GatherComplexRange) {
+  mpi::communicator world;
+  std::vector<std::complex<double>> values, result;
+  for (int i = 0; i < world.size(); ++i) {
+    for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i, -i);
+  }
+  for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank(), -world.rank());
+  test_gather_range(values, result);
+}
+
+TEST(MPI, GatherCustomMPITypeRange) {
+  mpi::communicator world;
+  std::vector<mpi_t> values, result;
+  for (int i = 0; i < world.size(); ++i) {
+    for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i);
+  }
+  for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank());
+  test_gather_range(values, result);
+}
+
+MPI_TEST_MAIN;

diff --git a/test/c++/mpi_gather_vector.cpp b/test/c++/mpi_gather_vector.cpp
new file mode 100644
index 00000000..f5a8cadd
--- /dev/null
+++ b/test/c++/mpi_gather_vector.cpp
@@ -0,0 +1,102 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <complex>
+#include <ranges>
+#include <vector>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test gathering vectors.
+template <typename T> void test_gather_vector(std::vector<T> const &values, std::vector<T> const &result) {
+  mpi::communicator world;
+
+  // gather on different roots
+  for (int root = 0; root < world.size(); ++root) {
+    if constexpr (mpi::has_mpi_type<T>) {
+      // gather vectors into a new vector
+      auto vec = mpi::gather(values, world, root);
+      if (world.rank() == root)
+        expect_range_eq(vec, result);
+      else
+        EXPECT_TRUE(vec.empty());
+
+      // gather vectors into an existing vector
+      vec.clear();
+      mpi::gather_into(values, vec, world, root);
+      if (world.rank() == root)
+        expect_range_eq(vec, result);
+      else
+        EXPECT_TRUE(vec.empty());
+    }
+
+    // gather empty vectors
+    auto vec = mpi::gather(std::vector<T>{}, world, root);
+    EXPECT_TRUE(vec.empty());
+  }
+
+  // allgather vectors into a new vector
+  auto vec = mpi::all_gather(values, world);
+  expect_range_eq(vec, result);
+
+  // allgather vectors into an existing vector
+  vec.clear();
+  mpi::all_gather_into(values, vec, world);
+  expect_range_eq(vec, result);
+}
+
+TEST(MPI, GatherIntegerVector) {
+  mpi::communicator world;
+  std::vector<int> values, result;
+  for (int i = 0; i < world.size(); ++i) {
+    for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i);
+  }
+  for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank());
+  test_gather_vector(values, result);
+}
+
+TEST(MPI, GatherComplexVector) {
+  mpi::communicator world;
+  std::vector<std::complex<double>> values, result;
+  for (int i = 0; i < world.size(); ++i) {
+    for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i, -i);
+  }
+  for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank(), -world.rank());
+  test_gather_vector(values, result);
+}
+
+TEST(MPI, GatherCustomMPITypeVector) {
+  mpi::communicator world;
+  std::vector<mpi_t> values, result;
+  for (int i = 0; i < world.size(); ++i) {
+    for (int j = 0; j < 2 * (i + 1); ++j) result.emplace_back(i);
+  }
+  for (int i = 0; i < 2 * (world.rank() + 1); ++i) values.emplace_back(world.rank());
+  test_gather_vector(values, result);
+}
+
+MPI_TEST_MAIN;

diff --git a/test/c++/mpi_cplx.cpp b/test/c++/mpi_generic.cpp
similarity index 57%
rename from test/c++/mpi_cplx.cpp
rename to test/c++/mpi_generic.cpp
index 13c14451..f3905373 100644
--- a/test/c++/mpi_cplx.cpp
+++ b/test/c++/mpi_generic.cpp
@@ -18,18 +18,25 @@
 #include
 #include
 
-#include
-
-TEST(MPI, ComplexBroadcast) {
-  // broadcast a complex number
+TEST(MPI, AllEqual) {
+  // check if a value is equal on all ranks
   mpi::communicator world;
-  std::complex<double> cplx;
-  if (world.rank() == 0) cplx = std::complex(1., 2.);
-
-  mpi::broadcast(cplx);
+  int val_i = 10;
+  EXPECT_TRUE(mpi::all_equal(val_i, world));
+  double val_d = 3.1415;
+  EXPECT_TRUE(mpi::all_equal(val_d, world));
+  std::vector<int> val_v = {1, 2, 3};
+  EXPECT_TRUE(mpi::all_equal(val_v, world));
 
-  EXPECT_EQ(cplx, std::complex(1., 2.));
+  if (world.size() > 1) {
+    if (world.rank() == 1) val_i -= 1;
+    EXPECT_FALSE(mpi::all_equal(val_i, world));
+    if (world.rank() == 1) val_d -= 1.0;
+    EXPECT_FALSE(mpi::all_equal(val_d, world));
+    if (world.rank() == 1) val_v[0] -= 1;
+    EXPECT_FALSE(mpi::all_equal(val_v, world));
+  }
 }
 
 MPI_TEST_MAIN;

diff --git a/test/c++/mpi_group.cpp b/test/c++/mpi_group.cpp
new file mode 100644
index 00000000..1f80dc18
--- /dev/null
+++ b/test/c++/mpi_group.cpp
@@ -0,0 +1,68 @@
+// Copyright (c) 2020-2024 Simons Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Olivier Parcollet, Nils Wentzell
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <algorithm>
+#include <vector>
+
+TEST(MPI, GroupDefaultConstructor) {
+  mpi::group g;
+  EXPECT_TRUE(g.is_null());
+}
+
+TEST(MPI, GroupCommWorld) {
+  mpi::communicator world;
+
+  mpi::group g(world);
+  EXPECT_EQ(g.rank(), world.rank());
+  EXPECT_EQ(g.size(), world.size());
+
+  // move operations
+  auto g2 = std::move(g);
+  EXPECT_EQ(g2.rank(), world.rank());
+  EXPECT_EQ(g2.size(), world.size());
+  EXPECT_TRUE(g.is_null());
+}
+
+TEST(MPI, GroupInclude) {
+  mpi::communicator world;
+  mpi::group g(world);
+
+  // include every second rank
+  std::vector<int> ranks;
+  for (int i = 0; i < world.size(); i += 2) ranks.push_back(i);
+  auto g2 = g.include(ranks);
+  EXPECT_EQ(g2.size(), ranks.size());
+  if (std::ranges::find(ranks, world.rank()) != ranks.end()) {
+    EXPECT_EQ(world.rank(), ranks[g2.rank()]);
+  } else {
+    EXPECT_EQ(g2.rank(), MPI_UNDEFINED);
+  }
+
+  // include every second rank (starting from the back)
+  ranks.clear();
+  for (int i = world.size() - 1; i >= 0; i -= 2) ranks.push_back(i);
+  auto g3 = g.include(ranks);
+  EXPECT_EQ(g3.size(), ranks.size());
+  if (std::ranges::find(ranks, world.rank()) != ranks.end()) {
+    EXPECT_EQ(world.rank(), ranks[g3.rank()]);
+  } else {
+    EXPECT_EQ(g3.rank(), MPI_UNDEFINED);
+  }
+}
+
+MPI_TEST_MAIN;
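The monitor test below drives the renamed event API: ranks call report_local_event(), every rank can poll event_on_any_rank() or event_on_all_ranks(), and finalize_communications() completes the protocol before the final verdict is read. A condensed fragment of that life cycle (assuming a valid `world` communicator; the loop bound and the local condition are illustrative):

```cpp
mpi::monitor monitor{world};
for (int i = 0; i < 10 and not monitor.event_on_any_rank(); ++i) {
  bool local_problem = false; // illustrative condition
  if (local_problem) monitor.report_local_event();
}
monitor.finalize_communications();
bool all_good = not monitor.event_on_any_rank();
```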
diff --git a/test/c++/mpi_monitor.cpp b/test/c++/mpi_monitor.cpp
index dab028a4..97dc4252 100644
--- a/test/c++/mpi_monitor.cpp
+++ b/test/c++/mpi_monitor.cpp
@@ -15,74 +15,82 @@
 // Authors: Philipp Dumitrescu, Thomas Hahn, Olivier Parcollet
 
 #include
-#include
+#include
+#include
 #include
 #include
+#include
 #include
 #include
 
 // in microseconds = 1 millisecond
 const int delta_tau_sleep = 1000;
 
-// Monitor all nodes while some of them might fail.
+// Monitor all nodes while some of them might report an event.
 //
-// c: MPI communicator
-// fastest_node: rank of the fastest node
-// rank_failing: ranks of the nodes that will fail
-// iteration_failure: iteration at which the nodes will fail
-bool test(mpi::communicator c, int fastest_node, std::vector<int> rank_failing, int iteration_failure = 3) {
+// c: MPI communicator.
+// fastest_node: Rank of the fastest node.
+// rank_reporting: Ranks of the nodes that will report an event.
+// all_events: If true, the all_events_occurred() function will be used instead of some_event_occurred().
+// iteration_event: Iteration at which the nodes will report an event.
+bool test_monitor(mpi::communicator c, int fastest_node, std::vector<int> rank_reporting, bool all_events = false, int iteration_event = 3) {
   const int niter = 10;
   const int size = c.size();
   int sleeptime = delta_tau_sleep * (((c.rank() - fastest_node + size) % size) + 1);
-  bool will_fail = std::any_of(rank_failing.cbegin(), rank_failing.cend(), [&c](int i) { return i == c.rank(); });
+  bool will_report = std::any_of(rank_reporting.cbegin(), rank_reporting.cend(), [&c](int i) { return i == c.rank(); });
   std::cerr << "Node " << c.rank() << ": sleeptime " << sleeptime << std::endl;
 
   mpi::monitor monitor{c};
+  auto events_occurred = [all_events, &monitor]() { return all_events ? monitor.event_on_all_ranks() : monitor.event_on_any_rank(); };
 
-  for (int i = 0; (!monitor.emergency_occured()) and (i < niter); ++i) {
+  for (int i = 0; (!events_occurred()) and (i < niter); ++i) {
     usleep(sleeptime);
-    std::cerr << "Node " << c.rank() << "is in iteration " << i << std::endl;
-    if (will_fail and (i >= iteration_failure)) {
+    std::cerr << "Node " << c.rank() << " is in iteration " << i << std::endl;
+    if (will_report and (i >= iteration_event)) {
       std::cerr << "Node " << c.rank() << " is failing" << std::endl;
-      monitor.request_emergency_stop();
-      monitor.request_emergency_stop(); // 2nd call should not resend MPI message
+      monitor.report_local_event();
+      monitor.report_local_event(); // 2nd call should not resend MPI message
     }
     if (i == niter - 1) { std::cerr << "Node " << c.rank() << " has done all tasks" << std::endl; }
   }
 
   monitor.finalize_communications();
   std::cerr << "Ending on node " << c.rank() << std::endl;
-  return not monitor.emergency_occured();
+  return not events_occurred();
}
 
-TEST(MPI, MonitorNoFailure) {
-  // no failure
+TEST(MPI, MonitorNoEvent) {
+  // no event
   usleep(1000);
   mpi::communicator world;
   for (int i = 0; i < world.size(); ++i) {
     world.barrier();
     if (world.rank() == 0) std::cerr << "***\nNode " << i << " is the fastest" << std::endl;
-    EXPECT_TRUE(test(world, i, {}));
+    EXPECT_TRUE(test_monitor(world, i, {}));
+    world.barrier();
+    EXPECT_TRUE(test_monitor(world, i, {}, true));
     world.barrier();
   }
}
 
-TEST(MPI, MonitorOneFailureOnRoot) {
-  // root node fails
+TEST(MPI, MonitorOneEventOnRoot) {
+  // one event on root node
   usleep(1000);
   mpi::communicator world;
   for (int i = 0; i < world.size(); ++i) {
     world.barrier();
     if (world.rank() == 0) std::cerr << "***\nNode " << i << " is the fastest" << std::endl;
-    EXPECT_EQ(test(world, i, {0}), false);
+    EXPECT_EQ(test_monitor(world, i, {0}), false);
+    world.barrier();
+    EXPECT_EQ(test_monitor(world, i, {0}, true), world.size() > 1);
     world.barrier();
   }
   usleep(1000);
}
 
-TEST(MPI, MonitorOneFailureOnNonRoot) {
-  // one non-root node fails
+TEST(MPI, MonitorOneEventOnNonRoot) {
+  // one event on non-root node
   usleep(1000);
   mpi::communicator world;
   if (world.size() < 2) {
@@ -92,15 +100,17 @@ TEST(MPI, MonitorOneFailureOnNonRoot) {
     world.barrier();
     if (world.rank() == 0) std::cerr << "***\nNode " << i << " is the fastest" << std::endl;
     bool has_failure = (world.size() > 1 ? false : true); // No failure if only rank 0 exists
-    EXPECT_EQ(test(world, i, {1}), has_failure);
+    EXPECT_EQ(test_monitor(world, i, {1}), has_failure);
+    world.barrier();
+    EXPECT_EQ(test_monitor(world, i, {1}, true), world.size() > 1);
     world.barrier();
   }
 }
   usleep(1000);
}
 
-TEST(MPI, MonitorTwoFailuresWithRoot) {
-  // two nodes fail including the root process
+TEST(MPI, MonitorTwoEventsWithRoot) {
+  // two events on nodes including the root process
   usleep(1000);
   mpi::communicator world;
   if (world.size() < 2) {
@@ -109,15 +119,17 @@ TEST(MPI, MonitorTwoFailuresWithRoot) {
   for (int i = 0; i < world.size(); ++i) {
     world.barrier();
     if (world.rank() == 0) std::cerr << "***\nNode " << i << " is the fastest" << std::endl;
-    EXPECT_EQ(test(world, i, {0, 1}), false);
+    EXPECT_EQ(test_monitor(world, i, {0, 1}), false);
+    world.barrier();
+    EXPECT_EQ(test_monitor(world, i, {0, 1}, true), world.size() > 2);
     world.barrier();
   }
 }
   usleep(1000);
}
 
-TEST(MPI, MonitorTwoFailuresWithoutRoot) {
-  // two nodes fail excluding the root process
+TEST(MPI, MonitorTwoEventsWithoutRoot) {
+  // two events on nodes excluding the root process
   usleep(1000);
   mpi::communicator world;
   if (world.size() < 3) {
@@ -126,11 +138,54 @@ TEST(MPI, MonitorTwoFailuresWithoutRoot) {
   for (int i = 0; i < world.size(); ++i) {
     world.barrier();
     if (world.rank() == 0) std::cerr << "***\nNode " << i << " is the fastest" << std::endl;
-    EXPECT_EQ(test(world, i, {1, 2}), false);
+    EXPECT_EQ(test_monitor(world, i, {1, 2}), false);
+    world.barrier();
+    EXPECT_EQ(test_monitor(world, i, {1, 2}, true), world.size() > 2);
     world.barrier();
   }
 }
   usleep(1000);
}
 
+TEST(MPI, MonitorAllEvents) {
+  // events on all nodes
+  usleep(1000);
+  mpi::communicator world;
+  std::vector<int> rank_reporting(world.size());
+  std::iota(rank_reporting.begin(), rank_reporting.end(), 0);
+  for (int i = 0; i < world.size(); ++i) {
+    world.barrier();
+    if (world.rank() == 0) std::cerr << "***\nNode " << i << " is the fastest" << std::endl;
+    EXPECT_FALSE(test_monitor(world, i, rank_reporting));
+    world.barrier();
+    EXPECT_FALSE(test_monitor(world, i, rank_reporting, true));
+    world.barrier();
+  }
+  usleep(1000);
+}
+
+TEST(MPI, MultipleMonitors) {
+  // test multiple monitors
+  usleep(1000);
+  mpi::communicator world;
+  mpi::monitor monitor1{world};
+  mpi::monitor monitor2{world};
+  mpi::monitor monitor3{world};
+  if (world.rank() == 0) {
+    monitor3.report_local_event();
+  }
+  monitor2.report_local_event();
+  monitor1.finalize_communications();
+  monitor2.finalize_communications();
+  monitor3.finalize_communications();
+  EXPECT_FALSE(monitor1.event_on_any_rank());
+  EXPECT_FALSE(monitor1.event_on_all_ranks());
+  EXPECT_TRUE(monitor2.event_on_any_rank());
+  EXPECT_TRUE(monitor2.event_on_all_ranks());
+  EXPECT_TRUE(monitor3.event_on_any_rank());
+  if (world.size() == 1) EXPECT_TRUE(monitor3.event_on_all_ranks());
+  else EXPECT_FALSE(monitor3.event_on_all_ranks());
+  usleep(1000);
+}
+
 MPI_TEST_MAIN;

diff --git a/test/c++/mpi_optional.cpp b/test/c++/mpi_optional.cpp
new file mode 100644
index 00000000..5f812308
--- /dev/null
+++ b/test/c++/mpi_optional.cpp
@@ -0,0 +1,103 @@
+// Copyright (c) 2024 Simons Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Nils Wentzell

+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <complex>
+#include <optional>
+
+// Test broadcasting an optional with a value.
+template <typename T> void test_broadcast_optional(T root_value) {
+  mpi::communicator world;
+  for (int root = 0; root < world.size(); ++root) {
+    std::optional<T> bcast_value{};
+    if (world.rank() == root) bcast_value = root_value;
+    mpi::broadcast(bcast_value, world, root);
+    EXPECT_EQ(bcast_value, root_value);
+  }
+}
+
+TEST(MPI, BroadcastOptionalInt) { test_broadcast_optional(42); }
+
+TEST(MPI, BroadcastOptionalComplex) { test_broadcast_optional(std::complex{1.0, 2.0}); }
+
+TEST(MPI, BroadcastEmptyOptionalInt) {
+  mpi::communicator world;
+  for (int root = 0; root < world.size(); ++root) {
+    std::optional<int> bcast_value{};
+    if (world.rank() != root) bcast_value = {}; // non-root has value, root is empty
+    mpi::broadcast(bcast_value, world, root);
+    EXPECT_FALSE(bcast_value.has_value()); // after broadcast, all should be empty like root
+  }
+}
+
+// Test reducing an optional with a value.
+template <typename T> void test_reduce_optional(T value, T result) {
+  mpi::communicator world;
+
+  // reduce from different roots
+  for (int root = 0; root < world.size(); ++root) {
+    // reduce into new object
+    auto red_value = mpi::reduce(std::optional{value}, world, root);
+    if (world.rank() == root) { EXPECT_EQ(red_value, result); }
+
+    // reduce into existing object
+    std::optional<T> red_out{};
+    mpi::reduce_into(std::optional{value}, red_out, world, root);
+    if (world.rank() == root) { EXPECT_EQ(red_out, result); }
+  }
+
+  // allreduce into new object
+  auto red_value = mpi::all_reduce(std::optional{value}, world);
+  EXPECT_EQ(red_value, result);
+
+  // allreduce into existing object
+  std::optional<T> red_out{};
+  mpi::all_reduce_into(std::optional{value}, red_out, world);
+  EXPECT_EQ(red_out, result);
+}
+
+TEST(MPI, ReduceOptionalInt) {
+  mpi::communicator world;
+  int result = world.size() * (world.size() - 1) / 2;
+  test_reduce_optional(world.rank(), result);
+}
+
+TEST(MPI, ReduceOptionalComplex) {
+  mpi::communicator world;
+  double rank = world.rank();
+  double result = world.size() * (world.size() - 1) * 0.5;
+  test_reduce_optional(std::complex{rank, -rank}, std::complex{result, -result});
+}
+
+TEST(MPI, ReduceEmptyOptionalInt) {
+  mpi::communicator world;
+
+  for (int root = 0; root < world.size(); ++root) {
+    auto red_value = mpi::reduce(std::optional<int>{}, world, root);
+    EXPECT_FALSE(red_value.has_value());
+
+    std::optional<int> red_out{};
+    mpi::reduce_into(std::optional<int>{}, red_out, world, root);
+    EXPECT_FALSE(red_out.has_value());
+  }
+
+  auto red_value = mpi::all_reduce(std::optional<int>{}, world);
+  EXPECT_FALSE(red_value.has_value());
+}
+
+MPI_TEST_MAIN;

diff --git a/test/c++/mpi_pair.cpp b/test/c++/mpi_pair.cpp
deleted file mode 100644
index 3c6a14c6..00000000
--- a/test/c++/mpi_pair.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2021-2024 Simons Foundation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Authors: Thomas Hahn, Nils Wentzell
-
-#include
-#include
-#include
-
-#include
-#include
-#include
-
-TEST(MPI, PairBroadcast) {
-  // broadcast a pair consisting of a string and a complex number
-  std::pair<std::string, std::complex<double>> p;
-
-  auto str = std::string{"Hello"};
-  auto cplx = std::complex(1.0, 2.0);
-
-  mpi::communicator world;
-  if (world.rank() == 0) p = {str, cplx};
-
-  mpi::broadcast(p);
-  auto [str_bc, cplx_bc] = p;
-  EXPECT_EQ(str, str_bc);
-  EXPECT_EQ(cplx, cplx_bc);
-}
-
-TEST(MPI, PairReduce) {
-  // reduce a pair of integers
-  mpi::communicator world;
-  auto r = world.rank();
-  auto p = std::pair{1, r};
-
-  auto [r1, r2] = mpi::all_reduce(p);
-  auto nr = world.size();
-  EXPECT_EQ(r1, nr);
-  EXPECT_EQ(r2, nr * (nr - 1) / 2);
-}
-
-MPI_TEST_MAIN;
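The new reduce test that replaces the deleted pair test exercises the same semantics through several call shapes: reduce (result meaningful on the root only), reduce_in_place/reduce_into (existing storage), and the all_* variants (result on every rank). A compact fragment (assuming a valid `world` communicator):

```cpp
int r = world.rank();
auto sum_on_root = mpi::reduce(r, world, 0);     // meaningful on rank 0 only
auto sum_everywhere = mpi::all_reduce(r, world); // defined on every rank
int acc = r;
mpi::all_reduce_in_place(acc, world);            // overwrite the input with the reduced value
```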
diff --git a/test/c++/mpi_reduce.cpp b/test/c++/mpi_reduce.cpp
new file mode 100644
index 00000000..67a06fc4
--- /dev/null
+++ b/test/c++/mpi_reduce.cpp
@@ -0,0 +1,121 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <complex>
+#include <utility>
+
+// Test reducing a single value/object.
+template <typename T> void test_reduce(T value, T result, T def_value, MPI_Op op = MPI_SUM) {
+  mpi::communicator world;
+
+  // reduce from different roots
+  for (int root = 0; root < world.size(); ++root) {
+    // reduce an object into new object
+    auto red_value = mpi::reduce(value, world, root, false, op);
+    if (world.rank() == root) { EXPECT_EQ(red_value, result); }
+
+    // reduce an object in place
+    red_value = value;
+    mpi::reduce_in_place(red_value, world, root, false, op);
+    if (world.rank() == root)
+      EXPECT_EQ(red_value, result);
+    else
+      EXPECT_EQ(red_value, value);
+
+    // reduce an object into an existing object
+    red_value = def_value;
+    mpi::reduce_into(value, red_value, world, root, false, op);
+    if (world.rank() == root)
+      EXPECT_EQ(red_value, result);
+    else
+      EXPECT_EQ(red_value, def_value);
+  }
+
+  // allreduce an object into a new object
+  auto red_value = mpi::all_reduce(value, world, op);
+  EXPECT_EQ(red_value, result);
+
+  // allreduce an object in place
+  red_value = value;
+  mpi::all_reduce_in_place(red_value, world, op);
+  EXPECT_EQ(red_value, result);
+
+  // allreduce an object using all_reduce_into
+  red_value = value;
+  mpi::all_reduce_into(value, red_value, world, op);
+  EXPECT_EQ(red_value, result);
+
+  // allreduce an object in place using all_reduce_into
+  red_value = value;
+  mpi::all_reduce_into(red_value, red_value, world, op);
+  EXPECT_EQ(red_value, result);
+}
+
+TEST(MPI, ReduceInteger) {
+  mpi::communicator world;
+  int rank = world.rank() + 1;
+  int red_rank = world.size() * (world.size() + 1) / 2;
+  test_reduce(rank, red_rank, 0);
+}
+
+TEST(MPI, ReduceComplex) {
+  mpi::communicator world;
+  double rank = world.rank() + 1.0;
+  double red_rank = world.size() * (world.size() + 1) * 0.5;
+  test_reduce(std::complex{rank, -rank}, std::complex{red_rank, -red_rank}, std::complex<double>{0.0, 0.0});
+}
+
+TEST(MPI, ReduceCustomMPIType) {
+  mpi::communicator world;
+  int rank = world.rank() + 1;
+  int red_rank = world.size() * (world.size() + 1) / 2;
+  if (world.size() > 1) test_reduce(mpi_t{rank}, mpi_t{red_rank}, mpi_t{0}, mpi::map_add<mpi_t>());
+}
+
+TEST(MPI, ReduceCustomNonMPIType) {
+  mpi::communicator world;
+  int rank = world.rank() + 1;
+  int red_rank = world.size() * (world.size() + 1) / 2;
+  test_reduce(non_mpi_t{rank}, non_mpi_t{red_rank}, non_mpi_t{0});
+}
+
+// Test reducing a pair.
+TEST(MPI, ReducePair) {
+  mpi::communicator world;
+
+  // allreduce a pair of integers
+  auto p1 = mpi::all_reduce(std::pair{world.rank(), -world.rank()}, world, MPI_MAX);
+  EXPECT_EQ(p1.first, world.size() - 1);
+  EXPECT_EQ(p1.second, 0);
+
+  // reduce a pair of non_mpi_t
+  auto p2 = mpi::reduce(std::pair{non_mpi_t{1}, non_mpi_t{world.rank() + 1}}, world, world.size() - 1);
+  if (world.rank() == world.size() - 1) {
+    EXPECT_EQ(p2.first, non_mpi_t(world.size()));
+    EXPECT_EQ(p2.second, non_mpi_t(world.size() * (world.size() + 1) / 2));
+  } else {
+    EXPECT_EQ(p2.first, non_mpi_t());
+    EXPECT_EQ(p2.second, non_mpi_t());
+  }
+}
+
+MPI_TEST_MAIN;

diff --git a/test/c++/mpi_reduce_array.cpp b/test/c++/mpi_reduce_array.cpp
new file mode 100644
index 00000000..96720a2a
--- /dev/null
+++ b/test/c++/mpi_reduce_array.cpp
@@ -0,0 +1,127 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <array>
+#include <complex>
+#include <cstddef>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test reducing arrays.
+template <typename T, std::size_t N> void test_reduce_array(std::array<T, N> const &values, std::array<T, N> const &result, MPI_Op op = MPI_SUM) {
+  mpi::communicator world;
+
+  // reduce from different roots
+  for (int root = 0; root < world.size(); ++root) {
+    // reduce an array into a new array
+    auto arr = mpi::reduce(values, world, root, false, op);
+    if (world.rank() == root) expect_range_eq(arr, result);
+
+    // reduce an empty array
+    std::array<T, 0> empty_arr{};
+    auto empty_red = mpi::reduce(empty_arr, world, root, false, op);
+    static_assert(empty_red.size() == 0);
+
+    // reduce an array in place
+    arr = values;
+    mpi::reduce_in_place(arr, world, root, false, op);
+    if (world.rank() == root)
+      expect_range_eq(arr, result);
+    else
+      expect_range_eq(arr, values);
+
+    // reduce an array into an existing array
+    arr = {};
+    mpi::reduce_into(values, arr, world, root, false, op);
+    if (world.rank() == root) expect_range_eq(arr, result);
+
+    // reduce an empty array into an existing array
+    mpi::reduce_into(empty_arr, empty_arr, world, root, false, op);
+  }
+
+  // allreduce an array into new array
+  auto arr = mpi::all_reduce(values, world, op);
+  expect_range_eq(arr, result);
+
+  // allreduce an array in place
+  arr = values;
+  mpi::all_reduce_in_place(arr, world, op);
+  expect_range_eq(arr, result);
+
+  // allreduce an array in place using all_reduce_into
+  arr = values;
+  mpi::all_reduce_into(arr, arr, world, op);
+  expect_range_eq(arr, result);
+}
+
+TEST(MPI, ReduceIntegerArray) {
+  mpi::communicator world;
+  std::array<int, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = (i + 1) * (world.rank() + 1);
+    result[i] = (i + 1) * world.size() * (world.size() + 1) / 2;
+  }
+  test_reduce_array(values, result);
+}
+
+TEST(MPI, ReduceComplexArray) {
+  mpi::communicator world;
+  double rank = world.rank() + 1.0;
+  double red_rank = world.size() * (world.size() + 1) * 0.5;
+  std::array<std::complex<double>, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = std::complex{rank * (i + 1), -rank * (i + 1)};
+    result[i] = std::complex{red_rank * (i + 1), -red_rank * (i + 1)};
+  }
+  test_reduce_array(values, result);
+}
+
+TEST(MPI, ReduceCustomMPITypeArray) {
+  mpi::communicator world;
+  long rank = world.rank() + 1;
+  long red_rank = world.size() * (world.size() + 1) / 2;
+  std::array<mpi_t, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = mpi_t{rank * (i + 1)};
+    result[i] = mpi_t{red_rank * (i + 1)};
+  }
+  if (world.size() > 1) { test_reduce_array(values, result, mpi::map_add<mpi_t>()); }
+}
+
+TEST(MPI, ReduceCustomNonMPITypeArray) {
+  mpi::communicator world;
+  int rank = world.rank() + 1;
+  int red_rank = world.size() * (world.size() + 1) / 2;
+  std::array<non_mpi_t, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = non_mpi_t{rank * (i + 1)};
+    result[i] = non_mpi_t{red_rank * (i + 1)};
+  }
+  test_reduce_array(values, result);
+}
+
+MPI_TEST_MAIN;

diff --git a/test/c++/mpi_reduce_range.cpp b/test/c++/mpi_reduce_range.cpp
new file mode 100644
index 00000000..5f8cd193
--- /dev/null
+++ b/test/c++/mpi_reduce_range.cpp
@@ -0,0 +1,154 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <array>
+#include <complex>
+#include <list>
+#include <ranges>
+#include <span>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test reducing a range of objects.
+template <typename T, std::size_t N> void test_reduce_range(std::array<T, N> const &values, std::array<T, N> const &result, MPI_Op op = MPI_SUM) {
+  mpi::communicator world;
+
+  // reduce from different roots
+  for (int root = 0; root < world.size(); ++root) {
+    // reduce a span into an array
+    auto arr = values;
+    mpi::reduce_range(std::span{values.data() + 2, 3}, std::span{arr.begin(), 3}, world, root, false, op);
+    if (world.rank() == root) {
+      expect_range_eq(std::span{arr.data(), 3}, std::span{result.data() + 2, 3});
+      expect_range_eq(std::span{arr.data() + 3, 2}, std::span{values.data() + 3, 2});
+    } else {
+      expect_range_eq(arr, values);
+    }
+
+    // reduce a list into a list
+    std::list list(values.begin(), values.end()), list_red(values.begin(), values.end());
+    if (world.rank() == root) {
+      mpi::reduce_range(list, list_red, world, root, false, op);
+      expect_range_eq(list_red, result);
+    } else {
+      list_red.clear();
+      mpi::reduce_range(list, list_red, world, root, false, op);
+      EXPECT_TRUE(list_red.empty());
+    }
+
+    // reduce a view on a list in place
+    list.assign(values.begin(), values.end());
+    mpi::reduce_range(std::ranges::take_view(list, 2), std::ranges::take_view(list, 2), world, root, false, op);
+    if (world.rank() == root) {
+      expect_range_eq(std::ranges::take_view(list, 2), std::ranges::take_view(result, 2));
+      expect_range_eq(std::ranges::drop_view(list, 2), std::ranges::drop_view(values, 2));
+    } else {
+      expect_range_eq(list, values);
+    }
+
+    // reduce a span in place
+    arr = values;
+    mpi::reduce_range(std::span{arr.data() + 2, 3}, std::span{arr.data() + 2, 3}, world, root, false, op);
+    if (world.rank() == root) {
+      expect_range_eq(std::span{arr.data() + 2, 3}, std::span{result.data() + 2, 3});
+      expect_range_eq(std::span{arr.data(), 2}, std::span{values.data(), 2});
+    } else {
+      expect_range_eq(arr, values);
+    }
+
+    // reduce an array into a list
+    if (world.rank() == root) {
+      list = std::list<T>(5);
+      mpi::reduce_range(values, list, world, root, false, op);
+      expect_range_eq(list, result);
+    } else {
+      list.clear();
+      mpi::reduce_range(values, list, world, root, false, op);
+      EXPECT_TRUE(list.empty());
+    }
+  }
+
+  // allreduce a list in place using reduce_range
+  std::list list(values.begin(), values.end());
+  mpi::reduce_range(list, list, world, 0, true, op);
+  expect_range_eq(list, result);
+
+  // allreduce a span in place
+  auto arr = values;
+  mpi::reduce_range(std::span{arr.data() + 1, 3}, std::span{arr.data() + 1, 3}, world, 0, true, op);
+  expect_range_eq(std::span{arr.data() + 1, 3}, std::span{result.data() + 1, 3});
+  EXPECT_EQ(arr[0], values[0]);
+  EXPECT_EQ(arr[4], values[4]);
+}
+
+TEST(MPI, ReduceIntegerRange) {
+  mpi::communicator world;
+  std::array<int, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = (i + 1) * (world.rank() + 1);
+    result[i] = (i + 1) * world.size() * (world.size() + 1) / 2;
+  }
+  test_reduce_range(values, result);
+}
+
+TEST(MPI, ReduceComplexRange) {
+  mpi::communicator world;
+  double rank = world.rank() + 1.0;
+  double red_rank = world.size() * (world.size() + 1) * 0.5;
+  std::array<std::complex<double>, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = std::complex{rank * (i + 1), -rank * (i + 1)};
+    result[i] = std::complex{red_rank * (i + 1), -red_rank * (i + 1)};
+  }
+  test_reduce_range(values, result);
+}
+
+TEST(MPI, ReduceCustomMPITypeRange) {
+  mpi::communicator world;
+  long rank = world.rank() + 1;
+  long red_rank = world.size() * (world.size() + 1) / 2;
+  std::array<mpi_t, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = mpi_t{rank * (i + 1)};
+    result[i] = mpi_t{red_rank * (i + 1)};
+  }
+  if (world.size() > 1) { test_reduce_range(values, result, mpi::map_add<mpi_t>()); }
+}
+
+TEST(MPI, ReduceCustomNonMPITypeRange) {
+  mpi::communicator world;
+  int rank = world.rank() + 1;
+  int red_rank = world.size() * (world.size() + 1) / 2;
+  std::array<non_mpi_t, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = non_mpi_t{rank * (i + 1)};
+    result[i] = non_mpi_t{red_rank * (i + 1)};
+  }
+  test_reduce_range(values, result);
+}
+
+MPI_TEST_MAIN;
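As in the array and range tests above, types like mpi_t that have an MPI datatype but no predefined MPI_SUM are reduced with mpi::map_add<T>(), which wraps the type's operator+ into a user-defined MPI_Op. A one-line sketch (assuming a valid `world` and the mpi_t type from custom_types.hpp):

```cpp
auto total = mpi::all_reduce(mpi_t{world.rank() + 1}, world, mpi::map_add<mpi_t>());
// total.a == world.size() * (world.size() + 1) / 2 on every rank
```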
+TEST(MPI, ReduceIntegerRange) {
+  mpi::communicator world;
+  std::array<int, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = (i + 1) * (world.rank() + 1);
+    result[i] = (i + 1) * world.size() * (world.size() + 1) / 2;
+  }
+  test_reduce_range(values, result);
+}
+
+TEST(MPI, ReduceComplexRange) {
+  mpi::communicator world;
+  double rank = world.rank() + 1.0;
+  double red_rank = world.size() * (world.size() + 1) * 0.5;
+  std::array<std::complex<double>, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = std::complex<double>{rank * (i + 1), -rank * (i + 1)};
+    result[i] = std::complex<double>{red_rank * (i + 1), -red_rank * (i + 1)};
+  }
+  test_reduce_range(values, result);
+}
+
+TEST(MPI, ReduceCustomMPITypeRange) {
+  mpi::communicator world;
+  long rank = world.rank() + 1;
+  long red_rank = world.size() * (world.size() + 1) / 2;
+  std::array<mpi_t, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = mpi_t{rank * (i + 1)};
+    result[i] = mpi_t{red_rank * (i + 1)};
+  }
+  if (world.size() > 1) { test_reduce_range(values, result, mpi::map_add<mpi_t>()); }
+}
+
+TEST(MPI, ReduceCustomNonMPITypeRange) {
+  mpi::communicator world;
+  int rank = world.rank() + 1;
+  int red_rank = world.size() * (world.size() + 1) / 2;
+  std::array<non_mpi_t, 5> values{}, result{};
+  for (int i = 0; i < 5; ++i) {
+    values[i] = non_mpi_t{rank * (i + 1)};
+    result[i] = non_mpi_t{red_rank * (i + 1)};
+  }
+  test_reduce_range(values, result);
+}
+
+MPI_TEST_MAIN;
diff --git a/test/c++/mpi_reduce_vector.cpp b/test/c++/mpi_reduce_vector.cpp
new file mode 100644
index 00000000..29142bcb
--- /dev/null
+++ b/test/c++/mpi_reduce_vector.cpp
@@ -0,0 +1,138 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <complex>
+#include <ranges>
+#include <vector>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test reducing a vector.
+template <typename T> void test_reduce_vector(std::vector<T> const &values, std::vector<T> const &result, MPI_Op op = MPI_SUM) {
+  mpi::communicator world;
+
+  // reduce from different roots
+  for (int root = 0; root < world.size(); ++root) {
+    // reduce a vector into a new vector
+    auto vec = mpi::reduce(values, world, root, false, op);
+    if (world.rank() == root) expect_range_eq(vec, result);
+
+    // reduce an empty vector
+    auto empty_vec = mpi::reduce(std::vector<T>{}, world, root, false, op);
+    EXPECT_EQ(empty_vec.size(), 0);
+
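+    // In-place reduction overwrites the input buffer with the result on the
+    // root rank only; the checks below verify that the buffers of all other
+    // ranks are left untouched.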
+    // reduce a vector in place
+    vec = values;
+    mpi::reduce_in_place(vec, world, root, false, op);
+    if (world.rank() == root)
+      expect_range_eq(vec, result);
+    else
+      expect_range_eq(vec, values);
+
+    // reduce an empty vector in place
+    mpi::reduce_in_place(empty_vec, world, root, false, op);
+    EXPECT_EQ(empty_vec.size(), 0);
+
+    // reduce a vector into an existing empty vector
+    vec.clear();
+    mpi::reduce_into(values, vec, world, root, false, op);
+    if (world.rank() == root)
+      expect_range_eq(vec, result);
+    else
+      EXPECT_TRUE(vec.empty());
+
+    // reduce an empty vector into an existing vector
+    vec = values;
+    mpi::reduce_into(empty_vec, vec, world, root, false, op);
+    if (world.rank() == root)
+      EXPECT_EQ(vec.size(), 0);
+    else
+      expect_range_eq(vec, values);
+  }
+
+  // allreduce a vector into a new vector
+  auto vec = mpi::all_reduce(values, world, op);
+  expect_range_eq(vec, result);
+
+  // allreduce a vector in place
+  vec = values;
+  mpi::all_reduce_in_place(vec, world, op);
+  expect_range_eq(vec, result);
+
+  // allreduce a vector in place using all_reduce_into
+  vec = values;
+  mpi::all_reduce_into(vec, vec, world, op);
+  expect_range_eq(vec, result);
+}
+
+TEST(MPI, ReduceIntegerVector) {
+  mpi::communicator world;
+  std::vector<int> values(5), result(5);
+  for (int i = 0; i < 5; ++i) {
+    values[i] = (i + 1) * (world.rank() + 1);
+    result[i] = (i + 1) * world.size() * (world.size() + 1) / 2;
+  }
+  test_reduce_vector(values, result);
+}
+
+TEST(MPI, ReduceComplexVector) {
+  mpi::communicator world;
+  double rank = world.rank() + 1.0;
+  double red_rank = world.size() * (world.size() + 1) * 0.5;
+  std::vector<std::complex<double>> values(5), result(5);
+  for (int i = 0; i < 5; ++i) {
+    values[i] = std::complex<double>{rank * (i + 1), -rank * (i + 1)};
+    result[i] = std::complex<double>{red_rank * (i + 1), -red_rank * (i + 1)};
+  }
+  test_reduce_vector(values, result);
+}
+
+TEST(MPI, ReduceCustomMPITypeVector) {
+  mpi::communicator world;
+  long rank = world.rank() + 1;
+  long red_rank = world.size() * (world.size() + 1) / 2;
+  std::vector<mpi_t> values(5), result(5);
+  for (int i = 0; i < 5; ++i) {
+    values[i] = mpi_t{rank * (i + 1)};
+    result[i] = mpi_t{red_rank * (i + 1)};
+  }
+  if (world.size() > 1) { test_reduce_vector(values, result, mpi::map_add<mpi_t>()); }
+}
+
+TEST(MPI, ReduceCustomNonMPITypeVector) {
+  mpi::communicator world;
+  int rank = world.rank() + 1;
+  int red_rank = world.size() * (world.size() + 1) / 2;
+  std::vector<non_mpi_t> values(5), result(5);
+  for (int i = 0; i < 5; ++i) {
+    values[i] = non_mpi_t{rank * (i + 1)};
+    result[i] = non_mpi_t{red_rank * (i + 1)};
+  }
+  test_reduce_vector(values, result);
+}
+
+MPI_TEST_MAIN;
diff --git a/test/c++/mpi_scatter_range.cpp b/test/c++/mpi_scatter_range.cpp
new file mode 100644
index 00000000..a81e3a35
--- /dev/null
+++ b/test/c++/mpi_scatter_range.cpp
@@ -0,0 +1,114 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <complex>
+#include <numeric>
+#include <ranges>
+#include <span>
+#include <vector>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test scattering a vector.
+template <typename T> void test_scatter_range(std::vector<T> const &values, long chunk_size) {
+  mpi::communicator world;
+  const int rank = world.rank();
+  auto sizes = std::vector<long>(world.size());
+  for (int i = 0; i < world.size(); ++i) sizes[i] = static_cast<long>(mpi::chunk_length(values.size(), world.size(), i, chunk_size));
+  auto acc_sizes = std::vector<long>(world.size() + 1, 0);
+  std::partial_sum(sizes.begin(), sizes.end(), std::next(acc_sizes.begin()));
+  EXPECT_EQ(acc_sizes.back(), values.size());
+
+  // scatter from different roots
+  for (int root = 0; root < world.size(); ++root) {
+    // scatter a vector into a span
+    auto vec = std::vector<T>(sizes[rank], T{0});
+    mpi::scatter_range(values, std::span(vec.begin(), sizes[rank]), values.size(), world, root, chunk_size);
+    expect_range_eq(vec, std::span(values.begin() + acc_sizes[rank], sizes[rank]));
+
+    // scatter with chunk size = number of elements to be scattered
+    vec = std::vector<T>((rank == 0 ? values.size() : 0), T{0});
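+    // NOTE: passing the total number of elements as the minimum chunk size
+    // forces a single chunk, so (as the checks below assert) rank 0 receives
+    // all elements and every other rank gets an empty range.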
+    mpi::scatter_range(values, vec, values.size(), world, root, values.size());
+    if (world.rank() == 0)
+      expect_range_eq(vec, values);
+    else
+      EXPECT_TRUE(vec.empty());
+  }
+}
+
+TEST(MPI, ScatterIntegerRange) {
+  mpi::communicator world;
+  const long min_nchunks = 3;
+  const long chunk_size = 4;
+  for (int i = 0; i < world.size(); ++i) {
+    // chunk size = 1
+    std::vector<int> values(min_nchunks * world.size() + i);
+    std::iota(values.begin(), values.end(), 0);
+    test_scatter_range(values, 1);
+
+    // chunk size = 4
+    values.resize((min_nchunks * world.size() + i) * chunk_size);
+    std::iota(values.begin(), values.end(), 0);
+    test_scatter_range(values, chunk_size);
+  }
+}
+
+TEST(MPI, ScatterComplexRange) {
+  mpi::communicator world;
+  const long min_nchunks = 3;
+  const long chunk_size = 4;
+  for (int i = 0; i < world.size(); ++i) {
+    // chunk size = 1
+    std::vector<std::complex<double>> values(min_nchunks * world.size() + i);
+    for (int j = 0; j < values.size(); ++j) values[j] = std::complex<double>(j, -j);
+    test_scatter_range(values, 1);
+
+    // chunk size = 4
+    values.resize((min_nchunks * world.size() + i) * chunk_size);
+    for (int j = 0; j < values.size(); ++j) values[j] = std::complex<double>(j, -j);
+    test_scatter_range(values, chunk_size);
+  }
+}
+
+TEST(MPI, ScatterCustomMPITypeRange) {
+  mpi::communicator world;
+  const long min_nchunks = 3;
+  const long chunk_size = 4;
+  for (int i = 0; i < world.size(); ++i) {
+    // chunk size = 1
+    std::vector<mpi_t> values(min_nchunks * world.size() + i);
+    for (int j = 0; j < values.size(); ++j) values[j].a = j;
+    test_scatter_range(values, 1);
+
+    // chunk size = 4
+    values.resize((min_nchunks * world.size() + i) * chunk_size);
+    for (int j = 0; j < values.size(); ++j) values[j].a = j;
+    test_scatter_range(values, chunk_size);
+  }
+}
+
+MPI_TEST_MAIN;
diff --git a/test/c++/mpi_scatter_vector.cpp b/test/c++/mpi_scatter_vector.cpp
new file mode 100644
index 00000000..e019080c
--- /dev/null
+++ b/test/c++/mpi_scatter_vector.cpp
@@ -0,0 +1,90 @@
+// Copyright (c) 2022-2024 Simons Foundation
+// Copyright (c) 2022 Hugo U.R. Strand
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Thomas Hahn, Hugo U.R. Strand
+
+#include "./custom_types.hpp"
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <complex>
+#include <numeric>
+#include <ranges>
+#include <span>
+#include <vector>
+
+// Check if two ranges are equal.
+void expect_range_eq(auto &&rg1, auto &&rg2) {
+  EXPECT_EQ(std::ranges::size(rg1), std::ranges::size(rg2));
+  auto it2 = std::ranges::begin(rg2);
+  for (auto &&a : rg1) { EXPECT_EQ(a, *it2++); }
+}
+
+// Test scattering a vector.
+template <typename T> void test_scatter_vector(std::vector<T> const &values) {
+  mpi::communicator world;
+  auto recvcounts = std::vector<int>(world.size());
+  for (int i = 0; i < world.size(); ++i) recvcounts[i] = static_cast<int>(mpi::chunk_length(values.size(), world.size(), i));
+  auto displs = std::vector<int>(world.size() + 1, 0);
+  std::partial_sum(recvcounts.begin(), recvcounts.end(), std::next(displs.begin()));
+  auto const recvcount = recvcounts[world.rank()];
+  auto const displ = displs[world.rank()];
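+  // For example, 10 elements on 4 ranks give recvcounts = {3, 3, 2, 2} and
+  // displs = {0, 3, 6, 8, 10} (assuming chunk_length distributes the
+  // remainder over the lowest ranks).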
+
+  // scatter from different roots
+  for (int root = 0; root < world.size(); ++root) {
+    // scatter a vector into a new vector
+    auto vec = mpi::scatter(world.rank() == root ? values : std::vector<T>{}, world, root);
+    expect_range_eq(vec, std::span(values.begin() + displ, recvcount));
+
+    // scatter a vector into an existing vector
+    vec.clear();
+    mpi::scatter_into(values, vec, world, root);
+    expect_range_eq(vec, std::span(values.begin() + displ, recvcount));
+  }
+
+  // scatter an empty vector
+  auto vec = mpi::scatter(std::vector<T>{}, world);
+  EXPECT_TRUE(vec.empty());
+}
+
+TEST(MPI, ScatterIntegerVector) {
+  mpi::communicator world;
+  for (int total_size = 3 * world.size(); total_size < 4 * world.size(); ++total_size) {
+    std::vector<int> values(total_size);
+    std::iota(values.begin(), values.end(), 0);
+    test_scatter_vector(values);
+  }
+}
+
+TEST(MPI, ScatterComplexVector) {
+  mpi::communicator world;
+  for (int total_size = 3 * world.size(); total_size < 4 * world.size(); ++total_size) {
+    std::vector<std::complex<double>> values(total_size);
+    for (int i = 0; i < total_size; ++i) values[i] = std::complex<double>(i, -i);
+    test_scatter_vector(values);
+  }
+}
+
+TEST(MPI, ScatterCustomMPITypeVector) {
+  mpi::communicator world;
+  for (int total_size = 3 * world.size(); total_size < 4 * world.size(); ++total_size) {
+    std::vector<mpi_t> values(total_size);
+    for (int i = 0; i < total_size; ++i) values[i].a = i;
+    test_scatter_vector(values);
+  }
+}
+
+MPI_TEST_MAIN;
diff --git a/test/c++/mpi_string.cpp b/test/c++/mpi_utils.cpp
similarity index 73%
rename from test/c++/mpi_string.cpp
rename to test/c++/mpi_utils.cpp
index 617ab884..0dc12b0b 100644
--- a/test/c++/mpi_string.cpp
+++ b/test/c++/mpi_utils.cpp
@@ -15,20 +15,14 @@
 // Authors: Thomas Hahn, Nils Wentzell
 
 #include <gtest/gtest.h>
-#include <mpi/mpi.hpp>
+#include <mpi/utils.hpp>
 
-#include <string>
-
-TEST(MPI, StringBroadcast) {
-  // broadcast a string
-  mpi::communicator world;
-
-  std::string s;
-  if (world.rank() == 0) s = "Hello World";
-
-  mpi::broadcast(s);
-
-  EXPECT_EQ(s, std::string{"Hello World"});
+TEST(MPI, CheckMPICall) {
+  // test if check_mpi_call throws an exception
+  try {
+    mpi::check_mpi_call(MPI_SUCCESS - 1, "not_a_real_mpi_call");
+  } catch (std::runtime_error const &e) { std::cout << e.what() << std::endl; }
 }
 
 MPI_TEST_MAIN;
diff --git a/test/c++/mpi_vector.cpp b/test/c++/mpi_vector.cpp
deleted file mode 100644
index 4c64b2cf..00000000
--- a/test/c++/mpi_vector.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright (c) 2020-2024 Simons Foundation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Authors: Thomas Hahn, Nils Wentzell
-
-#include <gtest/gtest.h>
-#include <itertools/itertools.hpp>
-#include <mpi/mpi.hpp>
-#include <mpi/pair.hpp>
-#include <mpi/string.hpp>
-
-#include <complex>
-#include <string>
-#include <utility>
-#include <vector>
-
-TEST(MPI, VectorReduce) {
-  // reduce a vector of complex numbers
-  mpi::communicator world;
-  using vec_type = std::vector<std::complex<double>>;
-
-  const int size = 7;
-  vec_type vec(size), reduced_vec;
-
-  for (int i = 0; i < size; ++i) vec[i] = i;
-
-  reduced_vec = mpi::all_reduce(vec, world);
-
-  vec_type exp_vec(size);
-  for (int i = 0; i < size; ++i) exp_vec[i] = world.size() * i;
-
-  EXPECT_EQ(reduced_vec, exp_vec);
-}
-
-TEST(MPI, EmptyVectorReduce) {
-  // reduce an empty vector
-  mpi::communicator world;
-  std::vector<int> v1{};
-  std::vector<int> v2 = mpi::reduce(v1, world);
-}
-
-TEST(MPI, VectorGatherScatter) {
-  // scatter and gather a vector of complex numbers
-  mpi::communicator world;
-
-  std::vector<std::complex<double>> vec(7), scattered_vec(7), gathered_vec(7, {0.0, 0.0});
-
-  for (auto [i, v_i] : itertools::enumerate(vec)) v_i = static_cast<double>(i) + 1.0;
-
-  scattered_vec = mpi::scatter(vec, world);
-  auto tmp = mpi::scatter(vec, world);
-
-  for (auto &x : scattered_vec) x *= -1;
-  for (auto &x : vec) x *= -1;
-
-  gathered_vec = mpi::all_gather(scattered_vec, world);
-
-  EXPECT_EQ(vec, gathered_vec);
-}
-
-TEST(MPI, VectorGatherScatterPair) {
-  // scatter and gather a vector of pairs
-  auto v = std::vector<std::pair<int, std::string>>{{1, "one"}, {2, "two"}, {3, "three"}, {4, "four"}, {5, "five"}};
-
-  auto vsct = mpi::scatter(v);
-  auto vgth = mpi::all_gather(vsct);
-
-  mpi::communicator world;
-  if (world.size() > 1) { EXPECT_NE(vsct, vgth); }
-  EXPECT_EQ(v, vgth);
-}
-
-MPI_TEST_MAIN;
diff --git a/test/c++/mpi_window.cpp b/test/c++/mpi_window.cpp
new file mode 100644
index 00000000..7d3c7fca
--- /dev/null
+++ b/test/c++/mpi_window.cpp
@@ -0,0 +1,396 @@
+// Copyright (c) 2023 Simons Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Authors: Philipp Dumitrescu, Olivier Parcollet, Nils Wentzell
+
+#include <gtest/gtest.h>
+#include <mpi/mpi.hpp>
+
+#include <itertools/itertools.hpp>
+#include <array>
+#include <numeric>
+#include <span>
+#include <utility>
+#include <vector>
+
+// Test cases are adapted from slides and exercises of the HLRS course:
+// Introduction to the Message Passing Interface (MPI)
+// Authors: Joel Malard, Alan Simpson, (EPCC)
+// Rolf Rabenseifner, Traugott Streicher, Tobias Haas (HLRS)
+// https://fs.hlrs.de/projects/par/par_prog_ws/pdf/mpi_3.1_rab.pdf
+// https://fs.hlrs.de/projects/par/par_prog_ws/practical/MPI31single.tar.gz
+
+TEST(MPI, WindowCommunicatorMember) {
+  mpi::communicator world;
+
+  int data = world.rank();
+
+  mpi::window<int> win(world, &data, 1);
+
+  auto win_comm = win.get_communicator();
+
+  EXPECT_EQ(win_comm.rank(), world.rank());
+  EXPECT_EQ(win_comm.size(), world.size());
+}
+
+TEST(MPI, WindowSharedCommunicatorMember) {
+  auto shm = mpi::communicator{}.split_shared();
+
+  mpi::shared_window<int> win{shm, 1};
+
+  auto sh_win_comm = win.get_communicator();
+
+  EXPECT_EQ(sh_win_comm.rank(), shm.rank());
+  EXPECT_EQ(sh_win_comm.size(), shm.size());
+}
+
+TEST(MPI, WindowGetAttrBase) {
+  mpi::communicator world;
+
+  int buffer = world.rank();
+  mpi::window<int> win{world, &buffer, 1};
+
+  void *base_ptr = win.base();
+  EXPECT_NE(base_ptr, nullptr);
+  EXPECT_EQ(base_ptr, &buffer);
+}
+
+TEST(MPI, WindowAllocate) {
+  mpi::communicator world;
+  int rank = world.rank();
+
+  mpi::window<int> win{world, 1};
+  *(win.base()) = rank;
+
+  win.fence();
+  int rcv{};
+  win.get(&rcv, 1, rank);
+  win.fence();
+
+  EXPECT_EQ(rcv, rank);
+}
+
+TEST(MPI, WindowPassiveTargetCommunication) {
+  mpi::communicator world;
+  if (world.size() < 2) { GTEST_SKIP() << "Test requires at least 2 processes\n"; }
+  int rank = world.rank();
+
+  auto win_comm = world.split(rank == 0 || rank == 1 ? 0 : MPI_UNDEFINED);
+
+  if (rank == 0 || rank == 1) {
+    mpi::window<int> win{win_comm, 1};
+    *(win.base()) = -1;
+
+    win.fence();
+    if (rank == 0) {
+      int val = 42;
+      win.put(&val, 1, 1);
+    }
+    win.fence();
+
+    if (rank == 1) { EXPECT_EQ(*(win.base()), 42); }
+  }
+}
+
+TEST(MPI, WindowActiveTargetCommunication) {
+  mpi::communicator world;
+  if (world.size() < 2) {
+    // Target rank cannot be equal to origin rank (deadlocks), so we need at
+    // least two ranks for this test case.
+    GTEST_SKIP();
+  }
+  int rank = world.rank();
+
+  mpi::window<int> win{world, 1};
+  *(win.base()) = -1;
+
+  int origin_rank = 0;
+  int target_rank = 1;
+
+  // Only the origin and target ranks will participate in the communication.
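+  // This is MPI's general active target synchronization (post-start-complete-wait):
+  // the target exposes its window to the origin group via post()/wait(), while the
+  // origin brackets its RMA access epoch with start()/complete().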
+  mpi::group world_group(world);
+  auto origin_group = world_group.include({origin_rank});
+  auto target_group = world_group.include({target_rank});
+
+  if (rank == target_rank) {
+    win.post(origin_group);
+    win.wait(); // blocks until origin_rank calls complete()
+    EXPECT_EQ(*(win.base()), 42);
+  }
+
+  if (rank == origin_rank) {
+    win.start(target_group); // blocks until target_rank calls post()
+    auto origin_arr = std::array{42};
+    int origin_count = 1;
+    win.put(origin_arr.data(), origin_count, target_rank);
+    win.complete();
+  }
+}
+
+TEST(MPI, WindowGetAttrSize) {
+  mpi::communicator world;
+  int buffer{};
+  mpi::window<int> win{world, &buffer, 1};
+
+  MPI_Aint size = win.size();
+  EXPECT_EQ(size, 1);
+}
+
+TEST(MPI, WindowMoveConstructor) {
+  mpi::communicator world;
+  int i = 1;
+  mpi::window<int> win1{world, &i, 1};
+
+  mpi::window<int> win2 = std::move(win1);
+
+  EXPECT_EQ(win2.base(), &i);
+  EXPECT_EQ(win1.base(), nullptr);
+}
+
+TEST(MPI, WindowNullptrSizeZero) {
+  mpi::communicator world;
+  mpi::window<int> win{world, nullptr, 0};
+
+  EXPECT_EQ(win.base(), nullptr);
+  EXPECT_EQ(win.size(), 0);
+}
+
+TEST(MPI, WindowOneSidedGet) {
+  mpi::communicator world;
+  int const rank = world.rank();
+
+  int snd_buf{}, rcv_buf = -1;
+  mpi::window<int> win{world, &snd_buf, 1};
+  snd_buf = rank;
+
+  win.fence();
+  win.get(&rcv_buf, 1, rank);
+  win.fence();
+
+  EXPECT_EQ(rcv_buf, rank);
+}
+
+TEST(MPI, WindowOneSidedPut) {
+  mpi::communicator world;
+  int const rank = world.rank();
+
+  int snd_buf{}, rcv_buf = -1;
+  mpi::window<int> win{world, &rcv_buf, 1};
+  snd_buf = rank;
+
+  win.fence();
+  win.put(&snd_buf, 1, rank);
+  win.fence();
+
+  EXPECT_EQ(rcv_buf, rank);
+}
+
+TEST(MPI, WindowRingOneSidedGet) {
+  mpi::communicator world;
+  int const rank = world.rank();
+  int const size = world.size();
+  int const left = (rank - 1 + size) % size;
+
+  int snd_buf{}, rcv_buf{};
+  mpi::window<int> win{world, &snd_buf, 1};
+  snd_buf = rank;
+
+  int sum = 0;
+  for (int i = 0; i < size; ++i) {
+    win.fence();
+    win.get(&rcv_buf, 1, left);
+    win.fence();
+    snd_buf = rcv_buf;
+    sum += rcv_buf;
+  }
+
+  EXPECT_EQ(sum, (size * (size - 1)) / 2);
+}
+
+TEST(MPI, WindowRingOneSidedPut) {
+  mpi::communicator world;
+  int const rank = world.rank();
+  int const size = world.size();
+  int const right = (rank + 1) % size;
+
+  int snd_buf{}, rcv_buf{};
+  mpi::window<int> win{world, &rcv_buf, 1};
+  snd_buf = rank;
+
+  int sum = 0;
+  for (int i = 0; i < size; ++i) {
+    win.fence();
+    win.put(&snd_buf, 1, right);
+    win.fence();
+    snd_buf = rcv_buf;
+    sum += rcv_buf;
+  }
+
+  EXPECT_EQ(sum, (size * (size - 1)) / 2);
+}
+
+TEST(MPI, WindowRingOneSidedAllocShared) {
+  mpi::communicator world;
+  auto shm = world.split_shared();
+  int const rank_shm = shm.rank();
+  int const size_shm = shm.size();
+  int const right = (rank_shm + 1) % size_shm;
+
+  mpi::shared_window<int> win{shm, 1};
+  int *rcv_buf_ptr = win.base(rank_shm);
+
+  int snd_buf = rank_shm;
+  int sum = 0;
+  for (int i = 0; i < size_shm; ++i) {
+    win.fence();
+    win.put(&snd_buf, 1, right);
+    win.fence();
+    snd_buf = *rcv_buf_ptr;
+    sum += *rcv_buf_ptr;
+  }
+
+  EXPECT_EQ(sum, (size_shm * (size_shm - 1)) / 2);
+}
+
+TEST(MPI, WindowRingOneSidedStoreWinAllocSharedSignal) {
+  if (not mpi::has_env) {
+    // Test doesn't make sense without MPI
+    GTEST_SKIP();
+  }
+  mpi::communicator world;
+  auto shm = world.split_shared();
+
+  int const rank_shm = shm.rank();
+  int const size_shm = shm.size();
+  int const right = (rank_shm + 1) % size_shm;
+  int const left = (rank_shm - 1 + size_shm) % size_shm;
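+
+  // A shared window is allocated as one block of memory (contiguous by
+  // default) that is directly accessible to every rank of the shared-memory
+  // communicator, so a neighbor's rcv_buf can be written with a plain store
+  // through win.base() instead of an MPI_Put call.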
+
+  mpi::shared_window<int> win{shm, 1};
+  int *rcv_buf_ptr = win.base(rank_shm);
+  win.lock();
+
+  int sum = 0;
+  int snd_buf = rank_shm;
+
+  MPI_Request rq{};
+  MPI_Status status;
+  int snd_dummy{}, rcv_dummy{};
+
+  for (int i = 0; i < size_shm; ++i) {
+    // ... The local Win_syncs are needed to sync the processor and real memory.
+    // ... The following pair of syncs is needed so that the read-write-rule is fulfilled.
+    win.sync();
+
+    // ... tag=17: posting to left that rcv_buf is exposed to left, i.e.,
+    // the left process is now allowed to store data into the local rcv_buf
+    MPI_Irecv(&rcv_dummy, 0, MPI_INT, right, 17, shm.get(), &rq);
+    MPI_Send(&snd_dummy, 0, MPI_INT, left, 17, shm.get());
+    MPI_Wait(&rq, &status);
+
+    win.sync();
+
+    // MPI_Put(&snd_buf, 1, MPI_INT, right, (MPI_Aint) 0, 1, MPI_INT, win);
+    // ... is substituted by (with offset "right-my_rank" to store into the right neighbor's rcv_buf):
+    *(rcv_buf_ptr + (right - rank_shm)) = snd_buf;
+
+    // ... The following pair of syncs is needed so that the write-read-rule is fulfilled.
+    win.sync();
+
+    // ... The following communication synchronizes the processors in such a way
+    // that the origin processor has finished the store
+    // before the target processor starts to load the data.
+    // ... tag=18: posting to right that rcv_buf was stored from left
+    MPI_Irecv(&rcv_dummy, 0, MPI_INT, left, 18, shm.get(), &rq);
+    MPI_Send(&snd_dummy, 0, MPI_INT, right, 18, shm.get());
+    MPI_Wait(&rq, &status);
+
+    win.sync();
+
+    snd_buf = *rcv_buf_ptr;
+    sum += *rcv_buf_ptr;
+  }
+
+  EXPECT_EQ(sum, (size_shm * (size_shm - 1)) / 2);
+
+  win.unlock();
+}
+
+TEST(MPI, WindowSharedArray) {
+  mpi::communicator world;
+  auto shm = world.split_shared();
+
+  const int array_size = 23;
+
+  // Only rank 0 allocates the shared array
+  mpi::shared_window<int> win{shm, shm.rank() == 0 ? array_size : 0};
+  std::span array_view{win.base(0), static_cast<std::size_t>(win.size(0))};
+
+  // Fill array in parallel: each rank fills its chunk with array indices
+  win.fence();
+  for (auto i : mpi::chunk(itertools::range(array_size), shm)) { array_view[i] = static_cast<int>(i); }
+  win.fence();
+
+  // Total sum is just the sum of the numbers in the interval [0, array_size)
+  int sum = std::accumulate(array_view.begin(), array_view.end(), int{0});
+  EXPECT_EQ(sum, (array_size * (array_size - 1)) / 2);
+}
+
+TEST(MPI, WindowDistributedSharedArray) {
+  mpi::communicator world;
+  auto island_comm = world.split_shared();
+
+  // Number of total array elements (prime number to make it a bit more exciting)
+  const int array_size_total = 197;
+
+  // Create communicator of island leaders (rank 0 on each node)
+  bool is_head = island_comm.rank() == 0;
+  auto head_comm = world.split(is_head ? 0 : MPI_UNDEFINED);
+
+  // Each world rank gets a chunk of the global array
+  auto [my_start, my_end] = itertools::chunk_range(0, array_size_total, world.size(), world.rank());
+  int my_chunk_size = static_cast<int>(my_end - my_start);
+
+  // Gather all chunk sizes within the island
+  auto island_chunk_sizes = mpi::all_gather(my_chunk_size, island_comm);
+  int island_array_size = std::accumulate(island_chunk_sizes.begin(), island_chunk_sizes.end(), int{0});
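+
+  // Only the island heads allocate memory below (all other ranks pass size 0);
+  // win.base(0) nevertheless lets every rank of an island address its head's
+  // buffer directly.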
+
+  // Allocate shared array combining all island ranks' chunks
+  mpi::shared_window<int> win{island_comm, is_head ? island_array_size : 0};
+  std::span array_view(win.base(0), island_array_size);
+
+  // Calculate offset within the island's shared array
+  int my_offset = std::accumulate(island_chunk_sizes.begin(), island_chunk_sizes.begin() + island_comm.rank(), int{0});
+
+  // Each rank fills its chunk with global indices
+  win.fence();
+  auto my_chunk = array_view.subspan(my_offset, my_chunk_size);
+  for (int i = 0; i < my_chunk_size; ++i) { my_chunk[i] = static_cast<int>(my_start + i); }
+  win.fence();
+
+  // Partial sum over my chunk
+  int my_sum = std::accumulate(my_chunk.begin(), my_chunk.end(), int{0});
+
+  // Partial sum over each island
+  int island_sum = mpi::reduce(my_sum, island_comm);
+
+  // Calculate the total sum on the head ranks
+  int total_sum = 0;
+  if (is_head) { total_sum = mpi::reduce(island_sum, head_comm); }
+  mpi::broadcast(total_sum, world);
+
+  // Total sum is just the sum of the numbers in the interval [0, array_size_total),
+  // i.e. 197 * 196 / 2 = 19306
+  EXPECT_EQ(total_sum, (array_size_total * (array_size_total - 1)) / 2);
+}
+
+MPI_TEST_MAIN;